diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index a8c7f2f9..98e38ac8 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -40,7 +40,6 @@ jobs: - name: Test Nopshufb run: go test -tags=nopshufb -short&&go test -tags=nopshufb -short -no-avx512 -no-gfni&&go test -tags=nopshufb -short&&go test -tags=nopshufb -no-avx512 -no-avx2 -no-ssse3 -no-sse2 -short - - name: Test Race env: CGO_ENABLED: 1 @@ -88,6 +87,11 @@ jobs: CGO_ENABLED: 1 run: go test -no-avx512 -short -race . + - name: Test Races, no avx512, no avx2-gfni + env: + CGO_ENABLED: 1 + run: go test -no-avx512 -no-avx2-gfni -short -race . + - name: Test Races, no avx2 env: CGO_ENABLED: 1 diff --git a/_gen/gen.go b/_gen/gen.go index 30b71780..941e8a90 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -36,6 +36,9 @@ var switchDefsX [inputMax][outputMax]string var switchDefs512 [inputMax][outputMax]string var switchDefsX512 [inputMax][outputMax]string +var switchDefsAvx2GFNI [inputMax][outputMax]string +var switchDefsXAvx2GFNI [inputMax][outputMax]string + // Prefetch offsets, set to 0 to disable. // Disabled since they appear to be consistently slower. const prefetchSrc = 0 @@ -64,8 +67,6 @@ func main() { RET() genXor() - const perLoopBits = 6 - const perLoop = 1 << perLoopBits for i := 1; i <= inputMax; i++ { for j := 1; j <= outputMax; j++ { @@ -74,13 +75,24 @@ func main() { genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false) } genMulAvx512GFNI(fmt.Sprintf("mulGFNI_%dx%d_64", i, j), i, j, false) + genMulAvx2GFNI(fmt.Sprintf("mulAvx2GFNI_%dx%d", i, j), i, j, false) genMulAvx512GFNI(fmt.Sprintf("mulGFNI_%dx%d_64Xor", i, j), i, j, true) + genMulAvx2GFNI(fmt.Sprintf("mulAvx2GFNI_%dx%dXor", i, j), i, j, true) + if pshufb { genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%dXor", i, j), i, j, true) genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64Xor", i, j), i, j, true) } } } + + genSwitch() + genGF16() + genGF8() + Generate() +} + +func genSwitch() { name := "../galois_gen_switch_amd64.go" tag := "// +build !nopshufb\n" if !pshufb { @@ -113,9 +125,8 @@ import ( avx2CodeGen = true maxAvx2Inputs = %d maxAvx2Outputs = %d -minAvx2Size = %d -avxSizeMask = maxInt - (minAvx2Size-1) -)`, inputMax, outputMax, perLoop)) +minAvx2Size = 64 +)`, inputMax, outputMax)) if !pshufb { w.WriteString("\n\nfunc galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`)}\n") @@ -126,7 +137,7 @@ avxSizeMask = maxInt - (minAvx2Size-1) w.WriteString(` func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop-start) & avxSizeMask + n := stop-start `) @@ -145,7 +156,7 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { } func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop-start) & avxSizeMask + n := (stop-start) `) @@ -168,7 +179,7 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { w.WriteString(` func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop-start) & avxSizeMask + n := (stop-start) & (maxInt - (64 - 1)) `) @@ -187,7 +198,7 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { } func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop-start) & avxSizeMask + n := (stop-start) & (maxInt - (64 - 1)) `) @@ -206,9 +217,46 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int } `) - genGF16() - genGF8() - Generate() + 
w.WriteString(` + +func galMulSlicesAvx2GFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop-start) & (maxInt - (32 - 1)) + +`) + + w.WriteString(`switch len(in) { +`) + for in, defs := range switchDefsAvx2GFNI[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") + } + w.WriteString(`} + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvx2GFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop-start) & (maxInt - (32 - 1)) + +`) + + w.WriteString(`switch len(in) { +`) + for in, defs := range switchDefsXAvx2GFNI[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") + } + w.WriteString(`} + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} +`) } // VPXOR3way will 3-way xor a and b and dst. @@ -263,7 +311,7 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { // SWITCH DEFINITION: s := fmt.Sprintf(" mulAvxTwo_%dx%d%s(matrix, in, out, start, n)\n", inputs, outputs, x) - s += fmt.Sprintf("\t\t\t\treturn n\n") + s += fmt.Sprintf("\t\t\t\treturn n & (maxInt - %d)\n", perLoop-1) if xor { switchDefsX[inputs-1][outputs-1] = s } else { @@ -521,7 +569,7 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { // SWITCH DEFINITION: //s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) s := fmt.Sprintf(" mulAvxTwo_%dx%d_64%s(matrix, in, out, start, n)\n", inputs, outputs, x) - s += fmt.Sprintf("\t\t\t\treturn n\n") + s += fmt.Sprintf("\t\t\t\treturn n & (maxInt - %d)\n", perLoop-1) if xor { switchDefsX[inputs-1][outputs-1] = s } else { @@ -959,6 +1007,235 @@ func genMulAvx512GFNI(name string, inputs int, outputs int, xor bool) { RET() } +func genMulAvx2GFNI(name string, inputs int, outputs int, xor bool) { + const perLoopBits = 5 + const perLoop = 1 << perLoopBits + + total := inputs * outputs + + doc := []string{ + fmt.Sprintf("%s takes %d inputs and produces %d outputs.", name, inputs, outputs), + } + if !xor { + doc = append(doc, "The output is initialized to 0.") + } + + // Load shuffle masks on every use. + var loadNone bool + // Use registers for destination registers. + var regDst = true + var reloadLength = false + + est := total + outputs + 2 + // When we can't hold all, keep this many in registers. + inReg := 0 + if est > 16 { + loadNone = true + inReg = 16 - outputs - 2 + // We run out of GP registers first, now. + if inputs+outputs > 13 { + regDst = false + } + // Save one register by reloading length. 
+ if inputs+outputs > 12 && regDst { + reloadLength = true + } + } + + TEXT(name, 0, fmt.Sprintf("func(matrix []uint64, in [][]byte, out [][]byte, start, n int)")) + x := "" + if xor { + x = "Xor" + } + // SWITCH DEFINITION: + //s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) + s := fmt.Sprintf(" mulAvx2GFNI_%dx%d%s(matrix, in, out, start, n)\n", inputs, outputs, x) + s += fmt.Sprintf("\t\t\t\treturn n\n") + if xor { + switchDefsXAvx2GFNI[inputs-1][outputs-1] = s + } else { + switchDefsAvx2GFNI[inputs-1][outputs-1] = s + } + + if loadNone { + Commentf("Loading %d of %d tables to registers", inReg, inputs*outputs) + } else { + // loadNone == false + Comment("Loading all tables to registers") + } + if regDst { + Comment("Destination kept in GP registers") + } else { + Comment("Destination kept on stack") + } + + Doc(doc...) + Pragma("noescape") + Commentf("Full registers estimated %d YMM used", est) + + length := Load(Param("n"), GP64()) + matrixBase := GP64() + addr, err := Param("matrix").Base().Resolve() + if err != nil { + panic(err) + } + MOVQ(addr.Addr, matrixBase) + SHRQ(U8(perLoopBits), length) + TESTQ(length, length) + JZ(LabelRef(name + "_end")) + + matrix := make([]reg.VecVirtual, total) + + for i := range matrix { + if loadNone && i >= inReg { + break + } + table := YMM() + VBROADCASTSD(Mem{Base: matrixBase, Disp: i * 8}, table) + matrix[i] = table + } + + inPtrs := make([]reg.GPVirtual, inputs) + inSlicePtr := GP64() + addr, err = Param("in").Base().Resolve() + if err != nil { + panic(err) + } + MOVQ(addr.Addr, inSlicePtr) + for i := range inPtrs { + ptr := GP64() + MOVQ(Mem{Base: inSlicePtr, Disp: i * 24}, ptr) + inPtrs[i] = ptr + } + // Destination + dst := make([]reg.VecVirtual, outputs) + dstPtr := make([]reg.GPVirtual, outputs) + addr, err = Param("out").Base().Resolve() + if err != nil { + panic(err) + } + outBase := addr.Addr + outSlicePtr := GP64() + MOVQ(addr.Addr, outSlicePtr) + MOVQ(outBase, outSlicePtr) + for i := range dst { + dst[i] = YMM() + if !regDst { + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + dstPtr[i] = ptr + } + + offset := GP64() + addr, err = Param("start").Resolve() + if err != nil { + panic(err) + } + + MOVQ(addr.Addr, offset) + if regDst { + Comment("Add start offset to output") + for _, ptr := range dstPtr { + ADDQ(offset, ptr) + } + } + + Comment("Add start offset to input") + for _, ptr := range inPtrs { + ADDQ(offset, ptr) + } + // Offset no longer needed unless not regdst + + if reloadLength { + Commentf("Reload length to save a register") + length = Load(Param("n"), GP64()) + SHRQ(U8(perLoopBits), length) + } + Label(name + "_loop") + + if xor { + Commentf("Load %d outputs", outputs) + for i := range dst { + if regDst { + VMOVDQU(Mem{Base: dstPtr[i]}, dst[i]) + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) + } + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) + + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + } + } + } + + in := YMM() + look := YMM() + for i := range inPtrs { + Commentf("Load and process 32 bytes from input %d to %d outputs", i, outputs) + VMOVDQU(Mem{Base: inPtrs[i]}, in) + if prefetchSrc > 0 { + PREFETCHT0(Mem{Base: inPtrs[i], Disp: prefetchSrc}) + } + ADDQ(U8(perLoop), inPtrs[i]) + + for j := range dst { + idx := i*outputs + j + if loadNone && idx >= inReg { + tmp := YMM() + if i == 0 && !xor { + 
VBROADCASTSD(Mem{Base: matrixBase, Disp: i * 8}, tmp) + VGF2P8AFFINEQB(U8(0), tmp, in, dst[j]) + } else { + VBROADCASTSD(Mem{Base: matrixBase, Disp: i * 8}, tmp) + VGF2P8AFFINEQB(U8(0), tmp, in, look) + VXORPD(dst[j], look, dst[j]) + } + } else { + if i == 0 && !xor { + VGF2P8AFFINEQB(U8(0), matrix[i*outputs+j], in, dst[j]) + } else { + VGF2P8AFFINEQB(U8(0), matrix[i*outputs+j], in, look) + VXORPD(dst[j], look, dst[j]) + } + } + } + } + Commentf("Store %d outputs", outputs) + for i := range dst { + if regDst { + VMOVDQU(dst[i], Mem{Base: dstPtr[i]}) + if prefetchDst > 0 && !xor { + PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) + } + ADDQ(U8(perLoop), dstPtr[i]) + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 1}) + if prefetchDst > 0 && !xor { + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + } + } + Comment("Prepare for next loop") + if !regDst { + ADDQ(U8(perLoop), offset) + } + DECQ(length) + JNZ(LabelRef(name + "_loop")) + VZEROUPPER() + + Label(name + "_end") + RET() +} + func genXor() { // SSE 2 { diff --git a/galois_amd64.go b/galois_amd64.go index c7ab3663..8099f166 100644 --- a/galois_amd64.go +++ b/galois_amd64.go @@ -225,7 +225,7 @@ func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *optio return } - if false && o.useGFNI { + if false && o.useAvx512GFNI { // Note that these currently require that length is multiple of 64. t01 := gf2p811dMulMatrices[log_m01] t23 := gf2p811dMulMatrices[log_m23] @@ -380,7 +380,7 @@ func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *option return } - if false && o.useGFNI { + if false && o.useAvx512GFNI { t01 := gf2p811dMulMatrices[log_m01] t23 := gf2p811dMulMatrices[log_m23] t02 := gf2p811dMulMatrices[log_m02] diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 43184349..237c9ddd 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -33,11 +33,22 @@ func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x1 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. // //go:noescape func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x1Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x1Xor takes 1 inputs and produces 1 outputs. // //go:noescape @@ -66,11 +77,22 @@ func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x2 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. // //go:noescape func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x2Xor takes 1 inputs and produces 2 outputs. 
+// +//go:noescape +func mulAvx2GFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x2Xor takes 1 inputs and produces 2 outputs. // //go:noescape @@ -99,11 +121,22 @@ func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x3 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. // //go:noescape func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x3Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x3Xor takes 1 inputs and produces 3 outputs. // //go:noescape @@ -126,11 +159,22 @@ func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. // //go:noescape func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x4Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x4Xor takes 1 inputs and produces 4 outputs. // //go:noescape @@ -148,11 +192,22 @@ func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. // //go:noescape func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x5Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x5Xor takes 1 inputs and produces 5 outputs. // //go:noescape @@ -170,11 +225,22 @@ func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. // //go:noescape func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x6Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x6Xor takes 1 inputs and produces 6 outputs. 
// //go:noescape @@ -192,11 +258,22 @@ func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. // //go:noescape func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x7Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x7Xor takes 1 inputs and produces 7 outputs. // //go:noescape @@ -214,11 +291,22 @@ func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. // //go:noescape func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x8Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x8Xor takes 1 inputs and produces 8 outputs. // //go:noescape @@ -236,11 +324,22 @@ func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x9 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. // //go:noescape func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x9Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x9Xor takes 1 inputs and produces 9 outputs. // //go:noescape @@ -258,11 +357,22 @@ func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x10 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. // //go:noescape func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x10Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. // //go:noescape @@ -286,11 +396,22 @@ func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x1 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. // //go:noescape func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x1Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x1Xor takes 2 inputs and produces 1 outputs. // //go:noescape @@ -319,11 +440,22 @@ func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x2 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. // //go:noescape func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x2Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x2Xor takes 2 inputs and produces 2 outputs. // //go:noescape @@ -352,11 +484,22 @@ func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x3 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. // //go:noescape func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x3Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x3Xor takes 2 inputs and produces 3 outputs. // //go:noescape @@ -379,11 +522,22 @@ func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. // //go:noescape func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x4Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x4Xor takes 2 inputs and produces 4 outputs. // //go:noescape @@ -401,11 +555,22 @@ func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. // //go:noescape func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x5Xor takes 2 inputs and produces 5 outputs. 
+// +//go:noescape +func mulAvx2GFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x5Xor takes 2 inputs and produces 5 outputs. // //go:noescape @@ -423,11 +588,22 @@ func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. // //go:noescape func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x6Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x6Xor takes 2 inputs and produces 6 outputs. // //go:noescape @@ -445,11 +621,22 @@ func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. // //go:noescape func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x7Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x7Xor takes 2 inputs and produces 7 outputs. // //go:noescape @@ -467,11 +654,22 @@ func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. // //go:noescape func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x8Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x8Xor takes 2 inputs and produces 8 outputs. // //go:noescape @@ -489,11 +687,22 @@ func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x9 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. // //go:noescape func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x9Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x9Xor takes 2 inputs and produces 9 outputs. 
// //go:noescape @@ -511,11 +720,22 @@ func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x10 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. // //go:noescape func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x10Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. // //go:noescape @@ -539,11 +759,22 @@ func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x1 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. // //go:noescape func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x1Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x1Xor takes 3 inputs and produces 1 outputs. // //go:noescape @@ -572,11 +803,22 @@ func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x2 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. // //go:noescape func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x2Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x2Xor takes 3 inputs and produces 2 outputs. // //go:noescape @@ -605,11 +847,22 @@ func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x3 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. // //go:noescape func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x3Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x3Xor takes 3 inputs and produces 3 outputs. // //go:noescape @@ -632,11 +885,22 @@ func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. // //go:noescape func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x4Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x4Xor takes 3 inputs and produces 4 outputs. // //go:noescape @@ -654,11 +918,22 @@ func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. // //go:noescape func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x5Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x5Xor takes 3 inputs and produces 5 outputs. // //go:noescape @@ -676,11 +951,22 @@ func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. // //go:noescape func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x6Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x6Xor takes 3 inputs and produces 6 outputs. // //go:noescape @@ -698,11 +984,22 @@ func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. // //go:noescape func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x7Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x7Xor takes 3 inputs and produces 7 outputs. // //go:noescape @@ -720,11 +1017,22 @@ func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. // //go:noescape func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x8Xor takes 3 inputs and produces 8 outputs. 
+// +//go:noescape +func mulAvx2GFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x8Xor takes 3 inputs and produces 8 outputs. // //go:noescape @@ -742,11 +1050,22 @@ func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x9 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. // //go:noescape func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x9Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x9Xor takes 3 inputs and produces 9 outputs. // //go:noescape @@ -764,11 +1083,22 @@ func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x10 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. // //go:noescape func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x10Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. // //go:noescape @@ -792,11 +1122,22 @@ func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x1 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. // //go:noescape func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x1Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x1Xor takes 4 inputs and produces 1 outputs. // //go:noescape @@ -825,11 +1166,22 @@ func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x2 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. // //go:noescape func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x2Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x2Xor takes 4 inputs and produces 2 outputs. 
// //go:noescape @@ -858,11 +1210,22 @@ func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x3 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. // //go:noescape func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x3Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x3Xor takes 4 inputs and produces 3 outputs. // //go:noescape @@ -885,11 +1248,22 @@ func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. // //go:noescape func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x4Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x4Xor takes 4 inputs and produces 4 outputs. // //go:noescape @@ -907,11 +1281,22 @@ func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. // //go:noescape func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x5Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x5Xor takes 4 inputs and produces 5 outputs. // //go:noescape @@ -929,11 +1314,22 @@ func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. // //go:noescape func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x6Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs. // //go:noescape @@ -951,11 +1347,22 @@ func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. // //go:noescape func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs. // //go:noescape @@ -973,11 +1380,22 @@ func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. // //go:noescape func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x8Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x8Xor takes 4 inputs and produces 8 outputs. // //go:noescape @@ -995,11 +1413,22 @@ func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x9 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. // //go:noescape func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x9Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x9Xor takes 4 inputs and produces 9 outputs. // //go:noescape @@ -1017,11 +1446,22 @@ func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x10 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. // //go:noescape func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x10Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. // //go:noescape @@ -1045,11 +1485,22 @@ func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x1 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. 
// //go:noescape func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x1Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x1Xor takes 5 inputs and produces 1 outputs. // //go:noescape @@ -1078,11 +1529,22 @@ func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x2 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. // //go:noescape func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x2Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x2Xor takes 5 inputs and produces 2 outputs. // //go:noescape @@ -1111,11 +1573,22 @@ func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x3 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. // //go:noescape func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x3Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x3Xor takes 5 inputs and produces 3 outputs. // //go:noescape @@ -1138,11 +1611,22 @@ func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. // //go:noescape func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x4Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x4Xor takes 5 inputs and produces 4 outputs. // //go:noescape @@ -1160,11 +1644,22 @@ func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. // //go:noescape func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x5Xor takes 5 inputs and produces 5 outputs. 
+// +//go:noescape +func mulAvx2GFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x5Xor takes 5 inputs and produces 5 outputs. // //go:noescape @@ -1182,11 +1677,22 @@ func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. // //go:noescape func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x6Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x6Xor takes 5 inputs and produces 6 outputs. // //go:noescape @@ -1204,11 +1710,22 @@ func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. // //go:noescape func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x7Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x7Xor takes 5 inputs and produces 7 outputs. // //go:noescape @@ -1226,11 +1743,22 @@ func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. // //go:noescape func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x8Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x8Xor takes 5 inputs and produces 8 outputs. // //go:noescape @@ -1248,11 +1776,22 @@ func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x9 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. // //go:noescape func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x9Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x9Xor takes 5 inputs and produces 9 outputs. 
// //go:noescape @@ -1270,11 +1809,22 @@ func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x10 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. // //go:noescape func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x10Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. // //go:noescape @@ -1298,11 +1848,22 @@ func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x1 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. // //go:noescape func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x1Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x1Xor takes 6 inputs and produces 1 outputs. // //go:noescape @@ -1331,11 +1892,22 @@ func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x2 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. // //go:noescape func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x2Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x2Xor takes 6 inputs and produces 2 outputs. // //go:noescape @@ -1364,11 +1936,22 @@ func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x3 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. // //go:noescape func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x3Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x3Xor takes 6 inputs and produces 3 outputs. // //go:noescape @@ -1391,11 +1974,22 @@ func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x4 takes 6 inputs and produces 4 outputs. 
+// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. // //go:noescape func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x4Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x4Xor takes 6 inputs and produces 4 outputs. // //go:noescape @@ -1413,11 +2007,22 @@ func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. // //go:noescape func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x5Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x5Xor takes 6 inputs and produces 5 outputs. // //go:noescape @@ -1435,11 +2040,22 @@ func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. // //go:noescape func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x6Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x6Xor takes 6 inputs and produces 6 outputs. // //go:noescape @@ -1457,11 +2073,22 @@ func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. // //go:noescape func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x7Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x7Xor takes 6 inputs and produces 7 outputs. // //go:noescape @@ -1479,11 +2106,22 @@ func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. 
// //go:noescape func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x8Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x8Xor takes 6 inputs and produces 8 outputs. // //go:noescape @@ -1501,11 +2139,22 @@ func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x9 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. // //go:noescape func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x9Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x9Xor takes 6 inputs and produces 9 outputs. // //go:noescape @@ -1523,11 +2172,22 @@ func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x10 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. // //go:noescape func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x10Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. // //go:noescape @@ -1551,11 +2211,22 @@ func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x1 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. // //go:noescape func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x1Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x1Xor takes 7 inputs and produces 1 outputs. // //go:noescape @@ -1584,11 +2255,22 @@ func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x2 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. // //go:noescape func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x2Xor takes 7 inputs and produces 2 outputs. 
+// +//go:noescape +func mulAvx2GFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x2Xor takes 7 inputs and produces 2 outputs. // //go:noescape @@ -1617,11 +2299,22 @@ func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x3 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. // //go:noescape func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x3Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x3Xor takes 7 inputs and produces 3 outputs. // //go:noescape @@ -1644,11 +2337,22 @@ func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. // //go:noescape func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x4Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x4Xor takes 7 inputs and produces 4 outputs. // //go:noescape @@ -1666,11 +2370,22 @@ func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. // //go:noescape func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x5Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x5Xor takes 7 inputs and produces 5 outputs. // //go:noescape @@ -1688,11 +2403,22 @@ func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. // //go:noescape func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x6Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x6Xor takes 7 inputs and produces 6 outputs. 
// //go:noescape @@ -1710,11 +2436,22 @@ func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. // //go:noescape func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x7Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x7Xor takes 7 inputs and produces 7 outputs. // //go:noescape @@ -1732,11 +2469,22 @@ func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. // //go:noescape func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x8Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x8Xor takes 7 inputs and produces 8 outputs. // //go:noescape @@ -1754,11 +2502,22 @@ func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x9 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. // //go:noescape func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x9Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x9Xor takes 7 inputs and produces 9 outputs. // //go:noescape @@ -1776,11 +2535,22 @@ func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x10 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. // //go:noescape func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x10Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. // //go:noescape @@ -1804,11 +2574,22 @@ func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x1 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. // //go:noescape func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x1Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x1Xor takes 8 inputs and produces 1 outputs. // //go:noescape @@ -1837,11 +2618,22 @@ func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x2 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. // //go:noescape func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x2Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x2Xor takes 8 inputs and produces 2 outputs. // //go:noescape @@ -1870,11 +2662,22 @@ func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x3 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. // //go:noescape func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x3Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x3Xor takes 8 inputs and produces 3 outputs. // //go:noescape @@ -1897,11 +2700,22 @@ func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. // //go:noescape func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x4Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x4Xor takes 8 inputs and produces 4 outputs. // //go:noescape @@ -1919,11 +2733,22 @@ func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. 
// //go:noescape func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x5Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x5Xor takes 8 inputs and produces 5 outputs. // //go:noescape @@ -1941,11 +2766,22 @@ func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. // //go:noescape func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x6Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x6Xor takes 8 inputs and produces 6 outputs. // //go:noescape @@ -1963,11 +2799,22 @@ func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. // //go:noescape func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x7Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x7Xor takes 8 inputs and produces 7 outputs. // //go:noescape @@ -1985,11 +2832,22 @@ func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. // //go:noescape func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x8Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x8Xor takes 8 inputs and produces 8 outputs. // //go:noescape @@ -2007,11 +2865,22 @@ func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x9 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. // //go:noescape func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x9Xor takes 8 inputs and produces 9 outputs. 
+// +//go:noescape +func mulAvx2GFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x9Xor takes 8 inputs and produces 9 outputs. // //go:noescape @@ -2029,11 +2898,22 @@ func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x10 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. // //go:noescape func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x10Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. // //go:noescape @@ -2057,11 +2937,22 @@ func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x1 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. // //go:noescape func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x1Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x1Xor takes 9 inputs and produces 1 outputs. // //go:noescape @@ -2090,11 +2981,22 @@ func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x2 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. // //go:noescape func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x2Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x2Xor takes 9 inputs and produces 2 outputs. // //go:noescape @@ -2123,11 +3025,22 @@ func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x3 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. // //go:noescape func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x3Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x3Xor takes 9 inputs and produces 3 outputs. 
// //go:noescape @@ -2150,11 +3063,22 @@ func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. // //go:noescape func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x4Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x4Xor takes 9 inputs and produces 4 outputs. // //go:noescape @@ -2172,11 +3096,22 @@ func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. // //go:noescape func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x5Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x5Xor takes 9 inputs and produces 5 outputs. // //go:noescape @@ -2194,11 +3129,22 @@ func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. // //go:noescape func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x6Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x6Xor takes 9 inputs and produces 6 outputs. // //go:noescape @@ -2216,11 +3162,22 @@ func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. // //go:noescape func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x7Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x7Xor takes 9 inputs and produces 7 outputs. // //go:noescape @@ -2238,11 +3195,22 @@ func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. // //go:noescape func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x8Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x8Xor takes 9 inputs and produces 8 outputs. // //go:noescape @@ -2260,11 +3228,22 @@ func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x9 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. // //go:noescape func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x9Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x9Xor takes 9 inputs and produces 9 outputs. // //go:noescape @@ -2282,11 +3261,22 @@ func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x10 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. // //go:noescape func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x10Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. // //go:noescape @@ -2310,11 +3300,22 @@ func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x1 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. // //go:noescape func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x1Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x1Xor takes 10 inputs and produces 1 outputs. // //go:noescape @@ -2343,11 +3344,22 @@ func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x2 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. 
// //go:noescape func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x2Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x2Xor takes 10 inputs and produces 2 outputs. // //go:noescape @@ -2376,11 +3388,22 @@ func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x3 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. // //go:noescape func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x3Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x3Xor takes 10 inputs and produces 3 outputs. // //go:noescape @@ -2403,11 +3426,22 @@ func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. // //go:noescape func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x4Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x4Xor takes 10 inputs and produces 4 outputs. // //go:noescape @@ -2425,11 +3459,22 @@ func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. // //go:noescape func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x5Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x5Xor takes 10 inputs and produces 5 outputs. // //go:noescape @@ -2447,11 +3492,22 @@ func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. // //go:noescape func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x6Xor takes 10 inputs and produces 6 outputs. 
+// +//go:noescape +func mulAvx2GFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x6Xor takes 10 inputs and produces 6 outputs. // //go:noescape @@ -2469,11 +3525,22 @@ func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. // //go:noescape func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x7Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x7Xor takes 10 inputs and produces 7 outputs. // //go:noescape @@ -2491,11 +3558,22 @@ func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. // //go:noescape func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x8Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x8Xor takes 10 inputs and produces 8 outputs. // //go:noescape @@ -2513,11 +3591,22 @@ func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x9 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. // //go:noescape func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x9Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x9Xor takes 10 inputs and produces 9 outputs. // //go:noescape @@ -2535,11 +3624,22 @@ func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x10 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. // //go:noescape func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x10Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs. 
// //go:noescape diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index b3d0d998..4e414408 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -264,6 +264,49 @@ mulGFNI_1x1_64_loop: mulGFNI_1x1_64_end: RET +// func mulAvx2GFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x1_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvx2GFNI_1x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y1 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x1_loop + VZEROUPPER + +mulAvx2GFNI_1x1_end: + RET + // func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 @@ -311,6 +354,53 @@ mulGFNI_1x1_64Xor_loop: mulGFNI_1x1_64Xor_end: RET +// func mulAvx2GFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x1Xor_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvx2GFNI_1x1Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y1 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y2 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y2, Y2 + VXORPD Y1, Y2, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x1Xor_end: + RET + // func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x1Xor(SB), NOSPLIT, $0-88 @@ -609,6 +699,55 @@ mulGFNI_1x2_64_loop: mulGFNI_1x2_64_end: RET +// func mulAvx2GFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvx2GFNI_1x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + + // Store 2 
outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x2_loop + VZEROUPPER + +mulAvx2GFNI_1x2_end: + RET + // func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 @@ -664,6 +803,61 @@ mulGFNI_1x2_64Xor_loop: mulGFNI_1x2_64Xor_end: RET +// func mulAvx2GFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvx2GFNI_1x2Xor_loop: + // Load 2 outputs + VMOVDQU (BX), Y2 + VMOVDQU (DX), Y3 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y5 + VXORPD Y2, Y5, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y4, Y5 + VXORPD Y3, Y5, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x2Xor_end: + RET + // func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x2Xor(SB), NOSPLIT, $0-88 @@ -1015,6 +1209,61 @@ mulGFNI_1x3_64_loop: mulGFNI_1x3_64_end: RET +// func mulAvx2GFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvx2GFNI_1x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y5, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x3_loop + VZEROUPPER + +mulAvx2GFNI_1x3_end: + RET + // func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 @@ -1078,6 +1327,69 @@ mulGFNI_1x3_64Xor_loop: mulGFNI_1x3_64Xor_end: RET +// func mulAvx2GFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers 
estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvx2GFNI_1x3Xor_loop: + // Load 3 outputs + VMOVDQU (BX), Y3 + VMOVDQU (SI), Y4 + VMOVDQU (DX), Y5 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y3, Y7, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x3Xor_end: + RET + // func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 @@ -1383,6 +1695,67 @@ mulGFNI_1x4_64_loop: mulGFNI_1x4_64_end: RET +// func mulAvx2GFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvx2GFNI_1x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y7, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y7, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x4_loop + VZEROUPPER + +mulAvx2GFNI_1x4_end: + RET + // func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 @@ -1454,6 +1827,77 @@ mulGFNI_1x4_64Xor_loop: mulGFNI_1x4_64Xor_end: RET +// func mulAvx2GFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI 
+ MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvx2GFNI_1x4Xor_loop: + // Load 4 outputs + VMOVDQU (BX), Y4 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (DX), Y7 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y4, Y9, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y5, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x4Xor_end: + RET + // func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88 @@ -1690,6 +2134,73 @@ mulGFNI_1x5_64_loop: mulGFNI_1x5_64_end: RET +// func mulAvx2GFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x5(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvx2GFNI_1x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y9, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x5_loop + VZEROUPPER + +mulAvx2GFNI_1x5_end: + RET + // func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 @@ -1769,6 +2280,85 @@ mulGFNI_1x5_64Xor_loop: mulGFNI_1x5_64Xor_end: RET +// func mulAvx2GFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x5Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), 
R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvx2GFNI_1x5Xor_loop: + // Load 5 outputs + VMOVDQU (BX), Y5 + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (DX), Y9 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y5, Y11, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y6, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y7, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x5Xor_end: + RET + // func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88 @@ -2030,6 +2620,79 @@ mulGFNI_1x6_64_loop: mulGFNI_1x6_64_end: RET +// func mulAvx2GFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x6(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvx2GFNI_1x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y11, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x6_loop + VZEROUPPER + +mulAvx2GFNI_1x6_end: + RET + // func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 @@ -2117,6 +2780,93 @@ mulGFNI_1x6_64Xor_loop: mulGFNI_1x6_64Xor_end: RET +// func mulAvx2GFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x6Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + 
VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvx2GFNI_1x6Xor_loop: + // Load 6 outputs + VMOVDQU (BX), Y6 + VMOVDQU (SI), Y7 + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (DX), Y11 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y6, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y7, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x6Xor_end: + RET + // func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88 @@ -2403,6 +3153,85 @@ mulGFNI_1x7_64_loop: mulGFNI_1x7_64_end: RET +// func mulAvx2GFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x7(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvx2GFNI_1x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y13, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x7_loop + VZEROUPPER + +mulAvx2GFNI_1x7_end: + RET + // func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, 
AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 @@ -2498,6 +3327,101 @@ mulGFNI_1x7_64Xor_loop: mulGFNI_1x7_64Xor_end: RET +// func mulAvx2GFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x7Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvx2GFNI_1x7Xor_loop: + // Load 7 outputs + VMOVDQU (BX), Y7 + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DX), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x7Xor_end: + RET + // func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 @@ -2809,6 +3733,91 @@ mulGFNI_1x8_64_loop: mulGFNI_1x8_64_end: RET +// func mulAvx2GFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x8(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvx2GFNI_1x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU 
(DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD (CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x8_loop + VZEROUPPER + +mulAvx2GFNI_1x8_end: + RET + // func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 @@ -2912,6 +3921,109 @@ mulGFNI_1x8_64Xor_loop: mulGFNI_1x8_64Xor_end: RET +// func mulAvx2GFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x8Xor(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvx2GFNI_1x8Xor_loop: + // Load 8 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x8Xor_end: + RET + // func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 @@ -3248,6 +4360,97 @@ mulGFNI_1x9_64_loop: mulGFNI_1x9_64_end: 
RET +// func mulAvx2GFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x9(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvx2GFNI_1x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD (CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x9_loop + VZEROUPPER + +mulAvx2GFNI_1x9_end: + RET + // func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 @@ -3359,6 +4562,117 @@ mulGFNI_1x9_64Xor_loop: mulGFNI_1x9_64Xor_end: RET +// func mulAvx2GFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x9Xor(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvx2GFNI_1x9Xor_loop: + // Load 9 outputs + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from 
input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x9Xor_end: + RET + // func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 @@ -3720,6 +5034,103 @@ mulGFNI_1x10_64_loop: mulGFNI_1x10_64_end: RET +// func mulAvx2GFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x10(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvx2GFNI_1x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y13, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y13, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD (CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x10_loop + VZEROUPPER + 
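Each mulAvx2GFNI kernel above follows the same pattern: broadcast one 8-byte coefficient per (input, output) pair from the matrix slice, then, 32 bytes at a time, apply VGF2P8AFFINEQB to every input block and XOR the products into the per-output accumulators. The Go sketch below is a scalar reference for what one such kernel computes; it is illustrative only, not part of the generated code, and the reduction polynomial 0x11D is an assumption made for the example.

package main

import "fmt"

// gfMul multiplies a and b in GF(2^8) modulo x^8+x^4+x^3+x^2+1 (0x11D, assumed here).
func gfMul(a, b byte) byte {
	var p byte
	for b > 0 {
		if b&1 != 0 {
			p ^= a
		}
		carry := a & 0x80
		a <<= 1
		if carry != 0 {
			a ^= 0x1d // low byte of the assumed reduction polynomial
		}
		b >>= 1
	}
	return p
}

// mulRef is a scalar stand-in for a mulAvx2GFNI_NxM / _NxMXor kernel:
// coeffs[j][i] plays the role of the broadcast 8-byte table for (input i, output j).
// When xor is false the outputs are overwritten, matching the non-Xor kernels.
func mulRef(coeffs [][]byte, in, out [][]byte, xor bool) {
	for j := range out {
		for k := range out[j] {
			if !xor {
				out[j][k] = 0
			}
			for i := range in {
				out[j][k] ^= gfMul(coeffs[j][i], in[i][k])
			}
		}
	}
}

func main() {
	in := [][]byte{{1, 2, 3, 4}, {5, 6, 7, 8}}
	out := [][]byte{make([]byte, 4)}
	mulRef([][]byte{{2, 3}}, in, out, false)
	fmt.Println(out[0]) // per byte: 2*in0 ^ 3*in1 in GF(2^8)
}

The Xor variants differ only in that they first load the existing output bytes ("Load N outputs") and accumulate into them instead of starting from zero.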
+mulAvx2GFNI_1x10_end: + RET + // func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 @@ -3839,6 +5250,125 @@ mulGFNI_1x10_64Xor_loop: mulGFNI_1x10_64Xor_end: RET +// func mulAvx2GFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x10Xor(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvx2GFNI_1x10Xor_loop: + // Load 10 outputs + VMOVDQU (SI), Y4 + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x10Xor_end: + RET + // func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 @@ -4179,6 +5709,58 @@ mulGFNI_2x1_64_loop: mulGFNI_2x1_64_end: RET +// func mulAvx2GFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x1_end + VBROADCASTSD (CX), Y0 + 
VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvx2GFNI_2x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x1_loop + VZEROUPPER + +mulAvx2GFNI_2x1_end: + RET + // func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 @@ -4235,6 +5817,62 @@ mulGFNI_2x1_64Xor_loop: mulGFNI_2x1_64Xor_end: RET +// func mulAvx2GFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvx2GFNI_2x1Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y2 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x1Xor_end: + RET + // func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1Xor(SB), NOSPLIT, $0-88 @@ -4628,6 +6266,67 @@ mulGFNI_2x2_64_loop: mulGFNI_2x2_64_end: RET +// func mulAvx2GFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvx2GFNI_2x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 
+ VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x2_loop + VZEROUPPER + +mulAvx2GFNI_2x2_end: + RET + // func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 @@ -4695,6 +6394,73 @@ mulGFNI_2x2_64Xor_loop: mulGFNI_2x2_64Xor_end: RET +// func mulAvx2GFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvx2GFNI_2x2Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y4 + VMOVDQU (BX), Y5 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x2Xor_end: + RET + // func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2Xor(SB), NOSPLIT, $0-88 @@ -5170,6 +6936,76 @@ mulGFNI_2x3_64_loop: mulGFNI_2x3_64_end: RET +// func mulAvx2GFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvx2GFNI_2x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, 
Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x3_loop + VZEROUPPER + +mulAvx2GFNI_2x3_end: + RET + // func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 @@ -5248,6 +7084,84 @@ mulGFNI_2x3_64Xor_loop: mulGFNI_2x3_64Xor_end: RET +// func mulAvx2GFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvx2GFNI_2x3Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (BX), Y8 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x3Xor_end: + RET + // func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3Xor(SB), NOSPLIT, $0-88 @@ -5661,6 +7575,85 @@ mulGFNI_2x4_64_loop: mulGFNI_2x4_64_end: RET +// func mulAvx2GFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvx2GFNI_2x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + 
VGF2P8AFFINEQB $0x00, Y0, Y12, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x4_loop + VZEROUPPER + +mulAvx2GFNI_2x4_end: + RET + // func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 @@ -5750,6 +7743,95 @@ mulGFNI_2x4_64Xor_loop: mulGFNI_2x4_64Xor_end: RET +// func mulAvx2GFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvx2GFNI_2x4Xor_loop: + // Load 4 outputs + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (BX), Y11 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x4Xor_end: + RET + // func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 @@ -6070,6 +8152,94 @@ mulGFNI_2x5_64_loop: mulGFNI_2x5_64_end: RET +// func mulAvx2GFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x5(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ 
matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvx2GFNI_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x5_loop + VZEROUPPER + +mulAvx2GFNI_2x5_end: + RET + // func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 @@ -6170,6 +8340,106 @@ mulGFNI_2x5_64Xor_loop: mulGFNI_2x5_64Xor_end: RET +// func mulAvx2GFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x5Xor(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvx2GFNI_2x5Xor_loop: + // Load 5 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + 
// Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x5Xor_end: + RET + // func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 @@ -6528,6 +8798,103 @@ mulGFNI_2x6_64_loop: mulGFNI_2x6_64_end: RET +// func mulAvx2GFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x6(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvx2GFNI_2x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x6_loop + VZEROUPPER + +mulAvx2GFNI_2x6_end: + RET + // func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 @@ -6639,6 +9006,117 @@ mulGFNI_2x6_64Xor_loop: mulGFNI_2x6_64Xor_end: RET +// func mulAvx2GFNI_2x6Xor(matrix []uint64, in 
[][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x6Xor(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvx2GFNI_2x6Xor_loop: + // Load 6 outputs + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x6Xor_end: + RET + // func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 @@ -7035,6 +9513,112 @@ mulGFNI_2x7_64_loop: mulGFNI_2x7_64_end: RET +// func mulAvx2GFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x7(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), 
SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvx2GFNI_2x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x7_loop + VZEROUPPER + +mulAvx2GFNI_2x7_end: + RET + // func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 @@ -7157,6 +9741,128 @@ mulGFNI_2x7_64Xor_loop: mulGFNI_2x7_64Xor_end: RET +// func mulAvx2GFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x7Xor(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvx2GFNI_2x7Xor_loop: + // Load 7 outputs + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x7Xor_end: + RET + // func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 @@ -7591,6 +10297,121 @@ mulGFNI_2x8_64_loop: mulGFNI_2x8_64_end: RET +// func mulAvx2GFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x8(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvx2GFNI_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x8_loop + VZEROUPPER + +mulAvx2GFNI_2x8_end: + RET + // func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 @@ -7724,6 +10545,139 @@ mulGFNI_2x8_64Xor_loop: mulGFNI_2x8_64Xor_end: RET +// func mulAvx2GFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x8Xor(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvx2GFNI_2x8Xor_loop: + // Load 8 outputs + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + 
VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x8Xor_end: + RET + // func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 @@ -8196,6 +11150,130 @@ mulGFNI_2x9_64_loop: mulGFNI_2x9_64_end: RET +// func mulAvx2GFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x9(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvx2GFNI_2x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x9_loop + 
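As in the other kernels, the loop that just closed runs once per 32-byte block: n is converted to a block count with SHRQ $0x05, start is added to every input and output pointer, and each pointer advances by 0x20 per iteration, so only whole 32-byte blocks are touched and any tail is left to other code paths. A minimal sketch of that length handling follows; the helper name is made up for illustration and is not part of the repository.

package main

import "fmt"

// blocks32 reports how many 32-byte blocks a kernel call will process and
// how many trailing bytes remain for a fallback routine.
func blocks32(start, stop int) (blocks, done, tail int) {
	n := stop - start
	blocks = n >> 5    // SHRQ $0x05, AX
	done = blocks << 5 // bytes actually processed by the 32-byte loop
	tail = n - done    // remainder handled elsewhere
	return
}

func main() {
	b, d, t := blocks32(0, 1000)
	fmt.Println(b, d, t) // 31 blocks, 992 bytes processed, 8 bytes left over
}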
VZEROUPPER + +mulAvx2GFNI_2x9_end: + RET + // func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 @@ -8340,6 +11418,150 @@ mulGFNI_2x9_64Xor_loop: mulGFNI_2x9_64Xor_end: RET +// func mulAvx2GFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x9Xor(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvx2GFNI_2x9Xor_loop: + // Load 9 outputs + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop 
+ DECQ AX + JNZ mulAvx2GFNI_2x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x9Xor_end: + RET + // func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88 @@ -8850,6 +12072,139 @@ mulGFNI_2x10_64_loop: mulGFNI_2x10_64_end: RET +// func mulAvx2GFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x10(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvx2GFNI_2x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x10_loop + VZEROUPPER + +mulAvx2GFNI_2x10_end: + RET + // func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 @@ -9005,6 +12360,161 @@ mulGFNI_2x10_64Xor_loop: mulGFNI_2x10_64Xor_end: RET +// func mulAvx2GFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x10Xor(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvx2GFNI_2x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + 
ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x10Xor_end: + RET + // func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 @@ -9448,6 +12958,67 @@ mulGFNI_3x1_64_loop: mulGFNI_3x1_64_end: RET +// func mulAvx2GFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvx2GFNI_3x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x1_loop + VZEROUPPER + +mulAvx2GFNI_3x1_end: + RET + // func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 @@ -9513,6 +13084,71 @@ mulGFNI_3x1_64Xor_loop: mulGFNI_3x1_64Xor_end: RET +// func mulAvx2GFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvx2GFNI_3x1Xor_loop: + // Load 1 outputs + VMOVDQU (SI), Y3 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x1Xor_end: + RET + // func 
mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1Xor(SB), NOSPLIT, $0-88 @@ -10001,6 +13637,79 @@ mulGFNI_3x2_64_loop: mulGFNI_3x2_64_end: RET +// func mulAvx2GFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvx2GFNI_3x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x2_loop + VZEROUPPER + +mulAvx2GFNI_3x2_end: + RET + // func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 @@ -10080,6 +13789,85 @@ mulGFNI_3x2_64Xor_loop: mulGFNI_3x2_64Xor_end: RET +// func mulAvx2GFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvx2GFNI_3x2Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y6 + VMOVDQU (SI), Y7 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU 
(CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x2Xor_end: + RET + // func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2Xor(SB), NOSPLIT, $0-88 @@ -10679,6 +14467,91 @@ mulGFNI_3x3_64_loop: mulGFNI_3x3_64_end: RET +// func mulAvx2GFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvx2GFNI_3x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x3_loop + VZEROUPPER + +mulAvx2GFNI_3x3_end: + RET + // func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 @@ -10772,6 +14645,99 @@ mulGFNI_3x3_64Xor_loop: mulGFNI_3x3_64Xor_end: RET +// func mulAvx2GFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI 
+ MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvx2GFNI_3x3Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (SI), Y11 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x3Xor_end: + RET + // func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3Xor(SB), NOSPLIT, $0-88 @@ -11293,6 +15259,103 @@ mulGFNI_3x4_64_loop: mulGFNI_3x4_64_end: RET +// func mulAvx2GFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x4(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvx2GFNI_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x4_loop + VZEROUPPER + +mulAvx2GFNI_3x4_end: + RET + // func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 @@ -11400,6 +15463,113 @@ mulGFNI_3x4_64Xor_loop: mulGFNI_3x4_64Xor_end: RET +// func mulAvx2GFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x4Xor(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvx2GFNI_3x4Xor_loop: + // Load 4 outputs + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x4Xor_end: + RET + // func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 @@ -11804,6 +15974,115 @@ mulGFNI_3x5_64_loop: mulGFNI_3x5_64_end: RET +// func mulAvx2GFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT 
·mulAvx2GFNI_3x5(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvx2GFNI_3x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x5_loop + VZEROUPPER + +mulAvx2GFNI_3x5_end: + RET + // func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 @@ -11925,6 +16204,127 @@ mulGFNI_3x5_64Xor_loop: mulGFNI_3x5_64Xor_end: RET +// func mulAvx2GFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x5Xor(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 
72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvx2GFNI_3x5Xor_loop: + // Load 5 outputs + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x5Xor_end: + RET + // func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 @@ -12380,6 +16780,127 @@ mulGFNI_3x6_64_loop: mulGFNI_3x6_64_end: RET +// func mulAvx2GFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x6(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvx2GFNI_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, 
Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x6_loop + VZEROUPPER + +mulAvx2GFNI_3x6_end: + RET + // func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 @@ -12515,6 +17036,141 @@ mulGFNI_3x6_64Xor_loop: mulGFNI_3x6_64Xor_end: RET +// func mulAvx2GFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x6Xor(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvx2GFNI_3x6Xor_loop: + // Load 6 outputs + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x6Xor_end: + RET + // func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 @@ -13021,6 +17677,139 @@ mulGFNI_3x7_64_loop: mulGFNI_3x7_64_end: RET +// func mulAvx2GFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x7(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvx2GFNI_3x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x7_loop + VZEROUPPER + +mulAvx2GFNI_3x7_end: + RET + // func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 @@ -13170,6 +17959,155 @@ mulGFNI_3x7_64Xor_loop: mulGFNI_3x7_64Xor_end: RET +// func mulAvx2GFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x7Xor(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvx2GFNI_3x7Xor_loop: + // Load 7 outputs + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // 
Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x7Xor_end: + RET + // func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 @@ -13725,6 +18663,151 @@ mulGFNI_3x8_64_loop: mulGFNI_3x8_64_end: RET +// func mulAvx2GFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x8(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvx2GFNI_3x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + 
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x8_loop + VZEROUPPER + +mulAvx2GFNI_3x8_end: + RET + // func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 @@ -13886,6 +18969,169 @@ mulGFNI_3x8_64Xor_loop: mulGFNI_3x8_64Xor_end: RET +// func mulAvx2GFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x8Xor(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvx2GFNI_3x8Xor_loop: + // Load 8 outputs + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), 
Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x8Xor_end: + RET + // func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 @@ -14488,6 +19734,163 @@ mulGFNI_3x9_64_loop: mulGFNI_3x9_64_end: RET +// func mulAvx2GFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x9(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), 
R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvx2GFNI_3x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x9_loop + VZEROUPPER + +mulAvx2GFNI_3x9_end: + RET + // func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 @@ -14659,6 +20062,183 @@ mulGFNI_3x9_64Xor_loop: mulGFNI_3x9_64Xor_end: RET +// func mulAvx2GFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI 
+TEXT ·mulAvx2GFNI_3x9Xor(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvx2GFNI_3x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x9Xor_end: + RET + // func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 @@ -15314,6 +20894,179 @@ mulGFNI_3x10_64_loop: mulGFNI_3x10_64_end: RET +// func mulAvx2GFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x10(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_3x10_loop + VZEROUPPER + +mulAvx2GFNI_3x10_end: + RET + // func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 @@ -15499,6 +21252,201 @@ mulGFNI_3x10_64Xor_loop: mulGFNI_3x10_64Xor_end: RET +// func mulAvx2GFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x10Xor(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_3x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_3x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x10Xor_end: + RET + // func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 @@ -16047,6 +21995,76 @@ mulGFNI_4x1_64_loop: mulGFNI_4x1_64_end: RET +// func mulAvx2GFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x1_end + VBROADCASTSD (CX), Y0 + 
VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvx2GFNI_4x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x1_loop + VZEROUPPER + +mulAvx2GFNI_4x1_end: + RET + // func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 @@ -16121,6 +22139,80 @@ mulGFNI_4x1_64Xor_loop: mulGFNI_4x1_64Xor_end: RET +// func mulAvx2GFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvx2GFNI_4x1Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y4 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x1Xor_end: + RET + // func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1Xor(SB), NOSPLIT, $0-88 @@ -16704,6 +22796,91 @@ mulGFNI_4x2_64_loop: mulGFNI_4x2_64_end: RET +// func mulAvx2GFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers 
estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvx2GFNI_4x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x2_loop + VZEROUPPER + +mulAvx2GFNI_4x2_end: + RET + // func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 @@ -16795,6 +22972,97 @@ mulGFNI_4x2_64Xor_loop: mulGFNI_4x2_64Xor_end: RET +// func mulAvx2GFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvx2GFNI_4x2Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y8 + VMOVDQU (DI), Y9 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + 
VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x2Xor_end: + RET + // func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2Xor(SB), NOSPLIT, $0-88 @@ -17518,6 +23786,106 @@ mulGFNI_4x3_64_loop: mulGFNI_4x3_64_end: RET +// func mulAvx2GFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x3(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvx2GFNI_4x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x3_loop + VZEROUPPER + +mulAvx2GFNI_4x3_end: + RET + // func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 @@ -17626,6 +23994,114 @@ mulGFNI_4x3_64Xor_loop: mulGFNI_4x3_64Xor_end: RET +// func mulAvx2GFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x3Xor(SB), $0-88 + // Loading 11 of 12 
tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvx2GFNI_4x3Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x3Xor_end: + RET + // func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 @@ -18255,6 +24731,121 @@ mulGFNI_4x4_64_loop: mulGFNI_4x4_64_end: RET +// func mulAvx2GFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x4(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ 
R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvx2GFNI_4x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x4_loop + VZEROUPPER + +mulAvx2GFNI_4x4_end: + RET + // func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 @@ -18380,6 +24971,131 @@ mulGFNI_4x4_64Xor_loop: mulGFNI_4x4_64Xor_end: RET +// func mulAvx2GFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x4Xor(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvx2GFNI_4x4Xor_loop: + // Load 4 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x4Xor_end: + RET + // func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 @@ -18868,6 +25584,136 @@ mulGFNI_4x5_64_loop: mulGFNI_4x5_64_end: RET +// func mulAvx2GFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x5(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvx2GFNI_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x5_loop + VZEROUPPER + +mulAvx2GFNI_4x5_end: + RET + // func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 @@ -19010,6 +25856,148 @@ mulGFNI_4x5_64Xor_loop: mulGFNI_4x5_64Xor_end: RET +// func mulAvx2GFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x5Xor(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvx2GFNI_4x5Xor_loop: + // Load 5 outputs + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, 
Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x5Xor_end: + RET + // func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 @@ -19562,6 +26550,151 @@ mulGFNI_4x6_64_loop: mulGFNI_4x6_64_end: RET +// func mulAvx2GFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x6(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvx2GFNI_4x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x6_loop + VZEROUPPER + +mulAvx2GFNI_4x6_end: + RET + // func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 @@ -19721,6 +26854,165 @@ mulGFNI_4x6_64Xor_loop: mulGFNI_4x6_64Xor_end: RET +// func mulAvx2GFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x6Xor(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvx2GFNI_4x6Xor_loop: + // Load 6 outputs + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x6Xor_end: + RET + // func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 @@ -20332,6 +27624,166 @@ mulGFNI_4x7_64_loop: mulGFNI_4x7_64_end: RET +// func mulAvx2GFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x7(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ 
start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvx2GFNI_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x7_loop + VZEROUPPER + +mulAvx2GFNI_4x7_end: + RET + // func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 @@ -20503,6 +27955,182 @@ mulGFNI_4x7_64Xor_loop: mulGFNI_4x7_64Xor_end: RET +// func mulAvx2GFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x7Xor(SB), $0-88 + // Loading 7 of 28 
tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvx2GFNI_4x7Xor_loop: + // Load 7 outputs + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x7Xor_end: + RET + // func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 @@ -21173,6 +28801,181 @@ mulGFNI_4x8_64_loop: mulGFNI_4x8_64_end: RET +// func mulAvx2GFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x8(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvx2GFNI_4x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 
16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x8_loop + VZEROUPPER + +mulAvx2GFNI_4x8_end: + RET + // func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 @@ -21356,6 +29159,199 @@ mulGFNI_4x8_64Xor_loop: mulGFNI_4x8_64Xor_end: RET +// func mulAvx2GFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x8Xor(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvx2GFNI_4x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB 
$0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x8Xor_end: + RET + // func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 @@ -22091,6 +30087,200 @@ mulGFNI_4x9_64_loop: mulGFNI_4x9_64_end: RET +// func mulAvx2GFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x9(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP 
registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_4x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 
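+	// Only 5 of the 36 matrix tables are preloaded into Y registers here;
+	// the remaining tables are broadcast from the matrix with VBROADCASTSD
+	// immediately before each VGF2P8AFFINEQB that consumes them.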
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_4x9_loop + VZEROUPPER + +mulAvx2GFNI_4x9_end: + RET + // func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 @@ -22290,6 +30480,220 @@ mulGFNI_4x9_64Xor_loop: mulGFNI_4x9_64Xor_end: RET +// func mulAvx2GFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x9Xor(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_4x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 
8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_4x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x9Xor_end: + RET + // func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 @@ -23038,6 +31442,190 @@ mulGFNI_4x10_64_loop: mulGFNI_4x10_64_end: RET +// func mulAvx2GFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x10(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + 
SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvx2GFNI_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvx2GFNI_4x10_loop + VZEROUPPER + +mulAvx2GFNI_4x10_end: + RET + // func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 @@ -23234,6 +31822,222 @@ mulGFNI_4x10_64Xor_loop: mulGFNI_4x10_64Xor_end: RET +// func mulAvx2GFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x10Xor(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvx2GFNI_4x10Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y4 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y10 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y11 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y12 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvx2GFNI_4x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x10Xor_end: + RET + // func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 @@ -23872,6 +32676,85 @@ mulGFNI_5x1_64_loop: mulGFNI_5x1_64_end: RET +// func 
mulAvx2GFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvx2GFNI_5x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x1_loop + VZEROUPPER + +mulAvx2GFNI_5x1_end: + RET + // func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 @@ -23955,6 +32838,89 @@ mulGFNI_5x1_64Xor_loop: mulGFNI_5x1_64Xor_end: RET +// func mulAvx2GFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvx2GFNI_5x1Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y5 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and 
process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x1Xor_end: + RET + // func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1Xor(SB), NOSPLIT, $0-88 @@ -24633,6 +33599,103 @@ mulGFNI_5x2_64_loop: mulGFNI_5x2_64_end: RET +// func mulAvx2GFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvx2GFNI_5x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x2_loop + VZEROUPPER + +mulAvx2GFNI_5x2_end: + RET + // func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 @@ -24736,6 +33799,109 @@ mulGFNI_5x2_64Xor_loop: mulGFNI_5x2_64Xor_end: RET +// func mulAvx2GFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 
24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvx2GFNI_5x2Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R8), Y11 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x2Xor_end: + RET + // func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2Xor(SB), NOSPLIT, $0-88 @@ -25583,6 +34749,121 @@ mulGFNI_5x3_64_loop: mulGFNI_5x3_64_end: RET +// func mulAvx2GFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x3(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvx2GFNI_5x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes 
from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x3_loop + VZEROUPPER + +mulAvx2GFNI_5x3_end: + RET + // func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 @@ -25706,6 +34987,129 @@ mulGFNI_5x3_64Xor_loop: mulGFNI_5x3_64Xor_end: RET +// func mulAvx2GFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x3Xor(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvx2GFNI_5x3Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB 
$0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x3Xor_end: + RET + // func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 @@ -26443,6 +35847,139 @@ mulGFNI_5x4_64_loop: mulGFNI_5x4_64_end: RET +// func mulAvx2GFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x4(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvx2GFNI_5x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x4_loop + VZEROUPPER + +mulAvx2GFNI_5x4_end: + RET + // func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x4_64Xor(SB), $0-88 @@ -26586,6 +36123,149 @@ mulGFNI_5x4_64Xor_loop: mulGFNI_5x4_64Xor_end: RET +// func mulAvx2GFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x4Xor(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvx2GFNI_5x4Xor_loop: + // Load 4 outputs + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x4Xor_end: + RET + // func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88 @@ -27158,6 +36838,157 @@ mulGFNI_5x5_64_loop: mulGFNI_5x5_64_end: RET +// func mulAvx2GFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x5(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvx2GFNI_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x5_loop + VZEROUPPER + +mulAvx2GFNI_5x5_end: + RET + // func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 @@ -27321,6 +37152,169 @@ mulGFNI_5x5_64Xor_loop: mulGFNI_5x5_64Xor_end: RET +// func mulAvx2GFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x5Xor(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvx2GFNI_5x5Xor_loop: + // Load 5 outputs + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x5Xor_end: + RET + // func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 @@ -27964,6 +37958,175 @@ mulGFNI_5x6_64_loop: mulGFNI_5x6_64_end: RET +// func mulAvx2GFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x6(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add 
start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvx2GFNI_5x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x6_loop + VZEROUPPER + +mulAvx2GFNI_5x6_end: + RET + // func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 @@ -28141,6 +38304,189 @@ 
mulGFNI_5x6_64Xor_loop: mulGFNI_5x6_64Xor_end: RET +// func mulAvx2GFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x6Xor(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvx2GFNI_5x6Xor_loop: + // Load 6 outputs + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x6Xor_end: + RET + // func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 @@ -28855,6 +39201,193 @@ mulGFNI_5x7_64_loop: mulGFNI_5x7_64_end: RET +// func mulAvx2GFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x7(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvx2GFNI_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU 
(DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x7_loop + VZEROUPPER + +mulAvx2GFNI_5x7_end: + RET + // func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 @@ -29046,6 +39579,209 @@ mulGFNI_5x7_64Xor_loop: mulGFNI_5x7_64Xor_end: RET +// func mulAvx2GFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x7Xor(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + 
MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvx2GFNI_5x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x7Xor_end: + RET + // func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88 @@ -29837,6 +40573,215 @@ mulGFNI_5x8_64_loop: mulGFNI_5x8_64_end: RET +// func mulAvx2GFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x8(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_5x8_loop + VZEROUPPER + +mulAvx2GFNI_5x8_end: + RET + // func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 @@ -30046,6 +40991,233 @@ mulGFNI_5x8_64Xor_loop: mulGFNI_5x8_64Xor_end: RET +// func mulAvx2GFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x8Xor(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ 
(R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_5x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_5x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x8Xor_end: + RET + // func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88 @@ -30866,6 +42038,210 @@ mulGFNI_5x9_64_loop: mulGFNI_5x9_64_end: RET +// func mulAvx2GFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x9(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvx2GFNI_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvx2GFNI_5x9_loop + VZEROUPPER + +mulAvx2GFNI_5x9_end: + RET + // func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 @@ -31075,6 +42451,239 @@ mulGFNI_5x9_64Xor_loop: mulGFNI_5x9_64Xor_end: RET +// func mulAvx2GFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x9Xor(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvx2GFNI_5x9Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvx2GFNI_5x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x9Xor_end: + RET + // func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 @@ -31950,6 +43559,226 @@ mulGFNI_5x10_64_loop: mulGFNI_5x10_64_end: RET +// func mulAvx2GFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x10(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ 
out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvx2GFNI_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvx2GFNI_5x10_loop + VZEROUPPER + +mulAvx2GFNI_5x10_end: + RET + // func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 @@ -32172,6 +44001,258 @@ mulGFNI_5x10_64Xor_loop: mulGFNI_5x10_64Xor_end: RET +// func mulAvx2GFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x10Xor(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvx2GFNI_5x10Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y4 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + 
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + 
VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvx2GFNI_5x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x10Xor_end: + RET + // func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 @@ -32913,6 +44994,94 @@ mulGFNI_6x1_64_loop: mulGFNI_6x1_64_end: RET +// func mulAvx2GFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvx2GFNI_6x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD 
Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x1_loop + VZEROUPPER + +mulAvx2GFNI_6x1_end: + RET + // func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 @@ -33005,6 +45174,98 @@ mulGFNI_6x1_64Xor_loop: mulGFNI_6x1_64Xor_end: RET +// func mulAvx2GFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvx2GFNI_6x1Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y6 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x1Xor_end: + RET + // func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1Xor(SB), NOSPLIT, $0-88 @@ -33778,6 +46039,115 @@ mulGFNI_6x2_64_loop: mulGFNI_6x2_64_end: RET +// func mulAvx2GFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + 
MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvx2GFNI_6x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x2_loop + VZEROUPPER + +mulAvx2GFNI_6x2_end: + RET + // func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 @@ -33893,6 +46263,121 @@ mulGFNI_6x2_64Xor_loop: mulGFNI_6x2_64Xor_end: RET +// func mulAvx2GFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvx2GFNI_6x2Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + 
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x2Xor_end: + RET + // func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2Xor(SB), NOSPLIT, $0-88 @@ -34864,6 +47349,136 @@ mulGFNI_6x3_64_loop: mulGFNI_6x3_64_end: RET +// func mulAvx2GFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x3(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvx2GFNI_6x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, 
Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x3_loop + VZEROUPPER + +mulAvx2GFNI_6x3_end: + RET + // func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 @@ -35002,6 +47617,144 @@ mulGFNI_6x3_64Xor_loop: mulGFNI_6x3_64Xor_end: RET +// func mulAvx2GFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x3Xor(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvx2GFNI_6x3Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x3Xor_end: + RET + // func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 @@ -35847,6 +48600,157 @@ mulGFNI_6x4_64_loop: mulGFNI_6x4_64_end: RET +// func mulAvx2GFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x4(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvx2GFNI_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 
bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x4_loop + VZEROUPPER + +mulAvx2GFNI_6x4_end: + RET + // func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 @@ -36008,6 +48912,167 @@ mulGFNI_6x4_64Xor_loop: mulGFNI_6x4_64Xor_end: RET +// func mulAvx2GFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x4Xor(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvx2GFNI_6x4Xor_loop: + // Load 4 outputs + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ 
$0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x4Xor_end: + RET + // func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 @@ -36659,6 +49724,178 @@ mulGFNI_6x5_64_loop: mulGFNI_6x5_64_end: RET +// func mulAvx2GFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x5(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, 
R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvx2GFNI_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x5_loop + VZEROUPPER + +mulAvx2GFNI_6x5_end: + RET + // func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 @@ -36838,6 +50075,190 @@ mulGFNI_6x5_64Xor_loop: mulGFNI_6x5_64Xor_end: 
RET +// func mulAvx2GFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x5Xor(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvx2GFNI_6x5Xor_loop: + // Load 5 outputs + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, 
Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x5Xor_end: + RET + // func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 @@ -37572,6 +50993,199 @@ mulGFNI_6x6_64_loop: mulGFNI_6x6_64_end: RET +// func mulAvx2GFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x6(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvx2GFNI_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x6_loop + VZEROUPPER + +mulAvx2GFNI_6x6_end: + RET + // func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 @@ -37767,6 +51381,213 @@ mulGFNI_6x6_64Xor_loop: mulGFNI_6x6_64Xor_end: RET +// func mulAvx2GFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x6Xor(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 
72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvx2GFNI_6x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and 
process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x6Xor_end: + RET + // func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 @@ -38590,6 +52411,224 @@ mulGFNI_6x7_64_loop: mulGFNI_6x7_64_end: RET +// func mulAvx2GFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x7(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 
bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_6x7_loop + VZEROUPPER + +mulAvx2GFNI_6x7_end: + RET + // func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 @@ -38805,6 +52844,240 @@ mulGFNI_6x7_64Xor_loop: mulGFNI_6x7_64Xor_end: RET +// func mulAvx2GFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI 
+TEXT ·mulAvx2GFNI_6x7Xor(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_6x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_6x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x7Xor_end: + RET + // func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 @@ -39673,6 +53946,224 @@ mulGFNI_6x8_64_loop: mulGFNI_6x8_64_end: RET +// func mulAvx2GFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x8(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, 
Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 
+ VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x8_loop + VZEROUPPER + +mulAvx2GFNI_6x8_end: + RET + // func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 @@ -39891,6 +54382,250 @@ mulGFNI_6x8_64Xor_loop: mulGFNI_6x8_64Xor_end: RET +// func mulAvx2GFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x8Xor(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x8Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x8Xor_end: + RET + // func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 @@ -40827,6 +55562,243 @@ mulGFNI_6x9_64_loop: mulGFNI_6x9_64_end: RET +// func mulAvx2GFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x9(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 
16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 
192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x9_loop + VZEROUPPER + +mulAvx2GFNI_6x9_end: + RET + // func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 @@ -41060,6 +56032,272 @@ mulGFNI_6x9_64Xor_loop: mulGFNI_6x9_64Xor_end: RET +// func mulAvx2GFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x9Xor(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x9Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 
120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x9Xor_end: + RET + // func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 @@ -42074,6 +57312,262 @@ mulGFNI_6x10_64_loop: mulGFNI_6x10_64_end: RET +// func mulAvx2GFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x10(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 
+ VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x10_loop + VZEROUPPER + +mulAvx2GFNI_6x10_end: + RET + // func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 @@ -42322,6 +57816,294 @@ mulGFNI_6x10_64Xor_loop: mulGFNI_6x10_64Xor_end: RET +// func mulAvx2GFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x10Xor(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x10Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y4 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, 
Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x10Xor_end: + RET + // func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 @@ -43166,6 +58948,103 @@ mulGFNI_7x1_64_loop: mulGFNI_7x1_64_end: RET +// func mulAvx2GFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvx2GFNI_7x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x1_loop + 
VZEROUPPER + +mulAvx2GFNI_7x1_end: + RET + // func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 @@ -43267,6 +59146,107 @@ mulGFNI_7x1_64Xor_loop: mulGFNI_7x1_64Xor_end: RET +// func mulAvx2GFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvx2GFNI_7x1Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y7 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x1Xor_end: + RET + // func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88 @@ -44135,6 +60115,127 @@ mulGFNI_7x2_64_loop: mulGFNI_7x2_64_end: RET +// func mulAvx2GFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x2(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + 
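+ // in holds Go slice headers (pointer, len, cap = 24 bytes each), so each 24-byte step below selects the next input shard's data pointer.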
MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvx2GFNI_7x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x2_loop + VZEROUPPER + +mulAvx2GFNI_7x2_end: + RET + // func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x2_64Xor(SB), $0-88 @@ -44262,6 +60363,133 @@ mulGFNI_7x2_64Xor_loop: mulGFNI_7x2_64Xor_end: RET +// func mulAvx2GFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x2Xor(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ 
R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvx2GFNI_7x2Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x2Xor_end: + RET + // func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 @@ -45357,6 +61585,151 @@ mulGFNI_7x3_64_loop: mulGFNI_7x3_64_end: RET +// func mulAvx2GFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x3(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvx2GFNI_7x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + 
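+ // With immediate 0x00, each VGF2P8AFFINEQB multiplies all 32 bytes of Y14 by one GF(2^8) coefficient, encoded as a broadcast 8x8 bit matrix.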
VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x3_loop + VZEROUPPER + +mulAvx2GFNI_7x3_end: + RET + // func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 @@ -45510,6 +61883,159 @@ mulGFNI_7x3_64Xor_loop: mulGFNI_7x3_64Xor_end: RET +// func mulAvx2GFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x3Xor(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX 
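+ // start is a shared byte offset into every shard, so the same value is added to each input pointer here, as it was to the output pointers above.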
+ ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvx2GFNI_7x3Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x3Xor_end: + RET + // func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 @@ -46461,6 +62987,175 @@ mulGFNI_7x4_64_loop: mulGFNI_7x4_64_end: RET +// func mulAvx2GFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x4(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + 
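+ // Only the first 10 of the 28 coefficient tables fit in YMM registers; the remaining tables are broadcast from the matrix inside the loop as they are needed.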
MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvx2GFNI_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX 
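+ // AX was initialized to n>>5, the number of 32-byte blocks; the loop repeats until it reaches zero.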
+ JNZ mulAvx2GFNI_7x4_loop + VZEROUPPER + +mulAvx2GFNI_7x4_end: + RET + // func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 @@ -46638,6 +63333,185 @@ mulGFNI_7x4_64Xor_loop: mulGFNI_7x4_64Xor_end: RET +// func mulAvx2GFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x4Xor(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvx2GFNI_7x4Xor_loop: + // Load 4 outputs + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x4Xor_end: + RET + // func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 @@ -47368,6 +64242,199 @@ mulGFNI_7x5_64_loop: mulGFNI_7x5_64_end: RET +// func mulAvx2GFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x5(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvx2GFNI_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x5_loop + VZEROUPPER + +mulAvx2GFNI_7x5_end: + RET + // func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 @@ -47563,6 +64630,211 @@ mulGFNI_7x5_64Xor_loop: mulGFNI_7x5_64Xor_end: RET +// func mulAvx2GFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x5Xor(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 
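+ // Each VBROADCASTSD replicates one 64-bit coefficient table (an 8x8 bit matrix over GF(2)) into all four 64-bit lanes of a YMM register.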
+ VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvx2GFNI_7x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x5Xor_end: + RET + // func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 @@ -48394,6 +65666,227 @@ mulGFNI_7x6_64_loop: mulGFNI_7x6_64_end: RET +// func mulAvx2GFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x6(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and 
process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_7x6_loop + VZEROUPPER + +mulAvx2GFNI_7x6_end: + RET + // func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: 
AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 @@ -48611,6 +66104,241 @@ mulGFNI_7x6_64Xor_loop: mulGFNI_7x6_64Xor_end: RET +// func mulAvx2GFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x6Xor(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_7x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_7x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x6Xor_end: + RET + // func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 @@ -49503,6 +67231,232 @@ mulGFNI_7x7_64_loop: mulGFNI_7x7_64_end: RET +// func mulAvx2GFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x7(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add 
start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x7_loop + VZEROUPPER + +mulAvx2GFNI_7x7_end: + RET + // func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 @@ -49726,6 +67680,255 @@ mulGFNI_7x7_64Xor_loop: mulGFNI_7x7_64Xor_end: RET +// func mulAvx2GFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x7Xor(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x7Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x7Xor_end: + RET + // func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 @@ -50699,6 +68902,254 @@ mulGFNI_7x8_64_loop: mulGFNI_7x8_64_end: RET +// func mulAvx2GFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x8(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, 
Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x8_loop + VZEROUPPER + +mulAvx2GFNI_7x8_end: + RET + // func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 @@ -50939,6 +69390,280 @@ mulGFNI_7x8_64Xor_loop: mulGFNI_7x8_64Xor_end: RET +// func mulAvx2GFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x8Xor(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x8Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x8Xor_end: + RET + // func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 @@ -52002,6 +70727,276 @@ mulGFNI_7x9_64_loop: mulGFNI_7x9_64_end: RET +// func mulAvx2GFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x9(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x9_loop + VZEROUPPER + +mulAvx2GFNI_7x9_end: + RET + // func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 @@ -52259,6 +71254,305 @@ mulGFNI_7x9_64Xor_loop: mulGFNI_7x9_64Xor_end: RET +// func mulAvx2GFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x9Xor(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x9Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + 
VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, 
Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x9Xor_end: + RET + // func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 @@ -53412,6 +72706,298 @@ mulGFNI_7x10_64_loop: mulGFNI_7x10_64_end: RET +// func mulAvx2GFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x10(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX 
+ JZ mulAvx2GFNI_7x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + 
VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x10_loop + VZEROUPPER + +mulAvx2GFNI_7x10_end: + RET + // func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 @@ -53686,6 +73272,330 @@ mulGFNI_7x10_64Xor_loop: mulGFNI_7x10_64Xor_end: RET +// func mulAvx2GFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x10Xor(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x10Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y4 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x10Xor_end: + RET + // func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 @@ -54633,6 +74543,112 @@ mulGFNI_8x1_64_loop: mulGFNI_8x1_64_end: RET +// func mulAvx2GFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvx2GFNI_8x1_loop: 
+ // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x1_loop + VZEROUPPER + +mulAvx2GFNI_8x1_end: + RET + // func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 @@ -54743,6 +74759,116 @@ mulGFNI_8x1_64Xor_loop: mulGFNI_8x1_64Xor_end: RET +// func mulAvx2GFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvx2GFNI_8x1Xor_loop: + // Load 1 outputs + VMOVDQU (R11), Y8 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ 
$0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x1Xor_end: + RET + // func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1Xor(SB), NOSPLIT, $0-88 @@ -55706,6 +75832,139 @@ mulGFNI_8x2_64_loop: mulGFNI_8x2_64_end: RET +// func mulAvx2GFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x2(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvx2GFNI_8x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load 
and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x2_loop + VZEROUPPER + +mulAvx2GFNI_8x2_end: + RET + // func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 @@ -55845,6 +76104,145 @@ mulGFNI_8x2_64Xor_loop: mulGFNI_8x2_64Xor_end: RET +// func mulAvx2GFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x2Xor(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvx2GFNI_8x2Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x2Xor_end: + RET + // func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2Xor(SB), NOSPLIT, $0-88 @@ -57064,6 +77462,166 @@ mulGFNI_8x3_64_loop: mulGFNI_8x3_64_end: RET +// func mulAvx2GFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x3(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvx2GFNI_8x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x3_loop + VZEROUPPER + +mulAvx2GFNI_8x3_end: + RET + // func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 @@ -57232,6 +77790,174 @@ mulGFNI_8x3_64Xor_loop: mulGFNI_8x3_64Xor_end: RET +// func mulAvx2GFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x3Xor(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvx2GFNI_8x3Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x3Xor_end: + RET + // func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 @@ -58287,6 +79013,193 @@ mulGFNI_8x4_64_loop: mulGFNI_8x4_64_end: RET +// func mulAvx2GFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x4(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvx2GFNI_8x4_loop: + // Load and 
process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x4_loop + VZEROUPPER + +mulAvx2GFNI_8x4_end: + RET + // func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 @@ -58478,6 +79391,203 @@ mulGFNI_8x4_64Xor_loop: mulGFNI_8x4_64Xor_end: RET +// func mulAvx2GFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x4Xor(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvx2GFNI_8x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU 
(R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x4Xor_end: + RET + // func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 @@ -59293,6 +80403,224 @@ mulGFNI_8x5_64_loop: mulGFNI_8x5_64_end: RET +// func mulAvx2GFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x5(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD 
Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + 
VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_8x5_loop + VZEROUPPER + +mulAvx2GFNI_8x5_end: + RET + // func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 @@ -59508,6 +80836,236 @@ mulGFNI_8x5_64Xor_loop: mulGFNI_8x5_64Xor_end: RET +// func mulAvx2GFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x5Xor(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_8x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_8x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x5Xor_end: + RET + // func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 @@ -60400,6 +81958,234 @@ mulGFNI_8x6_64_loop: mulGFNI_8x6_64_end: RET +// func mulAvx2GFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x6(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + 
VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x6_loop + VZEROUPPER + +mulAvx2GFNI_8x6_end: + RET + // func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x6_64Xor(SB), $0-88 @@ -60624,6 +82410,254 @@ mulGFNI_8x6_64Xor_loop: mulGFNI_8x6_64Xor_end: RET +// func mulAvx2GFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x6Xor(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x6Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 6 
outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + 
VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x6Xor_end: + RET + // func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 @@ -61610,6 +83644,259 @@ mulGFNI_8x7_64_loop: mulGFNI_8x7_64_end: RET +// func mulAvx2GFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x7(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x7_loop + VZEROUPPER + +mulAvx2GFNI_8x7_end: + RET + // func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 @@ -61853,6 +84140,282 @@ mulGFNI_8x7_64Xor_loop: mulGFNI_8x7_64Xor_end: RET +// func mulAvx2GFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x7Xor(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x7Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // 
Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x7Xor_end: + RET + // func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 @@ -62941,6 +85504,284 @@ mulGFNI_8x8_64_loop: mulGFNI_8x8_64_end: RET +// func mulAvx2GFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x8(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and 
process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x8_loop + VZEROUPPER + +mulAvx2GFNI_8x8_end: + RET + // func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 @@ -63203,6 +86044,310 @@ mulGFNI_8x8_64Xor_loop: mulGFNI_8x8_64Xor_end: RET +// func mulAvx2GFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x8Xor(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset 
to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x8Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // 
Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU 
Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x8Xor_end: + RET + // func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 @@ -64393,6 +87538,309 @@ mulGFNI_8x9_64_loop: mulGFNI_8x9_64_end: RET +// func mulAvx2GFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x9(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, 
Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x9_loop + VZEROUPPER + +mulAvx2GFNI_8x9_end: + RET + // func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 @@ -64674,6 +88122,338 @@ mulGFNI_8x9_64Xor_loop: mulGFNI_8x9_64Xor_end: RET +// func mulAvx2GFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x9Xor(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x9Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x9Xor_end: + RET + // func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, 
AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 @@ -65966,6 +89746,334 @@ mulGFNI_8x10_64_loop: mulGFNI_8x10_64_end: RET +// func mulAvx2GFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x10(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x10_loop + VZEROUPPER + +mulAvx2GFNI_8x10_end: + RET + // func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 @@ -66266,6 +90374,366 @@ mulGFNI_8x10_64Xor_loop: mulGFNI_8x10_64Xor_end: RET +// func mulAvx2GFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x10Xor(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x10Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y4 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 
216(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + 
VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x10Xor_end: + RET + // func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 @@ -67316,6 +91784,121 @@ mulGFNI_9x1_64_loop: mulGFNI_9x1_64_end: RET +// func mulAvx2GFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvx2GFNI_9x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + 
VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x1_loop + VZEROUPPER + +mulAvx2GFNI_9x1_end: + RET + // func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 @@ -67435,6 +92018,125 @@ mulGFNI_9x1_64Xor_loop: mulGFNI_9x1_64Xor_end: RET +// func mulAvx2GFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvx2GFNI_9x1Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y9 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x1Xor_end: + RET + // func 
mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 @@ -68408,12 +93110,308 @@ TEXT ·mulGFNI_9x2_64(SB), $0-88 ADDQ R14, R11 ADDQ R14, CX -mulGFNI_9x2_64_loop: +mulGFNI_9x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64_loop + VZEROUPPER + +mulGFNI_9x2_64_end: + RET + +// func mulAvx2GFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x2(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ 
R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvx2GFNI_9x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x2_loop + VZEROUPPER + +mulAvx2GFNI_9x2_end: + RET + +// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ 
out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R13), Z18 + VMOVDQU64 (R12), Z19 + // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU64 (DX), Z20 ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z20, Z18 - VGF2P8AFFINEQB $0x00, Z1, Z20, Z19 + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z19, Z21, Z19 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU64 (BX), Z20 @@ -68487,161 +93485,161 @@ mulGFNI_9x2_64_loop: // Prepare for next loop DECQ AX - JNZ mulGFNI_9x2_64_loop + JNZ mulGFNI_9x2_64Xor_loop VZEROUPPER -mulGFNI_9x2_64_end: +mulGFNI_9x2_64Xor_end: RET -// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 - // Loading all tables to registers +// func mulAvx2GFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x2Xor(SB), $0-88 + // Loading 12 of 18 tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_9x2_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - VBROADCASTF32X2 80(CX), Z10 - VBROADCASTF32X2 88(CX), Z11 - VBROADCASTF32X2 96(CX), Z12 - VBROADCASTF32X2 104(CX), Z13 - VBROADCASTF32X2 112(CX), Z14 - VBROADCASTF32X2 120(CX), Z15 - VBROADCASTF32X2 128(CX), Z16 - VBROADCASTF32X2 136(CX), Z17 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), R9 - MOVQ 144(CX), R10 - MOVQ 168(CX), R11 - MOVQ 192(CX), CX - MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R12 - MOVQ start+72(FP), R14 + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 // Add start offset to output - ADDQ R14, R13 - ADDQ R14, R12 + ADDQ R15, R14 + ADDQ R15, R13 // Add start offset to input - ADDQ R14, DX - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, CX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX 
-mulGFNI_9x2_64Xor_loop: +mulAvx2GFNI_9x2Xor_loop: // Load 2 outputs - VMOVDQU64 (R13), Z18 - VMOVDQU64 (R12), Z19 + VMOVDQU (R14), Y12 + VMOVDQU (R13), Y13 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU64 (DX), Z20 - ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 - VXORPD Z19, Z21, Z19 + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU64 (BX), Z20 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 - VXORPD Z19, Z21, Z19 + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU64 (SI), Z20 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 - VXORPD Z19, Z21, Z19 + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU64 (DI), Z20 - ADDQ $0x40, DI - VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 - VXORPD Z19, Z21, Z19 + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU64 (R8), Z20 - ADDQ $0x40, R8 - VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 - VXORPD Z19, Z21, Z19 + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU64 (R9), Z20 - ADDQ $0x40, R9 - VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 - VXORPD Z19, Z21, Z19 + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Load and process 64 bytes from input 6 to 2 outputs - VMOVDQU64 (R10), Z20 - ADDQ $0x40, R10 - VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 - VXORPD Z19, Z21, Z19 + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Load and process 64 bytes from input 7 to 2 outputs - VMOVDQU64 (R11), Z20 - ADDQ $0x40, R11 - VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 - VXORPD Z19, Z21, Z19 + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + 
VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Load and process 64 bytes from input 8 to 2 outputs - VMOVDQU64 (CX), Z20 - ADDQ $0x40, CX - VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 - VXORPD Z19, Z21, Z19 + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU64 Z18, (R13) - ADDQ $0x40, R13 - VMOVDQU64 Z19, (R12) - ADDQ $0x40, R12 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 // Prepare for next loop DECQ AX - JNZ mulGFNI_9x2_64Xor_loop + JNZ mulAvx2GFNI_9x2Xor_loop VZEROUPPER -mulGFNI_9x2_64Xor_end: +mulAvx2GFNI_9x2Xor_end: RET // func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -69987,6 +94985,181 @@ mulGFNI_9x3_64_loop: mulGFNI_9x3_64_end: RET +// func mulAvx2GFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x3(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvx2GFNI_9x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ 
$0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x3_loop + VZEROUPPER + +mulAvx2GFNI_9x3_end: + RET + // func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 @@ -70170,6 +95343,189 @@ mulGFNI_9x3_64Xor_loop: mulGFNI_9x3_64Xor_end: RET +// func mulAvx2GFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x3Xor(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvx2GFNI_9x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU 
(R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x3Xor_end: + RET + // func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 @@ -71335,6 +96691,215 @@ mulGFNI_9x4_64_loop: mulGFNI_9x4_64_end: RET +// func mulAvx2GFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: 
AVX, GFNI +TEXT ·mulAvx2GFNI_9x4(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + 
// Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_9x4_loop + VZEROUPPER + +mulAvx2GFNI_9x4_end: + RET + // func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 @@ -71544,6 +97109,225 @@ mulGFNI_9x4_64Xor_loop: mulGFNI_9x4_64Xor_end: RET +// func mulAvx2GFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x4Xor(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_9x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 
outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_9x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x4Xor_end: + RET + // func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88 @@ -72412,6 +98196,230 @@ mulGFNI_9x5_64_loop: mulGFNI_9x5_64_end: RET +// func mulAvx2GFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x5(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x5_loop + VZEROUPPER + +mulAvx2GFNI_9x5_end: + RET + // func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x5_64Xor(SB), $0-88 @@ -72633,6 +98641,247 @@ mulGFNI_9x5_64Xor_loop: mulGFNI_9x5_64Xor_end: RET +// func mulAvx2GFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x5Xor(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + 
VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x5Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x5Xor_end: + RET + // func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 @@ -73608,6 +99857,258 @@ mulGFNI_9x6_64_loop: mulGFNI_9x6_64_end: RET +// func mulAvx2GFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x6(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, 
R12 + ADDQ R14, DX + +mulAvx2GFNI_9x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x6_loop + VZEROUPPER + +mulAvx2GFNI_9x6_end: + RET + // func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 @@ -73850,6 +100351,278 @@ mulGFNI_9x6_64Xor_loop: mulGFNI_9x6_64Xor_end: RET +// func mulAvx2GFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x6Xor(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x6Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y9 
+ MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x6Xor_end: + RET + // func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 @@ -74939,6 +101712,286 @@ mulGFNI_9x7_64_loop: mulGFNI_9x7_64_end: RET +// func mulAvx2GFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x7(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + 
ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x7_loop + VZEROUPPER + +mulAvx2GFNI_9x7_end: + RET + // func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 @@ -75202,6 +102255,309 @@ mulGFNI_9x7_64Xor_loop: mulGFNI_9x7_64Xor_end: RET +// func mulAvx2GFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x7Xor(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full 
registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x7Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x7Xor_end: + RET + // func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 @@ -76405,6 +103761,314 @@ mulGFNI_9x8_64_loop: mulGFNI_9x8_64_end: RET +// func mulAvx2GFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x8(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, 
Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x8_loop + VZEROUPPER + +mulAvx2GFNI_9x8_end: + RET + // func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 @@ -76689,6 +104353,340 @@ mulGFNI_9x8_64Xor_loop: mulGFNI_9x8_64Xor_end: RET +// func mulAvx2GFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x8Xor(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x8Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 72(R13), R15 + VMOVDQU 
(R15)(R14*1), Y9 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 
+ VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x8Xor_end: + RET + // func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 @@ -78006,6 +106004,342 @@ mulGFNI_9x9_64_loop: mulGFNI_9x9_64_end: RET +// func mulAvx2GFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x9(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, 
Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x9_loop + VZEROUPPER + +mulAvx2GFNI_9x9_end: + RET + // func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 @@ -78311,6 +106645,371 @@ mulGFNI_9x9_64Xor_loop: mulGFNI_9x9_64Xor_end: RET +// func mulAvx2GFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x9Xor(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + 
VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x9Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x9Xor_end: + RET + // func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 @@ -79742,6 +108441,370 @@ mulGFNI_9x10_64_loop: mulGFNI_9x10_64_end: RET +// func mulAvx2GFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x10(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + 
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x10_loop + VZEROUPPER + +mulAvx2GFNI_9x10_end: + RET + // func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 @@ -80068,6 +109131,402 @@ mulGFNI_9x10_64Xor_loop: mulGFNI_9x10_64Xor_end: RET +// func mulAvx2GFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x10Xor(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x10Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y4 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB 
$0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 
+ + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x10Xor_end: + RET + // func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 @@ -81221,6 +110680,130 @@ mulGFNI_10x1_64_loop: mulGFNI_10x1_64_end: RET +// func mulAvx2GFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvx2GFNI_10x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y10 + + // Load and process 32 bytes from input 
1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_10x1_loop + VZEROUPPER + +mulAvx2GFNI_10x1_end: + RET + // func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 @@ -81349,6 +110932,134 @@ mulGFNI_10x1_64Xor_loop: mulGFNI_10x1_64Xor_end: RET +// func mulAvx2GFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvx2GFNI_10x1Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y10 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 
3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_10x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x1Xor_end: + RET + // func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1Xor(SB), NOSPLIT, $0-88 @@ -82502,6 +112213,163 @@ mulGFNI_10x2_64_loop: mulGFNI_10x2_64_end: RET +// func mulAvx2GFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x2(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvx2GFNI_10x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, 
Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_10x2_loop + VZEROUPPER + +mulAvx2GFNI_10x2_end: + RET + // func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 @@ -82665,6 +112533,169 @@ mulGFNI_10x2_64Xor_loop: mulGFNI_10x2_64Xor_end: RET +// func mulAvx2GFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x2Xor(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvx2GFNI_10x2Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y12 + VMOVDQU (R14), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_10x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x2Xor_end: + RET + // func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2Xor(SB), NOSPLIT, $8-88 @@ -84139,6 +114170,200 @@ mulGFNI_10x3_64_loop: mulGFNI_10x3_64_end: RET +// func mulAvx2GFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x3(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 
192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_10x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_10x3_loop + VZEROUPPER + +mulAvx2GFNI_10x3_end: + RET + // func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 @@ -84338,6 +114563,208 @@ mulGFNI_10x3_64Xor_loop: mulGFNI_10x3_64Xor_end: RET +// func mulAvx2GFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x3Xor(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_10x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_10x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x3Xor_end: + RET + // func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 @@ -85589,6 +116016,220 @@ mulGFNI_10x4_64_loop: mulGFNI_10x4_64_end: RET +// func mulAvx2GFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x4(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + 
+mulAvx2GFNI_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x4_loop + VZEROUPPER + +mulAvx2GFNI_10x4_end: + RET + // func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 @@ -85803,6 +116444,234 @@ mulGFNI_10x4_64Xor_loop: mulGFNI_10x4_64Xor_end: RET +// func mulAvx2GFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x4Xor(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x4Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x4Xor_end: + RET + // func 
mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 @@ -86743,6 +117612,251 @@ mulGFNI_10x5_64_loop: mulGFNI_10x5_64_end: RET +// func mulAvx2GFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x5(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, 
Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x5_loop + VZEROUPPER + +mulAvx2GFNI_10x5_end: + RET + // func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 @@ -86980,6 +118094,268 @@ mulGFNI_10x5_64Xor_loop: mulGFNI_10x5_64Xor_end: RET +// func mulAvx2GFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x5Xor(SB), 
$8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x5Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x5Xor_end: + RET + // func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 @@ -88046,6 +119422,282 @@ mulGFNI_10x6_64_loop: mulGFNI_10x6_64_end: RET +// func mulAvx2GFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x6(SB), $8-88 + // 
Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ 
$0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x6_loop + VZEROUPPER + +mulAvx2GFNI_10x6_end: + RET + // func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 @@ -88306,6 +119958,302 @@ mulGFNI_10x6_64Xor_loop: mulGFNI_10x6_64Xor_end: RET +// func mulAvx2GFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x6Xor(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x6Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // 
Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x6Xor_end: + RET + // func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 @@ -89498,6 +121446,313 @@ mulGFNI_10x7_64_loop: mulGFNI_10x7_64_end: RET +// func mulAvx2GFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x7(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x7_loop + VZEROUPPER + +mulAvx2GFNI_10x7_end: + RET + // func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 @@ -89781,6 +122036,336 @@ mulGFNI_10x7_64Xor_loop: mulGFNI_10x7_64Xor_end: RET +// func mulAvx2GFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x7Xor(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + 
ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x7Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x7Xor_end: + RET + // func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 @@ -91099,6 +123684,344 @@ mulGFNI_10x8_64_loop: mulGFNI_10x8_64_end: RET +// func mulAvx2GFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x8(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 
16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x8_loop + VZEROUPPER + +mulAvx2GFNI_10x8_end: + RET + // func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 @@ -91405,6 +124328,370 @@ mulGFNI_10x8_64Xor_loop: mulGFNI_10x8_64Xor_end: RET +// func mulAvx2GFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x8Xor(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ 
n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x8Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x8Xor_end: + RET + // func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 @@ -92849,6 +126136,375 @@ mulGFNI_10x9_64_loop: mulGFNI_10x9_64_end: RET +// func mulAvx2GFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x9(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + 
+mulAvx2GFNI_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x9_loop + VZEROUPPER + +mulAvx2GFNI_10x9_end: + RET + // func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 @@ -93178,6 +126834,404 @@ mulGFNI_10x9_64Xor_loop: mulGFNI_10x9_64Xor_end: RET +// func mulAvx2GFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x9Xor(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x9Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), 
Y12 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 
+ VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x9Xor_end: + RET + // func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 @@ -94748,6 +128802,406 @@ mulGFNI_10x10_64_loop: mulGFNI_10x10_64_end: RET +// func mulAvx2GFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x10(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + 
VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x10_loop + VZEROUPPER + +mulAvx2GFNI_10x10_end: + RET + // func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 @@ -95100,6 +129554,438 @@ mulGFNI_10x10_64Xor_loop: mulGFNI_10x10_64Xor_end: RET +// func mulAvx2GFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x10Xor(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 
192(DX), R13
+	MOVQ         216(DX), DX
+	MOVQ         out_base+48(FP), R14
+	MOVQ         out_base+48(FP), R14
+	MOVQ         start+72(FP), R15
+
+	// Add start offset to input
+	ADDQ         R15, BX
+	ADDQ         R15, SI
+	ADDQ         R15, DI
+	ADDQ         R15, R8
+	ADDQ         R15, R9
+	ADDQ         R15, R10
+	ADDQ         R15, R11
+	ADDQ         R15, R12
+	ADDQ         R15, R13
+	ADDQ         R15, DX
+
+mulAvx2GFNI_10x10Xor_loop:
+	// Load 10 outputs
+	MOVQ    (R14), BP
+	VMOVDQU (BP)(R15*1), Y4
+	MOVQ    24(R14), BP
+	VMOVDQU (BP)(R15*1), Y5
+	MOVQ    48(R14), BP
+	VMOVDQU (BP)(R15*1), Y6
+	MOVQ    72(R14), BP
+	VMOVDQU (BP)(R15*1), Y7
+	MOVQ    96(R14), BP
+	VMOVDQU (BP)(R15*1), Y8
+	MOVQ    120(R14), BP
+	VMOVDQU (BP)(R15*1), Y9
+	MOVQ    144(R14), BP
+	VMOVDQU (BP)(R15*1), Y10
+	MOVQ    168(R14), BP
+	VMOVDQU (BP)(R15*1), Y11
+	MOVQ    192(R14), BP
+	VMOVDQU (BP)(R15*1), Y12
+	MOVQ    216(R14), BP
+	VMOVDQU (BP)(R15*1), Y13
+
+	// Load and process 32 bytes from input 0 to 10 outputs
+	VMOVDQU        (BX), Y14
+	ADDQ           $0x20, BX
+	VGF2P8AFFINEQB $0x00, Y0, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VGF2P8AFFINEQB $0x00, Y1, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VGF2P8AFFINEQB $0x00, Y2, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VGF2P8AFFINEQB $0x00, Y3, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   32(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   40(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   48(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   56(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   64(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   72(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 1 to 10 outputs
+	VMOVDQU        (SI), Y14
+	ADDQ           $0x20, SI
+	VBROADCASTSD   80(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VBROADCASTSD   88(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VBROADCASTSD   96(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VBROADCASTSD   104(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   112(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   120(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   128(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   136(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   144(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   152(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 2 to 10 outputs
+	VMOVDQU        (DI), Y14
+	ADDQ           $0x20, DI
+	VBROADCASTSD   160(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VBROADCASTSD   168(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VBROADCASTSD   176(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VBROADCASTSD   184(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   192(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   200(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   208(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   216(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   224(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   232(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 3 to 10 outputs
+	VMOVDQU        (R8), Y14
+	ADDQ           $0x20, R8
+	VBROADCASTSD   240(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VBROADCASTSD   248(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VBROADCASTSD   256(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VBROADCASTSD   264(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   272(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   280(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   288(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   296(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   304(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   312(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 4 to 10 outputs
+	VMOVDQU        (R9), Y14
+	ADDQ           $0x20, R9
+	VBROADCASTSD   320(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VBROADCASTSD   328(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VBROADCASTSD   336(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VBROADCASTSD   344(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   352(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   360(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   368(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   376(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   384(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   392(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 5 to 10 outputs
+	VMOVDQU        (R10), Y14
+	ADDQ           $0x20, R10
+	VBROADCASTSD   400(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VBROADCASTSD   408(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VBROADCASTSD   416(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VBROADCASTSD   424(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   432(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   440(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   448(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   456(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   464(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   472(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 6 to 10 outputs
+	VMOVDQU        (R11), Y14
+	ADDQ           $0x20, R11
+	VBROADCASTSD   480(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VBROADCASTSD   488(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VBROADCASTSD   496(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VBROADCASTSD   504(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   512(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   520(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   528(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   536(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   544(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   552(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 7 to 10 outputs
+	VMOVDQU        (R12), Y14
+	ADDQ           $0x20, R12
+	VBROADCASTSD   560(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VBROADCASTSD   568(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VBROADCASTSD   576(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VBROADCASTSD   584(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   592(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   600(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   608(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   616(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   624(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   632(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 8 to 10 outputs
+	VMOVDQU        (R13), Y14
+	ADDQ           $0x20, R13
+	VBROADCASTSD   640(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VBROADCASTSD   648(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VBROADCASTSD   656(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VBROADCASTSD   664(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   672(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   680(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   688(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   696(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   704(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   712(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 9 to 10 outputs
+	VMOVDQU        (DX), Y14
+	ADDQ           $0x20, DX
+	VBROADCASTSD   720(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y4, Y15, Y4
+	VBROADCASTSD   728(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y5, Y15, Y5
+	VBROADCASTSD   736(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y6, Y15, Y6
+	VBROADCASTSD   744(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y7, Y15, Y7
+	VBROADCASTSD   752(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y8, Y15, Y8
+	VBROADCASTSD   760(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y9, Y15, Y9
+	VBROADCASTSD   768(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y10, Y15, Y10
+	VBROADCASTSD   776(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y11, Y15, Y11
+	VBROADCASTSD   784(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y12, Y15, Y12
+	VBROADCASTSD   792(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD         Y13, Y15, Y13
+
+	// Store 10 outputs
+	MOVQ    (R14), BP
+	VMOVDQU Y4, (BP)(R15*1)
+	MOVQ    24(R14), BP
+	VMOVDQU Y5, (BP)(R15*1)
+	MOVQ    48(R14), BP
+	VMOVDQU Y6, (BP)(R15*1)
+	MOVQ    72(R14), BP
+	VMOVDQU Y7, (BP)(R15*1)
+	MOVQ    96(R14), BP
+	VMOVDQU Y8, (BP)(R15*1)
+	MOVQ    120(R14), BP
+	VMOVDQU Y9, (BP)(R15*1)
+	MOVQ    144(R14), BP
+	VMOVDQU Y10,
(BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x10Xor_end: + RET + // func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 diff --git a/galois_gen_nopshufb_amd64.go b/galois_gen_nopshufb_amd64.go index b07f3f34..84926cf5 100644 --- a/galois_gen_nopshufb_amd64.go +++ b/galois_gen_nopshufb_amd64.go @@ -21,1100 +21,2200 @@ func avx2XorSlice_64(in []byte, out []byte) //go:noescape func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x1 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. // //go:noescape func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x1Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x2_64 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x2 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. // //go:noescape func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x2Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x3_64 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x3 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. // //go:noescape func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x3Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x4_64 takes 1 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. // //go:noescape func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x4Xor takes 1 inputs and produces 4 outputs. 
+// +//go:noescape +func mulAvx2GFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x5_64 takes 1 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. // //go:noescape func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x5Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x6_64 takes 1 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. // //go:noescape func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x6Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x7_64 takes 1 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. // //go:noescape func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x7Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x8_64 takes 1 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. // //go:noescape func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x8Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x9_64 takes 1 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x9 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. 
// //go:noescape func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x9Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x10_64 takes 1 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x10 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. // //go:noescape func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_1x10Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x1_64 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x1 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. // //go:noescape func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x1Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x2_64 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x2 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. // //go:noescape func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x2Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x3_64 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x3 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. // //go:noescape func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x3Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x4_64 takes 2 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. // //go:noescape func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x4Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x5_64 takes 2 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. // //go:noescape func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x5Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x6_64 takes 2 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. // //go:noescape func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x6Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x7_64 takes 2 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. // //go:noescape func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x7Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x8_64 takes 2 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. // //go:noescape func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x8Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x9_64 takes 2 inputs and produces 9 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x9 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. // //go:noescape func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x9Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x10_64 takes 2 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x10 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. // //go:noescape func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_2x10Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x1_64 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x1 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. // //go:noescape func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x1Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x2_64 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x2 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. // //go:noescape func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x2Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x3_64 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x3 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. // //go:noescape func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x3Xor takes 3 inputs and produces 3 outputs. 
+// +//go:noescape +func mulAvx2GFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x4_64 takes 3 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. // //go:noescape func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x4Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x5_64 takes 3 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. // //go:noescape func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x5Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x6_64 takes 3 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. // //go:noescape func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x6Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x7_64 takes 3 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. // //go:noescape func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x7Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x8_64 takes 3 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. 
// //go:noescape func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x8Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x9_64 takes 3 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x9 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. // //go:noescape func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x9Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x10_64 takes 3 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x10 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. // //go:noescape func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_3x10Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x1_64 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x1 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. // //go:noescape func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x1Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x2_64 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x2 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. // //go:noescape func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x2Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x3_64 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x3 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. // //go:noescape func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x3Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x4_64 takes 4 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. // //go:noescape func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x4Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x5_64 takes 4 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. // //go:noescape func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x5Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x6_64 takes 4 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. // //go:noescape func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x6Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x7_64 takes 4 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. // //go:noescape func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x8_64 takes 4 inputs and produces 8 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. // //go:noescape func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x8Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x9_64 takes 4 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x9 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. // //go:noescape func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x9Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x10_64 takes 4 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x10 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. // //go:noescape func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_4x10Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x1_64 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x1 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. // //go:noescape func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x1Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x2_64 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x2 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. // //go:noescape func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x2Xor takes 5 inputs and produces 2 outputs. 
+// +//go:noescape +func mulAvx2GFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x3_64 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x3 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. // //go:noescape func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x3Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x4_64 takes 5 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. // //go:noescape func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x4Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x5_64 takes 5 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. // //go:noescape func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x5Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x6_64 takes 5 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. // //go:noescape func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x6Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x7_64 takes 5 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. 
// //go:noescape func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x7Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x8_64 takes 5 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. // //go:noescape func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x8Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x9_64 takes 5 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x9 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. // //go:noescape func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x9Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x10_64 takes 5 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x10 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. // //go:noescape func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_5x10Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x1_64 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x1 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. // //go:noescape func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x1Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x2_64 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x2 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. // //go:noescape func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x2Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x3_64 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x3 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. // //go:noescape func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x3Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x4_64 takes 6 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x4 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. // //go:noescape func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x4Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x5_64 takes 6 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. // //go:noescape func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x5Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x6_64 takes 6 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. // //go:noescape func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x6Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x7_64 takes 6 inputs and produces 7 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. // //go:noescape func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x7Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x8_64 takes 6 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. // //go:noescape func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x8Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x9_64 takes 6 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x9 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. // //go:noescape func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x9Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x10_64 takes 6 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x10 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. // //go:noescape func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_6x10Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x1_64 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x1 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. // //go:noescape func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x1Xor takes 7 inputs and produces 1 outputs. 
+// +//go:noescape +func mulAvx2GFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x2_64 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x2 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. // //go:noescape func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x2Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x3_64 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x3 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. // //go:noescape func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x3Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x4_64 takes 7 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape -func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvx2GFNI_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. // //go:noescape func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x4Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x5_64 takes 7 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. // //go:noescape func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x5Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x6_64 takes 7 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. 
// //go:noescape func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x6Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x7_64 takes 7 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. // //go:noescape func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x7Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x8_64 takes 7 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. // //go:noescape func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x8Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x9_64 takes 7 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x9 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. // //go:noescape func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x9Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x10_64 takes 7 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x10 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. // //go:noescape func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_7x10Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x1_64 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x1 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. // //go:noescape func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x1Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x2_64 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x2 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. // //go:noescape func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x2Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x3_64 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x3 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. // //go:noescape func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x3Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x4_64 takes 8 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. // //go:noescape func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x4Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x5_64 takes 8 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. // //go:noescape func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x5Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x6_64 takes 8 inputs and produces 6 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. // //go:noescape func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x6Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x7_64 takes 8 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. // //go:noescape func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x7Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x8_64 takes 8 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. // //go:noescape func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x8Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x9_64 takes 8 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x9 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. // //go:noescape func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x9Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x10_64 takes 8 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x10 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. // //go:noescape func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_8x10Xor takes 8 inputs and produces 10 outputs. 
+// +//go:noescape +func mulAvx2GFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x1_64 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x1 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. // //go:noescape func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x1Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x2_64 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x2 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. // //go:noescape func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x2Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x3_64 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x3 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. // //go:noescape func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x3Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x4_64 takes 9 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. // //go:noescape func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x4Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x5_64 takes 9 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. 
// //go:noescape func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x5Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x6_64 takes 9 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. // //go:noescape func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x6Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x7_64 takes 9 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. // //go:noescape func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x7Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x8_64 takes 9 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. // //go:noescape func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x8Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x9_64 takes 9 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x9 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. // //go:noescape func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x9Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulAvx2GFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x10_64 takes 9 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x10 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvx2GFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. // //go:noescape func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_9x10Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x1_64 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x1 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. // //go:noescape func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x1Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulAvx2GFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x2_64 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x2 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. // //go:noescape func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x2Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulAvx2GFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x3_64 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x3 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. // //go:noescape func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x3Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulAvx2GFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x4_64 takes 10 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. // //go:noescape func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x4Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulAvx2GFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x5_64 takes 10 inputs and produces 5 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. // //go:noescape func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x5Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulAvx2GFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x6_64 takes 10 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. // //go:noescape func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x6Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulAvx2GFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x7_64 takes 10 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. // //go:noescape func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x7Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulAvx2GFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x8_64 takes 10 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. // //go:noescape func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x8Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulAvx2GFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x9_64 takes 10 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x9 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. // //go:noescape func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x9Xor takes 10 inputs and produces 9 outputs. 
+// +//go:noescape +func mulAvx2GFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x10_64 takes 10 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x10 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvx2GFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. // //go:noescape func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvx2GFNI_10x10Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulAvx2GFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + //go:noescape func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) diff --git a/galois_gen_nopshufb_amd64.s b/galois_gen_nopshufb_amd64.s index 574dfe9b..34d23df3 100644 --- a/galois_gen_nopshufb_amd64.s +++ b/galois_gen_nopshufb_amd64.s @@ -153,6 +153,49 @@ mulGFNI_1x1_64_loop: mulGFNI_1x1_64_end: RET +// func mulAvx2GFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x1_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvx2GFNI_1x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y1 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x1_loop + VZEROUPPER + +mulAvx2GFNI_1x1_end: + RET + // func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 @@ -200,6 +243,53 @@ mulGFNI_1x1_64Xor_loop: mulGFNI_1x1_64Xor_end: RET +// func mulAvx2GFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x1Xor_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvx2GFNI_1x1Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y1 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y2 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y2, Y2 + VXORPD Y1, Y2, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x1Xor_end: + RET + // func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x2_64(SB), $0-88 
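The mulAvx2GFNI_1x1 and mulAvx2GFNI_1x1Xor hunks above show the shape shared by all of the new AVX2+GFNI kernels: n is cut down to whole 32-byte blocks (SHRQ $0x05), each matrix coefficient is broadcast as a packed 64-bit affine matrix (VBROADCASTSD), and VGF2P8AFFINEQB multiplies a YMM register of input bytes by that coefficient, with the Xor variants folding the result into the previously stored output via VXORPD. The sketch below is a rough scalar model of one such 32-byte iteration, not code from this repository: gfMul and mulBlockRef are illustrative names, and it assumes the package's usual GF(2^8) field with the 0x11D reduction polynomial.

// Illustrative scalar model of one loop iteration of mulAvx2GFNI_1x1 /
// mulAvx2GFNI_1x1Xor. The real kernels encode the coefficient as an 8x8 bit
// matrix in a uint64 so VGF2P8AFFINEQB can apply it to 32 bytes at once.
package main

import "fmt"

// gfMul multiplies two GF(2^8) elements, reducing by 0x1D when the
// intermediate product overflows past bit 7 (i.e. polynomial 0x11D).
func gfMul(a, b byte) byte {
	var p byte
	for b > 0 {
		if b&1 != 0 {
			p ^= a
		}
		carry := a & 0x80
		a <<= 1
		if carry != 0 {
			a ^= 0x1D
		}
		b >>= 1
	}
	return p
}

// mulBlockRef mirrors one 32-byte block: the non-Xor kernel overwrites the
// output, the Xor kernel accumulates into whatever is already stored there.
func mulBlockRef(c byte, in, out []byte, xor bool) {
	for i := 0; i < 32; i++ {
		v := gfMul(c, in[i])
		if xor {
			out[i] ^= v
		} else {
			out[i] = v
		}
	}
}

func main() {
	in := make([]byte, 32)
	out := make([]byte, 32)
	for i := range in {
		in[i] = byte(i + 1)
	}
	mulBlockRef(0x1D, in, out, false)
	fmt.Printf("out[0:4] = % x\n", out[:4])
}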
@@ -249,6 +339,55 @@ mulGFNI_1x2_64_loop: mulGFNI_1x2_64_end: RET +// func mulAvx2GFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvx2GFNI_1x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x2_loop + VZEROUPPER + +mulAvx2GFNI_1x2_end: + RET + // func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 @@ -304,6 +443,61 @@ mulGFNI_1x2_64Xor_loop: mulGFNI_1x2_64Xor_end: RET +// func mulAvx2GFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvx2GFNI_1x2Xor_loop: + // Load 2 outputs + VMOVDQU (BX), Y2 + VMOVDQU (DX), Y3 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y5 + VXORPD Y2, Y5, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y4, Y5 + VXORPD Y3, Y5, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x2Xor_end: + RET + // func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x3_64(SB), $0-88 @@ -359,6 +553,61 @@ mulGFNI_1x3_64_loop: mulGFNI_1x3_64_end: RET +// func mulAvx2GFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvx2GFNI_1x3_loop: 
+ // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y5, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x3_loop + VZEROUPPER + +mulAvx2GFNI_1x3_end: + RET + // func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 @@ -422,6 +671,69 @@ mulGFNI_1x3_64Xor_loop: mulGFNI_1x3_64Xor_end: RET +// func mulAvx2GFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvx2GFNI_1x3Xor_loop: + // Load 3 outputs + VMOVDQU (BX), Y3 + VMOVDQU (SI), Y4 + VMOVDQU (DX), Y5 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y3, Y7, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x3Xor_end: + RET + // func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x4_64(SB), $0-88 @@ -483,6 +795,67 @@ mulGFNI_1x4_64_loop: mulGFNI_1x4_64_end: RET +// func mulAvx2GFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvx2GFNI_1x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y7, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y7, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x4_loop + 
VZEROUPPER + +mulAvx2GFNI_1x4_end: + RET + // func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 @@ -554,6 +927,77 @@ mulGFNI_1x4_64Xor_loop: mulGFNI_1x4_64Xor_end: RET +// func mulAvx2GFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvx2GFNI_1x4Xor_loop: + // Load 4 outputs + VMOVDQU (BX), Y4 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (DX), Y7 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y4, Y9, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y5, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x4Xor_end: + RET + // func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x5_64(SB), $0-88 @@ -621,6 +1065,73 @@ mulGFNI_1x5_64_loop: mulGFNI_1x5_64_end: RET +// func mulAvx2GFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x5(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvx2GFNI_1x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y9, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x5_loop + VZEROUPPER + +mulAvx2GFNI_1x5_end: + RET + // func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 @@ -700,6 +1211,85 @@ mulGFNI_1x5_64Xor_loop: mulGFNI_1x5_64Xor_end: RET +// func mulAvx2GFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x5Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvx2GFNI_1x5Xor_loop: + // Load 5 outputs + VMOVDQU (BX), Y5 + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (DX), Y9 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y5, Y11, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y6, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y7, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x5Xor_end: + RET + // func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x6_64(SB), $0-88 @@ -773,6 +1363,79 @@ mulGFNI_1x6_64_loop: mulGFNI_1x6_64_end: RET +// func mulAvx2GFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x6(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvx2GFNI_1x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y11, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU 
Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x6_loop + VZEROUPPER + +mulAvx2GFNI_1x6_end: + RET + // func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 @@ -860,6 +1523,93 @@ mulGFNI_1x6_64Xor_loop: mulGFNI_1x6_64Xor_end: RET +// func mulAvx2GFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x6Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvx2GFNI_1x6Xor_loop: + // Load 6 outputs + VMOVDQU (BX), Y6 + VMOVDQU (SI), Y7 + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (DX), Y11 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y6, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y7, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x6Xor_end: + RET + // func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x7_64(SB), $0-88 @@ -939,6 +1689,85 @@ mulGFNI_1x7_64_loop: mulGFNI_1x7_64_end: RET +// func mulAvx2GFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x7(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvx2GFNI_1x7_loop: + // 
Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y13, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x7_loop + VZEROUPPER + +mulAvx2GFNI_1x7_end: + RET + // func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 @@ -1034,6 +1863,101 @@ mulGFNI_1x7_64Xor_loop: mulGFNI_1x7_64Xor_end: RET +// func mulAvx2GFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x7Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvx2GFNI_1x7Xor_loop: + // Load 7 outputs + VMOVDQU (BX), Y7 + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DX), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x7Xor_end: + RET + // func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x8_64(SB), $0-88 @@ -1119,6 +2043,91 @@ mulGFNI_1x8_64_loop: mulGFNI_1x8_64_end: RET +// func mulAvx2GFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x8(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used 
+ MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvx2GFNI_1x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD (CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x8_loop + VZEROUPPER + +mulAvx2GFNI_1x8_end: + RET + // func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 @@ -1222,6 +2231,109 @@ mulGFNI_1x8_64Xor_loop: mulGFNI_1x8_64Xor_end: RET +// func mulAvx2GFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x8Xor(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvx2GFNI_1x8Xor_loop: + // Load 8 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x8Xor_end: + RET + // func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x9_64(SB), $0-88 @@ -1313,6 +2425,97 @@ mulGFNI_1x9_64_loop: mulGFNI_1x9_64_end: RET +// func mulAvx2GFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x9(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvx2GFNI_1x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD (CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x9_loop + VZEROUPPER + +mulAvx2GFNI_1x9_end: + RET + // func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 @@ -1424,45 +2627,38 @@ mulGFNI_1x9_64Xor_loop: mulGFNI_1x9_64Xor_end: RET -// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_1x10_64(SB), $0-88 - // Loading all tables to registers +// func mulAvx2GFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x9Xor(SB), $0-88 + // Loading 5 of 9 tables to registers // Destination kept in GP registers - // Full registers estimated 22 YMM used - 
MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_1x10_64_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ start+72(FP), R14 + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 // Add start offset to output - ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 @@ -1471,59 +2667,80 @@ TEXT ·mulGFNI_1x10_64(SB), $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 - ADDQ R14, DX + ADDQ R14, BX // Add start offset to input - ADDQ R14, CX + ADDQ R14, DX -mulGFNI_1x10_64_loop: - // Load and process 64 bytes from input 0 to 10 outputs - VMOVDQU64 (CX), Z19 - ADDQ $0x40, CX - VGF2P8AFFINEQB $0x00, Z0, Z19, Z10 - VGF2P8AFFINEQB $0x00, Z1, Z19, Z11 - VGF2P8AFFINEQB $0x00, Z2, Z19, Z12 - VGF2P8AFFINEQB $0x00, Z3, Z19, Z13 - VGF2P8AFFINEQB $0x00, Z4, Z19, Z14 - VGF2P8AFFINEQB $0x00, Z5, Z19, Z15 - VGF2P8AFFINEQB $0x00, Z6, Z19, Z16 - VGF2P8AFFINEQB $0x00, Z7, Z19, Z17 - VGF2P8AFFINEQB $0x00, Z8, Z19, Z18 - VGF2P8AFFINEQB $0x00, Z9, Z19, Z19 +mulAvx2GFNI_1x9Xor_loop: + // Load 9 outputs + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Store 10 outputs - VMOVDQU64 Z10, (BX) - ADDQ $0x40, BX - VMOVDQU64 Z11, (SI) - ADDQ $0x40, SI - VMOVDQU64 Z12, (DI) - ADDQ $0x40, DI - VMOVDQU64 Z13, (R8) - ADDQ $0x40, R8 - VMOVDQU64 Z14, (R9) - ADDQ $0x40, R9 - VMOVDQU64 Z15, (R10) - ADDQ $0x40, R10 - VMOVDQU64 Z16, (R11) - ADDQ $0x40, R11 - VMOVDQU64 Z17, (R12) - ADDQ $0x40, R12 - VMOVDQU64 Z18, (R13) - ADDQ $0x40, R13 - VMOVDQU64 Z19, (DX) - ADDQ $0x40, DX + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + 
ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX // Prepare for next loop DECQ AX - JNZ mulGFNI_1x10_64_loop + JNZ mulAvx2GFNI_1x9Xor_loop VZEROUPPER -mulGFNI_1x10_64_end: +mulAvx2GFNI_1x9Xor_end: RET -// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 +TEXT ·mulGFNI_1x10_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used @@ -1531,7 +2748,7 @@ TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX - JZ mulGFNI_1x10_64Xor_end + JZ mulGFNI_1x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 @@ -1573,42 +2790,236 @@ TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 // Add start offset to input ADDQ R14, CX -mulGFNI_1x10_64Xor_loop: - // Load 10 outputs - VMOVDQU64 (BX), Z10 - VMOVDQU64 (SI), Z11 - VMOVDQU64 (DI), Z12 - VMOVDQU64 (R8), Z13 - VMOVDQU64 (R9), Z14 - VMOVDQU64 (R10), Z15 - VMOVDQU64 (R11), Z16 - VMOVDQU64 (R12), Z17 - VMOVDQU64 (R13), Z18 - VMOVDQU64 (DX), Z19 - +mulGFNI_1x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs - VMOVDQU64 (CX), Z20 + VMOVDQU64 (CX), Z19 ADDQ $0x40, CX - VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 - VXORPD Z10, Z21, Z10 - VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 - VXORPD Z11, Z21, Z11 - VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 - VXORPD Z12, Z21, Z12 - VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 - VXORPD Z13, Z21, Z13 - VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 - VXORPD Z14, Z21, Z14 - VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 - VXORPD Z15, Z21, Z15 - VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 - VXORPD Z16, Z21, Z16 - VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 - VXORPD Z17, Z21, Z17 - VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 - VXORPD Z19, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z0, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z19, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z19, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z19, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64_loop + VZEROUPPER + +mulGFNI_1x10_64_end: + RET + +// func mulAvx2GFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x10(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ 
out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvx2GFNI_1x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y13, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y13, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD (CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x10_loop + VZEROUPPER + +mulAvx2GFNI_1x10_end: + RET + +// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (BX), Z10 + VMOVDQU64 (SI), Z11 + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (R10), Z15 + VMOVDQU64 (R11), Z16 + VMOVDQU64 (R12), Z17 + VMOVDQU64 (R13), Z18 + VMOVDQU64 (DX), Z19 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z10, Z21, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z11, Z21, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z12, Z21, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z13, Z21, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z14, Z21, Z14 + 
VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 // Store 10 outputs VMOVDQU64 Z10, (BX) @@ -1640,6 +3051,125 @@ mulGFNI_1x10_64Xor_loop: mulGFNI_1x10_64Xor_end: RET +// func mulAvx2GFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_1x10Xor(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_1x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvx2GFNI_1x10Xor_loop: + // Load 10 outputs + VMOVDQU (SI), Y4 + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_1x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_1x10Xor_end: + RET + // func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x1_64(SB), $0-88 @@ -1692,6 +3222,58 @@ mulGFNI_2x1_64_loop: mulGFNI_2x1_64_end: RET +// func mulAvx2GFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ 
n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvx2GFNI_2x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x1_loop + VZEROUPPER + +mulAvx2GFNI_2x1_end: + RET + // func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 @@ -1748,6 +3330,62 @@ mulGFNI_2x1_64Xor_loop: mulGFNI_2x1_64Xor_end: RET +// func mulAvx2GFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvx2GFNI_2x1Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y2 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x1Xor_end: + RET + // func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x2_64(SB), $0-88 @@ -1809,6 +3447,67 @@ mulGFNI_2x2_64_loop: mulGFNI_2x2_64_end: RET +// func mulAvx2GFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvx2GFNI_2x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y5 + + // Load and 
process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x2_loop + VZEROUPPER + +mulAvx2GFNI_2x2_end: + RET + // func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 @@ -1876,6 +3575,73 @@ mulGFNI_2x2_64Xor_loop: mulGFNI_2x2_64Xor_end: RET +// func mulAvx2GFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvx2GFNI_2x2Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y4 + VMOVDQU (BX), Y5 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x2Xor_end: + RET + // func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x3_64(SB), $0-88 @@ -1946,6 +3712,76 @@ mulGFNI_2x3_64_loop: mulGFNI_2x3_64_end: RET +// func mulAvx2GFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvx2GFNI_2x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD 
Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x3_loop + VZEROUPPER + +mulAvx2GFNI_2x3_end: + RET + // func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 @@ -2024,6 +3860,84 @@ mulGFNI_2x3_64Xor_loop: mulGFNI_2x3_64Xor_end: RET +// func mulAvx2GFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvx2GFNI_2x3Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (BX), Y8 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x3Xor_end: + RET + // func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x4_64(SB), $0-88 @@ -2103,6 +4017,85 @@ mulGFNI_2x4_64_loop: mulGFNI_2x4_64_end: RET +// func mulAvx2GFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvx2GFNI_2x4_loop: + // Load and process 
32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x4_loop + VZEROUPPER + +mulAvx2GFNI_2x4_end: + RET + // func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 @@ -2192,6 +4185,95 @@ mulGFNI_2x4_64Xor_loop: mulGFNI_2x4_64Xor_end: RET +// func mulAvx2GFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvx2GFNI_2x4Xor_loop: + // Load 4 outputs + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (BX), Y11 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x4Xor_end: + RET + // func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x5_64(SB), $0-88 @@ -2280,6 +4362,94 @@ mulGFNI_2x5_64_loop: mulGFNI_2x5_64_end: RET +// func mulAvx2GFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x5(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full 
registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvx2GFNI_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x5_loop + VZEROUPPER + +mulAvx2GFNI_2x5_end: + RET + // func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 @@ -2380,6 +4550,106 @@ mulGFNI_2x5_64Xor_loop: mulGFNI_2x5_64Xor_end: RET +// func mulAvx2GFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x5Xor(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvx2GFNI_2x5Xor_loop: + // Load 5 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x5Xor_end: + RET + // func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x6_64(SB), $0-88 @@ -2477,6 +4747,103 @@ mulGFNI_2x6_64_loop: mulGFNI_2x6_64_end: RET +// func mulAvx2GFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x6(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvx2GFNI_2x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x6_loop + VZEROUPPER + +mulAvx2GFNI_2x6_end: + RET + // func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 @@ -2588,6 +4955,117 @@ mulGFNI_2x6_64Xor_loop: mulGFNI_2x6_64Xor_end: RET +// func 
mulAvx2GFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x6Xor(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvx2GFNI_2x6Xor_loop: + // Load 6 outputs + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x6Xor_end: + RET + // func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x7_64(SB), $0-88 @@ -2694,6 +5172,112 @@ mulGFNI_2x7_64_loop: mulGFNI_2x7_64_end: RET +// func mulAvx2GFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x7(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 
120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvx2GFNI_2x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x7_loop + VZEROUPPER + +mulAvx2GFNI_2x7_end: + RET + // func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 @@ -2816,6 +5400,128 @@ mulGFNI_2x7_64Xor_loop: mulGFNI_2x7_64Xor_end: RET +// func mulAvx2GFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x7Xor(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvx2GFNI_2x7Xor_loop: + // Load 7 outputs + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x7Xor_end: + RET + // func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x8_64(SB), $0-88 @@ -2931,6 +5637,121 @@ mulGFNI_2x8_64_loop: mulGFNI_2x8_64_end: RET +// func mulAvx2GFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x8(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvx2GFNI_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x8_loop + VZEROUPPER + +mulAvx2GFNI_2x8_end: + RET + // func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 @@ -3064,6 +5885,139 @@ mulGFNI_2x8_64Xor_loop: mulGFNI_2x8_64Xor_end: RET +// func mulAvx2GFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x8Xor(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvx2GFNI_2x8Xor_loop: + // Load 8 outputs + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 
outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x8Xor_end: + RET + // func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x9_64(SB), $0-88 @@ -3188,6 +6142,130 @@ mulGFNI_2x9_64_loop: mulGFNI_2x9_64_end: RET +// func mulAvx2GFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x9(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvx2GFNI_2x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x9_loop + 
VZEROUPPER + +mulAvx2GFNI_2x9_end: + RET + // func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 @@ -3332,6 +6410,150 @@ mulGFNI_2x9_64Xor_loop: mulGFNI_2x9_64Xor_end: RET +// func mulAvx2GFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x9Xor(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvx2GFNI_2x9Xor_loop: + // Load 9 outputs + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + 
DECQ AX + JNZ mulAvx2GFNI_2x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x9Xor_end: + RET + // func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x10_64(SB), $0-88 @@ -3465,6 +6687,139 @@ mulGFNI_2x10_64_loop: mulGFNI_2x10_64_end: RET +// func mulAvx2GFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x10(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvx2GFNI_2x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x10_loop + VZEROUPPER + +mulAvx2GFNI_2x10_end: + RET + // func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n 
int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 @@ -3620,6 +6975,161 @@ mulGFNI_2x10_64Xor_loop: mulGFNI_2x10_64Xor_end: RET +// func mulAvx2GFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_2x10Xor(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_2x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvx2GFNI_2x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU 
Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_2x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_2x10Xor_end: + RET + // func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x1_64(SB), $0-88 @@ -3681,6 +7191,67 @@ mulGFNI_3x1_64_loop: mulGFNI_3x1_64_end: RET +// func mulAvx2GFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvx2GFNI_3x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x1_loop + VZEROUPPER + +mulAvx2GFNI_3x1_end: + RET + // func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 @@ -3746,6 +7317,71 @@ mulGFNI_3x1_64Xor_loop: mulGFNI_3x1_64Xor_end: RET +// func mulAvx2GFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvx2GFNI_3x1Xor_loop: + // Load 1 outputs + VMOVDQU (SI), Y3 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x1Xor_end: + RET + // func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x2_64(SB), $0-88 @@ -3819,6 +7455,79 @@ mulGFNI_3x2_64_loop: mulGFNI_3x2_64_end: RET +// func mulAvx2GFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvx2GFNI_3x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x2_loop + VZEROUPPER + +mulAvx2GFNI_3x2_end: + RET + // func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 @@ -3898,6 +7607,85 @@ mulGFNI_3x2_64Xor_loop: mulGFNI_3x2_64Xor_end: RET +// func mulAvx2GFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvx2GFNI_3x2Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y6 + VMOVDQU (SI), Y7 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD 
Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x2Xor_end: + RET + // func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x3_64(SB), $0-88 @@ -3983,6 +7771,91 @@ mulGFNI_3x3_64_loop: mulGFNI_3x3_64_end: RET +// func mulAvx2GFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvx2GFNI_3x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x3_loop + VZEROUPPER + +mulAvx2GFNI_3x3_end: + RET + // func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 @@ -4076,6 +7949,99 @@ mulGFNI_3x3_64Xor_loop: mulGFNI_3x3_64Xor_end: RET +// func mulAvx2GFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add 
start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvx2GFNI_3x3Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (SI), Y11 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x3Xor_end: + RET + // func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x4_64(SB), $0-88 @@ -4173,6 +8139,103 @@ mulGFNI_3x4_64_loop: mulGFNI_3x4_64_end: RET +// func mulAvx2GFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x4(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvx2GFNI_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x4_loop + VZEROUPPER + +mulAvx2GFNI_3x4_end: + RET + // func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 @@ -4280,6 +8343,113 @@ mulGFNI_3x4_64Xor_loop: mulGFNI_3x4_64Xor_end: RET +// func mulAvx2GFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x4Xor(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvx2GFNI_3x4Xor_loop: + // Load 4 outputs + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x4Xor_end: + RET + // func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x5_64(SB), $0-88 @@ -4389,6 +8559,115 @@ mulGFNI_3x5_64_loop: mulGFNI_3x5_64_end: RET +// func mulAvx2GFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x5(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 
YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvx2GFNI_3x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x5_loop + VZEROUPPER + +mulAvx2GFNI_3x5_end: + RET + // func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 @@ -4510,6 +8789,127 @@ mulGFNI_3x5_64Xor_loop: mulGFNI_3x5_64Xor_end: RET +// func mulAvx2GFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x5Xor(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, 
R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvx2GFNI_3x5Xor_loop: + // Load 5 outputs + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x5Xor_end: + RET + // func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x6_64(SB), $0-88 @@ -4631,6 +9031,127 @@ mulGFNI_3x6_64_loop: mulGFNI_3x6_64_end: RET +// func mulAvx2GFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x6(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvx2GFNI_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and 
process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x6_loop + VZEROUPPER + +mulAvx2GFNI_3x6_end: + RET + // func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 @@ -4766,6 +9287,141 @@ mulGFNI_3x6_64Xor_loop: mulGFNI_3x6_64Xor_end: RET +// func mulAvx2GFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x6Xor(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvx2GFNI_3x6Xor_loop: + // Load 6 outputs + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + 
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x6Xor_end: + RET + // func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x7_64(SB), $0-88 @@ -4899,6 +9555,139 @@ mulGFNI_3x7_64_loop: mulGFNI_3x7_64_end: RET +// func mulAvx2GFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x7(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvx2GFNI_3x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x7_loop + VZEROUPPER + +mulAvx2GFNI_3x7_end: + RET + // func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 @@ -5048,6 +9837,155 @@ mulGFNI_3x7_64Xor_loop: mulGFNI_3x7_64Xor_end: RET +// func mulAvx2GFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x7Xor(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvx2GFNI_3x7Xor_loop: + // Load 7 outputs + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + 
VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x7Xor_end: + RET + // func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x8_64(SB), $0-88 @@ -5191,6 +10129,151 @@ mulGFNI_3x8_64_loop: mulGFNI_3x8_64_end: RET +// func mulAvx2GFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x8(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvx2GFNI_3x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x8_loop + VZEROUPPER + +mulAvx2GFNI_3x8_end: + RET + // func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 @@ -5352,6 +10435,169 @@ mulGFNI_3x8_64Xor_loop: mulGFNI_3x8_64Xor_end: RET +// func mulAvx2GFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x8Xor(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvx2GFNI_3x8Xor_loop: + // Load 8 outputs + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB 
$0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x8Xor_end: + RET + // func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x9_64(SB), $8-88 @@ -5503,6 +10749,163 @@ mulGFNI_3x9_64_loop: mulGFNI_3x9_64_end: RET +// func mulAvx2GFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x9(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output 
+ ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvx2GFNI_3x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x9_loop + VZEROUPPER + +mulAvx2GFNI_3x9_end: + RET + // func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 @@ -5674,6 +11077,183 @@ mulGFNI_3x9_64Xor_loop: mulGFNI_3x9_64Xor_end: RET +// func mulAvx2GFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x9Xor(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, 
AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvx2GFNI_3x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, 
(R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_3x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x9Xor_end: + RET + // func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x10_64(SB), $8-88 @@ -5837,6 +11417,179 @@ mulGFNI_3x10_64_loop: mulGFNI_3x10_64_end: RET +// func mulAvx2GFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x10(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_3x10_loop + VZEROUPPER + +mulAvx2GFNI_3x10_end: + RET + // func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 @@ -6022,6 +11775,201 @@ mulGFNI_3x10_64Xor_loop: mulGFNI_3x10_64Xor_end: RET +// func mulAvx2GFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_3x10Xor(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_3x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_3x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_3x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_3x10Xor_end: + RET + // func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x1_64(SB), $0-88 @@ -6092,6 +12040,76 @@ mulGFNI_4x1_64_loop: mulGFNI_4x1_64_end: RET +// func mulAvx2GFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add 
start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvx2GFNI_4x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x1_loop + VZEROUPPER + +mulAvx2GFNI_4x1_end: + RET + // func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 @@ -6166,6 +12184,80 @@ mulGFNI_4x1_64Xor_loop: mulGFNI_4x1_64Xor_end: RET +// func mulAvx2GFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvx2GFNI_4x1Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y4 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x1Xor_end: + RET + // func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x2_64(SB), $0-88 @@ -6251,6 +12343,91 @@ mulGFNI_4x2_64_loop: mulGFNI_4x2_64_end: RET +// func mulAvx2GFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 
48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvx2GFNI_4x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x2_loop + VZEROUPPER + +mulAvx2GFNI_4x2_end: + RET + // func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 @@ -6342,6 +12519,97 @@ mulGFNI_4x2_64Xor_loop: mulGFNI_4x2_64Xor_end: RET +// func mulAvx2GFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvx2GFNI_4x2Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y8 + VMOVDQU (DI), Y9 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // 
Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x2Xor_end: + RET + // func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x3_64(SB), $0-88 @@ -6442,6 +12710,106 @@ mulGFNI_4x3_64_loop: mulGFNI_4x3_64_end: RET +// func mulAvx2GFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x3(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvx2GFNI_4x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x3_loop + VZEROUPPER + +mulAvx2GFNI_4x3_end: + RET + // func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 @@ -6550,6 +12918,114 @@ mulGFNI_4x3_64Xor_loop: mulGFNI_4x3_64Xor_end: RET +// func mulAvx2GFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x3Xor(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + 
VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvx2GFNI_4x3Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x3Xor_end: + RET + // func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x4_64(SB), $0-88 @@ -6665,6 +13141,121 @@ mulGFNI_4x4_64_loop: mulGFNI_4x4_64_end: RET +// func mulAvx2GFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x4(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvx2GFNI_4x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, 
Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x4_loop + VZEROUPPER + +mulAvx2GFNI_4x4_end: + RET + // func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 @@ -6790,6 +13381,131 @@ mulGFNI_4x4_64Xor_loop: mulGFNI_4x4_64Xor_end: RET +// func mulAvx2GFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x4Xor(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvx2GFNI_4x4Xor_loop: + // Load 4 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x4Xor_end: + RET + // func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x5_64(SB), $0-88 @@ -6920,6 +13636,136 @@ mulGFNI_4x5_64_loop: mulGFNI_4x5_64_end: RET +// func mulAvx2GFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x5(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvx2GFNI_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x5_loop + VZEROUPPER + +mulAvx2GFNI_4x5_end: + RET + // func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 @@ -7062,6 +13908,148 @@ mulGFNI_4x5_64Xor_loop: mulGFNI_4x5_64Xor_end: RET +// func mulAvx2GFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x5Xor(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvx2GFNI_4x5Xor_loop: + // Load 5 outputs + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x5Xor_end: + RET + // func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x6_64(SB), $0-88 @@ -7207,6 +14195,151 @@ mulGFNI_4x6_64_loop: mulGFNI_4x6_64_end: RET +// func mulAvx2GFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x6(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvx2GFNI_4x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + 
ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x6_loop + VZEROUPPER + +mulAvx2GFNI_4x6_end: + RET + // func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 @@ -7366,6 +14499,165 @@ mulGFNI_4x6_64Xor_loop: mulGFNI_4x6_64Xor_end: RET +// func mulAvx2GFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x6Xor(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvx2GFNI_4x6Xor_loop: + // Load 6 outputs + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), 
Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x6Xor_end: + RET + // func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x7_64(SB), $0-88 @@ -7521,6 +14813,166 @@ mulGFNI_4x7_64_loop: mulGFNI_4x7_64_end: RET +// func mulAvx2GFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x7(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvx2GFNI_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB 
$0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x7_loop + VZEROUPPER + +mulAvx2GFNI_4x7_end: + RET + // func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 @@ -7692,6 +15144,182 @@ mulGFNI_4x7_64Xor_loop: mulGFNI_4x7_64Xor_end: RET +// func mulAvx2GFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x7Xor(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + 
MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvx2GFNI_4x7Xor_loop: + // Load 7 outputs + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ 
$0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x7Xor_end: + RET + // func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x8_64(SB), $8-88 @@ -7857,6 +15485,181 @@ mulGFNI_4x8_64_loop: mulGFNI_4x8_64_end: RET +// func mulAvx2GFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x8(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvx2GFNI_4x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, 
Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x8_loop + VZEROUPPER + +mulAvx2GFNI_4x8_end: + RET + // func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 @@ -8040,6 +15843,199 @@ mulGFNI_4x8_64Xor_loop: mulGFNI_4x8_64Xor_end: RET +// func mulAvx2GFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x8Xor(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvx2GFNI_4x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_4x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x8Xor_end: + RET + // func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x9_64(SB), $8-88 @@ -8219,6 +16215,200 @@ mulGFNI_4x9_64_loop: mulGFNI_4x9_64_end: RET +// func mulAvx2GFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x9(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + 
MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_4x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_4x9_loop + VZEROUPPER + +mulAvx2GFNI_4x9_end: + RET + // func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 @@ -8418,6 +16608,220 @@ mulGFNI_4x9_64Xor_loop: mulGFNI_4x9_64Xor_end: RET +// func mulAvx2GFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x9Xor(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_4x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_4x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_4x9Xor_end: + RET + // func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x10_64(SB), $0-88 @@ -8576,51 +16980,415 @@ mulGFNI_4x10_64_loop: // Prepare for next loop ADDQ $0x40, R9 DECQ AX - JNZ mulGFNI_4x10_64_loop + JNZ mulGFNI_4x10_64_loop + VZEROUPPER + +mulGFNI_4x10_64_end: + RET + +// func mulAvx2GFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x10(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 
+ + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvx2GFNI_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvx2GFNI_4x10_loop + VZEROUPPER + +mulAvx2GFNI_4x10_end: + RET + +// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU64 (R10)(R9*1), Z20 + MOVQ 24(R8), R10 + VMOVDQU64 (R10)(R9*1), Z21 + MOVQ 48(R8), R10 + VMOVDQU64 (R10)(R9*1), Z22 + MOVQ 72(R8), R10 + VMOVDQU64 (R10)(R9*1), Z23 + MOVQ 96(R8), R10 + VMOVDQU64 (R10)(R9*1), Z24 + MOVQ 120(R8), R10 + VMOVDQU64 (R10)(R9*1), Z25 + MOVQ 144(R8), R10 + VMOVDQU64 (R10)(R9*1), Z26 + MOVQ 168(R8), R10 + VMOVDQU64 (R10)(R9*1), Z27 + MOVQ 192(R8), R10 + VMOVDQU64 (R10)(R9*1), Z28 + MOVQ 216(R8), R10 + VMOVDQU64 (R10)(R9*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, 
Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64Xor_loop VZEROUPPER -mulGFNI_4x10_64_end: +mulGFNI_4x10_64Xor_end: RET -// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 - // Loading 20 of 40 tables to registers +// func mulAvx2GFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_4x10Xor(SB), $0-88 + // Loading 4 of 40 tables to registers // Destination kept on stack // Full registers estimated 52 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_4x10_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - VBROADCASTF32X2 80(CX), Z10 - VBROADCASTF32X2 88(CX), Z11 - VBROADCASTF32X2 96(CX), Z12 - VBROADCASTF32X2 104(CX), Z13 
- VBROADCASTF32X2 112(CX), Z14 - VBROADCASTF32X2 120(CX), Z15 - VBROADCASTF32X2 128(CX), Z16 - VBROADCASTF32X2 136(CX), Z17 - VBROADCASTF32X2 144(CX), Z18 - VBROADCASTF32X2 152(CX), Z19 - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 - MOVQ start+72(FP), R9 + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_4x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 // Add start offset to input ADDQ R9, BX @@ -8628,154 +17396,190 @@ TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 ADDQ R9, DI ADDQ R9, DX -mulGFNI_4x10_64Xor_loop: +mulAvx2GFNI_4x10Xor_loop: // Load 10 outputs - MOVQ (R8), R10 - VMOVDQU64 (R10)(R9*1), Z20 - MOVQ 24(R8), R10 - VMOVDQU64 (R10)(R9*1), Z21 - MOVQ 48(R8), R10 - VMOVDQU64 (R10)(R9*1), Z22 - MOVQ 72(R8), R10 - VMOVDQU64 (R10)(R9*1), Z23 - MOVQ 96(R8), R10 - VMOVDQU64 (R10)(R9*1), Z24 - MOVQ 120(R8), R10 - VMOVDQU64 (R10)(R9*1), Z25 - MOVQ 144(R8), R10 - VMOVDQU64 (R10)(R9*1), Z26 - MOVQ 168(R8), R10 - VMOVDQU64 (R10)(R9*1), Z27 - MOVQ 192(R8), R10 - VMOVDQU64 (R10)(R9*1), Z28 - MOVQ 216(R8), R10 - VMOVDQU64 (R10)(R9*1), Z29 - - // Load and process 64 bytes from input 0 to 10 outputs - VMOVDQU64 (BX), Z30 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 - VXORPD Z20, Z31, Z20 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 - VXORPD Z21, Z31, Z21 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 - VXORPD Z22, Z31, Z22 - VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 - VXORPD Z23, Z31, Z23 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 1 to 10 outputs - VMOVDQU64 (SI), Z30 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 - VXORPD Z20, Z31, Z20 - VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 - VXORPD Z21, Z31, Z21 - VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 - VXORPD Z22, Z31, Z22 - VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 - VXORPD Z23, Z31, Z23 - VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 2 to 10 outputs - VMOVDQU64 (DI), Z30 - ADDQ $0x40, DI - VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 - VXORPD Z20, Z31, Z20 - VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 - VXORPD Z21, Z31, Z21 - VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 - VXORPD Z22, Z31, Z22 - VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 - VXORPD Z23, Z31, Z23 - VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 - VXORPD 
Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 3 to 10 outputs - VMOVDQU64 (DX), Z30 - ADDQ $0x40, DX - VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 - VXORPD Z20, Z31, Z20 - VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 - VXORPD Z21, Z31, Z21 - VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 - VXORPD Z22, Z31, Z22 - VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 - VXORPD Z23, Z31, Z23 - VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y4 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y10 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y11 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y12 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 10 outputs - MOVQ (R8), R10 - VMOVDQU64 Z20, (R10)(R9*1) - MOVQ 24(R8), R10 - VMOVDQU64 Z21, (R10)(R9*1) - MOVQ 48(R8), R10 - VMOVDQU64 Z22, (R10)(R9*1) - MOVQ 72(R8), R10 - VMOVDQU64 Z23, (R10)(R9*1) - MOVQ 96(R8), R10 - VMOVDQU64 Z24, (R10)(R9*1) - MOVQ 120(R8), R10 - VMOVDQU64 Z25, (R10)(R9*1) - MOVQ 144(R8), R10 - VMOVDQU64 Z26, (R10)(R9*1) - MOVQ 168(R8), R10 - VMOVDQU64 Z27, (R10)(R9*1) - MOVQ 192(R8), R10 - VMOVDQU64 Z28, (R10)(R9*1) - MOVQ 216(R8), R10 - VMOVDQU64 Z29, (R10)(R9*1) + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) // Prepare for next loop - ADDQ $0x40, R9 + ADDQ $0x20, R9 DECQ AX - JNZ mulGFNI_4x10_64Xor_loop + JNZ mulAvx2GFNI_4x10Xor_loop VZEROUPPER -mulGFNI_4x10_64Xor_end: +mulAvx2GFNI_4x10Xor_end: RET // func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -8857,6 +17661,85 @@ mulGFNI_5x1_64_loop: mulGFNI_5x1_64_end: RET +// func mulAvx2GFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to 
output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvx2GFNI_5x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x1_loop + VZEROUPPER + +mulAvx2GFNI_5x1_end: + RET + // func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 @@ -8940,6 +17823,89 @@ mulGFNI_5x1_64Xor_loop: mulGFNI_5x1_64Xor_end: RET +// func mulAvx2GFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvx2GFNI_5x1Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y5 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x1Xor_end: + RET + // func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x2_64(SB), $0-88 @@ -9037,6 +18003,103 @@ mulGFNI_5x2_64_loop: mulGFNI_5x2_64_end: RET +// func mulAvx2GFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x2(SB), $0-88 + // Loading all tables to registers + 
// Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvx2GFNI_5x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x2_loop + VZEROUPPER + +mulAvx2GFNI_5x2_end: + RET + // func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 @@ -9140,6 +18203,109 @@ mulGFNI_5x2_64Xor_loop: mulGFNI_5x2_64Xor_end: RET +// func mulAvx2GFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvx2GFNI_5x2Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R8), Y11 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ 
$0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x2Xor_end: + RET + // func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x3_64(SB), $0-88 @@ -9255,6 +18421,121 @@ mulGFNI_5x3_64_loop: mulGFNI_5x3_64_end: RET +// func mulAvx2GFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x3(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvx2GFNI_5x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x3_loop + VZEROUPPER + +mulAvx2GFNI_5x3_end: + RET + // func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 @@ -9378,6 +18659,129 @@ mulGFNI_5x3_64Xor_loop: mulGFNI_5x3_64Xor_end: RET +// func mulAvx2GFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x3Xor(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvx2GFNI_5x3Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x3Xor_end: + RET + // func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x4_64(SB), $0-88 @@ -9511,6 +18915,139 @@ mulGFNI_5x4_64_loop: mulGFNI_5x4_64_end: RET +// func mulAvx2GFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x4(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvx2GFNI_5x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 
4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x4_loop + VZEROUPPER + +mulAvx2GFNI_5x4_end: + RET + // func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x4_64Xor(SB), $0-88 @@ -9654,6 +19191,149 @@ mulGFNI_5x4_64Xor_loop: mulGFNI_5x4_64Xor_end: RET +// func mulAvx2GFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x4Xor(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvx2GFNI_5x4Xor_loop: + // Load 4 outputs + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x4Xor_end: + RET + // func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x5_64(SB), $0-88 @@ -9805,6 +19485,157 @@ mulGFNI_5x5_64_loop: mulGFNI_5x5_64_end: RET +// func mulAvx2GFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x5(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvx2GFNI_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x5_loop + VZEROUPPER + +mulAvx2GFNI_5x5_end: + RET + // func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 @@ -9968,6 +19799,169 @@ mulGFNI_5x5_64Xor_loop: mulGFNI_5x5_64Xor_end: RET +// func mulAvx2GFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x5Xor(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvx2GFNI_5x5Xor_loop: + // Load 5 outputs + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x5Xor_end: + RET + // func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x6_64(SB), $0-88 @@ -10131,6 +20125,175 @@ mulGFNI_5x6_64_loop: mulGFNI_5x6_64_end: RET +// func mulAvx2GFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x6(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvx2GFNI_5x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x6_loop + VZEROUPPER + +mulAvx2GFNI_5x6_end: + RET + // func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 @@ -10308,6 +20471,189 @@ mulGFNI_5x6_64Xor_loop: mulGFNI_5x6_64Xor_end: RET +// func mulAvx2GFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x6Xor(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 
24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvx2GFNI_5x6Xor_loop: + // Load 6 outputs + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + 
ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x6Xor_end: + RET + // func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x7_64(SB), $8-88 @@ -10483,6 +20829,193 @@ mulGFNI_5x7_64_loop: mulGFNI_5x7_64_end: RET +// func mulAvx2GFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x7(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvx2GFNI_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x7_loop + VZEROUPPER + +mulAvx2GFNI_5x7_end: + RET + // func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 @@ -10674,6 +21207,209 @@ mulGFNI_5x7_64Xor_loop: mulGFNI_5x7_64Xor_end: RET +// func mulAvx2GFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x7Xor(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvx2GFNI_5x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_5x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x7Xor_end: + RET + // func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x8_64(SB), $8-88 @@ -10865,6 +21601,215 @@ mulGFNI_5x8_64_loop: 
mulGFNI_5x8_64_end: RET +// func mulAvx2GFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x8(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_5x8_loop + VZEROUPPER + +mulAvx2GFNI_5x8_end: + RET + // func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 @@ -11074,6 +22019,233 @@ mulGFNI_5x8_64Xor_loop: mulGFNI_5x8_64Xor_end: RET +// func mulAvx2GFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x8Xor(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_5x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + 
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 
32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_5x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x8Xor_end: + RET + // func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x9_64(SB), $0-88 @@ -11254,6 +22426,210 @@ mulGFNI_5x9_64_loop: mulGFNI_5x9_64_end: RET +// func mulAvx2GFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x9(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvx2GFNI_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvx2GFNI_5x9_loop + VZEROUPPER + +mulAvx2GFNI_5x9_end: + RET + // func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 @@ -11463,6 +22839,239 @@ mulGFNI_5x9_64Xor_loop: mulGFNI_5x9_64Xor_end: RET +// func mulAvx2GFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x9Xor(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + 
// Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvx2GFNI_5x9Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvx2GFNI_5x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x9Xor_end: + RET + // func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x10_64(SB), $0-88 @@ -11653,6 +23262,226 @@ mulGFNI_5x10_64_loop: mulGFNI_5x10_64_end: RET +// func mulAvx2GFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x10(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvx2GFNI_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + 
ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvx2GFNI_5x10_loop + VZEROUPPER + +mulAvx2GFNI_5x10_end: + RET + // func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 @@ -11875,6 +23704,258 @@ mulGFNI_5x10_64Xor_loop: mulGFNI_5x10_64Xor_end: RET +// func mulAvx2GFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_5x10Xor(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_5x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvx2GFNI_5x10Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y4 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), 
R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvx2GFNI_5x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_5x10Xor_end: + RET + // func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x1_64(SB), $0-88 @@ -11963,6 +24044,94 @@ mulGFNI_6x1_64_loop: mulGFNI_6x1_64_end: RET +// func mulAvx2GFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvx2GFNI_6x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x1_loop + VZEROUPPER + +mulAvx2GFNI_6x1_end: + RET + // func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 @@ -12055,6 +24224,98 @@ mulGFNI_6x1_64Xor_loop: mulGFNI_6x1_64Xor_end: RET +// func mulAvx2GFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), 
Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvx2GFNI_6x1Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y6 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x1Xor_end: + RET + // func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x2_64(SB), $0-88 @@ -12164,6 +24425,115 @@ mulGFNI_6x2_64_loop: mulGFNI_6x2_64_end: RET +// func mulAvx2GFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvx2GFNI_6x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load 
and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x2_loop + VZEROUPPER + +mulAvx2GFNI_6x2_end: + RET + // func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 @@ -12279,6 +24649,121 @@ mulGFNI_6x2_64Xor_loop: mulGFNI_6x2_64Xor_end: RET +// func mulAvx2GFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvx2GFNI_6x2Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x2Xor_end: + RET + // func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x3_64(SB), $0-88 @@ -12409,6 +24894,136 @@ mulGFNI_6x3_64_loop: mulGFNI_6x3_64_end: RET +// func mulAvx2GFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x3(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvx2GFNI_6x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + 
VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x3_loop + VZEROUPPER + +mulAvx2GFNI_6x3_end: + RET + // func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 @@ -12547,6 +25162,144 @@ mulGFNI_6x3_64Xor_loop: mulGFNI_6x3_64Xor_end: RET +// func mulAvx2GFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x3Xor(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvx2GFNI_6x3Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, 
R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x3Xor_end: + RET + // func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x4_64(SB), $0-88 @@ -12698,6 +25451,157 @@ mulGFNI_6x4_64_loop: mulGFNI_6x4_64_end: RET +// func mulAvx2GFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x4(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvx2GFNI_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ 
$0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x4_loop + VZEROUPPER + +mulAvx2GFNI_6x4_end: + RET + // func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 @@ -12859,6 +25763,167 @@ mulGFNI_6x4_64Xor_loop: mulGFNI_6x4_64Xor_end: RET +// func mulAvx2GFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x4Xor(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvx2GFNI_6x4Xor_loop: + // Load 4 outputs + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x4Xor_end: + RET + // func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x5_64(SB), $0-88 @@ -13026,6 +26091,178 @@ mulGFNI_6x5_64_loop: mulGFNI_6x5_64_end: RET +// func mulAvx2GFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x5(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvx2GFNI_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + 
VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x5_loop + VZEROUPPER + +mulAvx2GFNI_6x5_end: + RET + // func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 @@ -13205,6 +26442,190 @@ mulGFNI_6x5_64Xor_loop: mulGFNI_6x5_64Xor_end: RET +// func mulAvx2GFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x5Xor(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start 
offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvx2GFNI_6x5Xor_loop: + // Load 5 outputs + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare 
for next loop + DECQ AX + JNZ mulAvx2GFNI_6x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x5Xor_end: + RET + // func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x6_64(SB), $8-88 @@ -13386,6 +26807,199 @@ mulGFNI_6x6_64_loop: mulGFNI_6x6_64_end: RET +// func mulAvx2GFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x6(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvx2GFNI_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 
24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x6_loop + VZEROUPPER + +mulAvx2GFNI_6x6_end: + RET + // func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 @@ -13581,6 +27195,213 @@ mulGFNI_6x6_64Xor_loop: mulGFNI_6x6_64Xor_end: RET +// func mulAvx2GFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x6Xor(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvx2GFNI_6x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_6x6Xor_loop + VZEROUPPER + 
+mulAvx2GFNI_6x6Xor_end: + RET + // func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x7_64(SB), $8-88 @@ -13780,6 +27601,224 @@ mulGFNI_6x7_64_loop: mulGFNI_6x7_64_end: RET +// func mulAvx2GFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x7(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 
24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_6x7_loop + VZEROUPPER + +mulAvx2GFNI_6x7_end: + RET + // func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 @@ -13995,6 +28034,240 @@ mulGFNI_6x7_64Xor_loop: mulGFNI_6x7_64Xor_end: RET +// func mulAvx2GFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x7Xor(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + 
ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_6x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_6x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x7Xor_end: + RET + // func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x8_64(SB), $0-88 @@ -14187,6 +28460,224 @@ mulGFNI_6x8_64_loop: mulGFNI_6x8_64_end: RET +// func mulAvx2GFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x8(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 
168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x8_loop + VZEROUPPER + +mulAvx2GFNI_6x8_end: + RET + // func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 @@ -14405,6 +28896,250 @@ mulGFNI_6x8_64Xor_loop: mulGFNI_6x8_64Xor_end: RET +// func mulAvx2GFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x8Xor(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x8Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x8Xor_end: + RET + // func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x9_64(SB), $0-88 @@ -14609,6 +29344,243 @@ mulGFNI_6x9_64_loop: mulGFNI_6x9_64_end: RET +// func mulAvx2GFNI_6x9(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x9(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x9_loop + VZEROUPPER + +mulAvx2GFNI_6x9_end: + RET + // func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 @@ -14842,6 +29814,272 @@ mulGFNI_6x9_64Xor_loop: mulGFNI_6x9_64Xor_end: RET +// func mulAvx2GFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x9Xor(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + 
MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x9Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x9Xor_end: + RET + // func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x10_64(SB), $0-88 @@ -15058,6 +30296,262 @@ mulGFNI_6x10_64_loop: mulGFNI_6x10_64_end: RET +// func mulAvx2GFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x10(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), 
Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x10_loop + VZEROUPPER + +mulAvx2GFNI_6x10_end: + RET + // func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 @@ -15306,6 +30800,294 @@ mulGFNI_6x10_64Xor_loop: mulGFNI_6x10_64Xor_end: RET +// func mulAvx2GFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_6x10Xor(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_6x10Xor_end + VBROADCASTSD (CX), Y0 + 
VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvx2GFNI_6x10Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y4 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // 
Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvx2GFNI_6x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_6x10Xor_end: + RET + // func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x1_64(SB), $0-88 @@ -15403,6 +31185,103 @@ mulGFNI_7x1_64_loop: mulGFNI_7x1_64_end: RET +// func mulAvx2GFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvx2GFNI_7x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x1_loop + VZEROUPPER + +mulAvx2GFNI_7x1_end: + RET + // func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 @@ -15504,6 +31383,107 @@ mulGFNI_7x1_64Xor_loop: mulGFNI_7x1_64Xor_end: RET +// func mulAvx2GFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + 
MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvx2GFNI_7x1Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y7 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x1Xor_end: + RET + // func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x2_64(SB), $0-88 @@ -15625,6 +31605,127 @@ mulGFNI_7x2_64_loop: mulGFNI_7x2_64_end: RET +// func mulAvx2GFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x2(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvx2GFNI_7x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD 
Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x2_loop + VZEROUPPER + +mulAvx2GFNI_7x2_end: + RET + // func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x2_64Xor(SB), $0-88 @@ -15752,6 +31853,133 @@ mulGFNI_7x2_64Xor_loop: mulGFNI_7x2_64Xor_end: RET +// func mulAvx2GFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x2Xor(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvx2GFNI_7x2Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from 
input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x2Xor_end: + RET + // func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x3_64(SB), $0-88 @@ -15897,6 +32125,151 @@ mulGFNI_7x3_64_loop: mulGFNI_7x3_64_end: RET +// func mulAvx2GFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x3(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvx2GFNI_7x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, 
Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x3_loop + VZEROUPPER + +mulAvx2GFNI_7x3_end: + RET + // func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 @@ -16050,6 +32423,159 @@ mulGFNI_7x3_64Xor_loop: mulGFNI_7x3_64Xor_end: RET +// func mulAvx2GFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x3Xor(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvx2GFNI_7x3Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from 
input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x3Xor_end: + RET + // func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x4_64(SB), $0-88 @@ -16217,6 +32743,175 @@ mulGFNI_7x4_64_loop: mulGFNI_7x4_64_end: RET +// func mulAvx2GFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x4(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvx2GFNI_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, 
Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x4_loop + VZEROUPPER + +mulAvx2GFNI_7x4_end: + RET + // func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 @@ -16394,6 +33089,185 @@ mulGFNI_7x4_64Xor_loop: mulGFNI_7x4_64Xor_end: RET +// func mulAvx2GFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x4Xor(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 
+ MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvx2GFNI_7x4Xor_loop: + // Load 4 outputs + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU 
Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x4Xor_end: + RET + // func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x5_64(SB), $8-88 @@ -16577,6 +33451,199 @@ mulGFNI_7x5_64_loop: mulGFNI_7x5_64_end: RET +// func mulAvx2GFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x5(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvx2GFNI_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x5_loop + VZEROUPPER + +mulAvx2GFNI_7x5_end: + RET + // func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 @@ -16772,6 +33839,211 @@ mulGFNI_7x5_64Xor_loop: mulGFNI_7x5_64Xor_end: RET +// func mulAvx2GFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x5Xor(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvx2GFNI_7x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_7x5Xor_loop + VZEROUPPER + 
+mulAvx2GFNI_7x5Xor_end: + RET + // func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x6_64(SB), $8-88 @@ -16846,16 +34118,454 @@ TEXT ·mulGFNI_7x6_64(SB), $8-88 MOVQ n+80(FP), BP SHRQ $0x06, BP -mulGFNI_7x6_64_loop: +mulGFNI_7x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 
328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64_loop + VZEROUPPER + +mulGFNI_7x6_64_end: + RET + +// func mulAvx2GFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x6(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 
+ VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_7x6_loop + VZEROUPPER + +mulAvx2GFNI_7x6_end: + RET + +// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 + // Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + 
VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 - VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (BX), Z30 @@ -16969,64 +34679,48 @@ mulGFNI_7x6_64_loop: // Prepare for next loop DECQ BP - JNZ mulGFNI_7x6_64_loop + JNZ mulGFNI_7x6_64Xor_loop VZEROUPPER -mulGFNI_7x6_64_end: +mulGFNI_7x6_64Xor_end: RET -// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 - // Loading 24 of 42 tables to registers +// func mulAvx2GFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x6Xor(SB), $8-88 + // Loading 8 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_7x6_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - VBROADCASTF32X2 80(CX), Z10 - VBROADCASTF32X2 88(CX), Z11 - VBROADCASTF32X2 96(CX), Z12 - VBROADCASTF32X2 104(CX), Z13 - VBROADCASTF32X2 112(CX), Z14 - VBROADCASTF32X2 120(CX), Z15 - VBROADCASTF32X2 128(CX), Z16 - VBROADCASTF32X2 136(CX), Z17 - VBROADCASTF32X2 144(CX), Z18 - VBROADCASTF32X2 152(CX), Z19 - VBROADCASTF32X2 160(CX), Z20 - VBROADCASTF32X2 168(CX), Z21 - VBROADCASTF32X2 176(CX), Z22 - VBROADCASTF32X2 184(CX), Z23 - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), AX - MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R13 - MOVQ 72(R10), R14 - MOVQ 96(R10), R15 - MOVQ 120(R10), R10 - MOVQ start+72(FP), BP + MOVQ n+80(FP), AX + MOVQ 
matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 @@ -17047,149 +34741,183 @@ TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 // Reload length to save a register MOVQ n+80(FP), BP - SHRQ $0x06, BP + SHRQ $0x05, BP -mulGFNI_7x6_64Xor_loop: +mulAvx2GFNI_7x6Xor_loop: // Load 6 outputs - VMOVDQU64 (R11), Z24 - VMOVDQU64 (R12), Z25 - VMOVDQU64 (R13), Z26 - VMOVDQU64 (R14), Z27 - VMOVDQU64 (R15), Z28 - VMOVDQU64 (R10), Z29 - - // Load and process 64 bytes from input 0 to 6 outputs - VMOVDQU64 (DX), Z30 - ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 1 to 6 outputs - VMOVDQU64 (BX), Z30 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 2 to 6 outputs - VMOVDQU64 (SI), Z30 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 3 to 6 outputs - VMOVDQU64 (DI), Z30 - ADDQ $0x40, DI - VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 4 to 6 outputs - VMOVDQU64 (R8), Z30 - ADDQ $0x40, R8 - VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 5 to 6 outputs - VMOVDQU64 (R9), Z30 - ADDQ $0x40, R9 - VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 - 
VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 6 to 6 outputs - VMOVDQU64 (AX), Z30 - ADDQ $0x40, AX - VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 6 outputs - VMOVDQU64 Z24, (R11) - ADDQ $0x40, R11 - VMOVDQU64 Z25, (R12) - ADDQ $0x40, R12 - VMOVDQU64 Z26, (R13) - ADDQ $0x40, R13 - VMOVDQU64 Z27, (R14) - ADDQ $0x40, R14 - VMOVDQU64 Z28, (R15) - ADDQ $0x40, R15 - VMOVDQU64 Z29, (R10) - ADDQ $0x40, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 // Prepare for next loop DECQ BP - JNZ mulGFNI_7x6_64Xor_loop + JNZ mulAvx2GFNI_7x6Xor_loop VZEROUPPER -mulGFNI_7x6_64Xor_end: +mulAvx2GFNI_7x6Xor_end: RET // func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -17392,6 +35120,232 @@ mulGFNI_7x7_64_loop: mulGFNI_7x7_64_end: RET +// func mulAvx2GFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x7(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, 
Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, 
Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x7_loop + VZEROUPPER + +mulAvx2GFNI_7x7_end: + RET + // func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 @@ -17615,6 +35569,255 @@ mulGFNI_7x7_64Xor_loop: mulGFNI_7x7_64Xor_end: RET +// func mulAvx2GFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x7Xor(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x7Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x7Xor_end: + RET + // func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x8_64(SB), $0-88 @@ -17829,6 +36032,254 @@ mulGFNI_7x8_64_loop: mulGFNI_7x8_64_end: RET +// func mulAvx2GFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x8(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x8_loop + VZEROUPPER + +mulAvx2GFNI_7x8_end: + RET + // func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 @@ -18069,6 +36520,280 @@ mulGFNI_7x8_64Xor_loop: mulGFNI_7x8_64Xor_end: RET +// func mulAvx2GFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x8Xor(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x8Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x8Xor_end: + RET + // func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x9_64(SB), $0-88 @@ -18297,6 +37022,276 @@ mulGFNI_7x9_64_loop: mulGFNI_7x9_64_end: RET +// func mulAvx2GFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x9(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x9_loop + VZEROUPPER + +mulAvx2GFNI_7x9_end: + RET + // func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 @@ -18554,6 +37549,305 @@ mulGFNI_7x9_64Xor_loop: mulGFNI_7x9_64Xor_end: RET +// func mulAvx2GFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x9Xor(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x9Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x9Xor_end: + RET + // func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x10_64(SB), $0-88 @@ -18796,6 +38090,298 @@ mulGFNI_7x10_64_loop: mulGFNI_7x10_64_end: RET +// func mulAvx2GFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x10(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + 
ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x10_loop + VZEROUPPER + 
+mulAvx2GFNI_7x10_end: + RET + // func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 @@ -19070,6 +38656,330 @@ mulGFNI_7x10_64Xor_loop: mulGFNI_7x10_64Xor_end: RET +// func mulAvx2GFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_7x10Xor(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_7x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvx2GFNI_7x10Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y4 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + 
VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvx2GFNI_7x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_7x10Xor_end: + RET + // func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x1_64(SB), $0-88 @@ -19116,11 +39026,227 @@ TEXT ·mulGFNI_8x1_64(SB), $0-88 ADDQ R12, R10 ADDQ R12, CX -mulGFNI_8x1_64_loop: +mulGFNI_8x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64_loop + VZEROUPPER + +mulGFNI_8x1_64_end: + RET + +// func mulAvx2GFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) 
+// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvx2GFNI_8x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x1_loop + VZEROUPPER + +mulAvx2GFNI_8x1_end: + RET + +// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R11), Z8 + // Load and process 64 bytes from input 0 to 1 
outputs VMOVDQU64 (DX), Z9 ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z9, Z8 + VGF2P8AFFINEQB $0x00, Z0, Z9, Z9 + VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z9 @@ -19170,44 +39296,44 @@ mulGFNI_8x1_64_loop: // Prepare for next loop DECQ AX - JNZ mulGFNI_8x1_64_loop + JNZ mulGFNI_8x1_64Xor_loop VZEROUPPER -mulGFNI_8x1_64_end: +mulGFNI_8x1_64Xor_end: RET -// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 +// func mulAvx2GFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_8x1_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), R9 - MOVQ 144(CX), R10 - MOVQ 168(CX), CX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ (R11), R11 - MOVQ start+72(FP), R12 + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 @@ -19222,68 +39348,68 @@ TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 ADDQ R12, R10 ADDQ R12, CX -mulGFNI_8x1_64Xor_loop: +mulAvx2GFNI_8x1Xor_loop: // Load 1 outputs - VMOVDQU64 (R11), Z8 - - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU64 (DX), Z9 - ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU64 (BX), Z9 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU64 (SI), Z9 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU64 (DI), Z9 - ADDQ $0x40, DI - VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU64 (R8), Z9 - ADDQ $0x40, R8 - VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU64 (R9), Z9 - ADDQ $0x40, R9 - VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU64 (R10), Z9 - ADDQ $0x40, R10 - VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 7 to 1 outputs - VMOVDQU64 (CX), Z9 - ADDQ $0x40, CX - VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 - VXORPD Z8, Z9, Z8 + VMOVDQU (R11), Y8 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU 
(DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 // Store 1 outputs - VMOVDQU64 Z8, (R11) - ADDQ $0x40, R11 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 // Prepare for next loop DECQ AX - JNZ mulGFNI_8x1_64Xor_loop + JNZ mulAvx2GFNI_8x1Xor_loop VZEROUPPER -mulGFNI_8x1_64Xor_end: +mulAvx2GFNI_8x1Xor_end: RET // func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -19419,6 +39545,139 @@ mulGFNI_8x2_64_loop: mulGFNI_8x2_64_end: RET +// func mulAvx2GFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x2(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvx2GFNI_8x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, 
Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x2_loop + VZEROUPPER + +mulAvx2GFNI_8x2_end: + RET + // func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 @@ -19558,6 +39817,145 @@ mulGFNI_8x2_64Xor_loop: mulGFNI_8x2_64Xor_end: RET +// func mulAvx2GFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x2Xor(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvx2GFNI_8x2Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, 
Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x2Xor_end: + RET + // func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x3_64(SB), $0-88 @@ -19718,6 +40116,166 @@ mulGFNI_8x3_64_loop: mulGFNI_8x3_64_end: RET +// func mulAvx2GFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x3(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvx2GFNI_8x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + 
VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x3_loop + VZEROUPPER + +mulAvx2GFNI_8x3_end: + RET + // func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 @@ -19886,6 +40444,174 @@ mulGFNI_8x3_64Xor_loop: mulGFNI_8x3_64Xor_end: RET +// func mulAvx2GFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x3Xor(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvx2GFNI_8x3Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 
to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x3Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x3Xor_end: + RET + // func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x4_64(SB), $8-88 @@ -20067,6 +40793,193 @@ mulGFNI_8x4_64_loop: mulGFNI_8x4_64_end: RET +// func mulAvx2GFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x4(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + 
VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvx2GFNI_8x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ 
$0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x4_loop + VZEROUPPER + +mulAvx2GFNI_8x4_end: + RET + // func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 @@ -20258,6 +41171,203 @@ mulGFNI_8x4_64Xor_loop: mulGFNI_8x4_64Xor_end: RET +// func mulAvx2GFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x4Xor(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvx2GFNI_8x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_8x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x4Xor_end: + RET + // func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x5_64(SB), $8-88 @@ -20461,6 +41571,224 @@ mulGFNI_8x5_64_loop: mulGFNI_8x5_64_end: RET +// func mulAvx2GFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x5(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + 
ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, 
Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_8x5_loop + VZEROUPPER + +mulAvx2GFNI_8x5_end: + RET + // func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 @@ -20676,6 +42004,236 @@ mulGFNI_8x5_64Xor_loop: mulGFNI_8x5_64Xor_end: RET +// func mulAvx2GFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x5Xor(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_8x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + 
VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_8x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x5Xor_end: + RET + // func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, 
AVX512F, GFNI TEXT ·mulGFNI_8x6_64(SB), $0-88 @@ -20880,6 +42438,234 @@ mulGFNI_8x6_64_loop: mulGFNI_8x6_64_end: RET +// func mulAvx2GFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x6(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x6_loop + VZEROUPPER + +mulAvx2GFNI_8x6_end: + RET + // func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x6_64Xor(SB), $0-88 @@ -21104,6 +42890,254 @@ mulGFNI_8x6_64Xor_loop: mulGFNI_8x6_64Xor_end: RET +// func mulAvx2GFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x6Xor(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), 
R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x6Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU 
(R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x6Xor_end: + RET + // func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x7_64(SB), $0-88 @@ -21324,6 +43358,259 @@ mulGFNI_8x7_64_loop: mulGFNI_8x7_64_end: RET +// func mulAvx2GFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x7(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + 
VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x7_loop + VZEROUPPER + +mulAvx2GFNI_8x7_end: + RET + // func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 @@ -21567,6 +43854,282 @@ mulGFNI_8x7_64Xor_loop: mulGFNI_8x7_64Xor_end: RET +// func mulAvx2GFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x7Xor(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x7Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 72(R12), R14 + VMOVDQU 
(R14)(R13*1), Y10 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + 
VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x7Xor_end: + RET + // func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x8_64(SB), $0-88 @@ -21803,6 +44366,284 @@ mulGFNI_8x8_64_loop: mulGFNI_8x8_64_end: RET +// func mulAvx2GFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x8(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 
+ + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x8_loop + VZEROUPPER + +mulAvx2GFNI_8x8_end: + RET + // func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 @@ -22065,6 +44906,310 @@ mulGFNI_8x8_64Xor_loop: mulGFNI_8x8_64Xor_end: RET +// func mulAvx2GFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x8Xor(SB), $0-88 + // Loading 
6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x8Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x8Xor_end: + RET + // func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x9_64(SB), $0-88 @@ -22317,6 +45462,309 @@ mulGFNI_8x9_64_loop: mulGFNI_8x9_64_end: RET +// func mulAvx2GFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x9(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 
16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x9_loop + VZEROUPPER + +mulAvx2GFNI_8x9_end: + RET + // func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 @@ -22598,6 +46046,338 @@ mulGFNI_8x9_64Xor_loop: mulGFNI_8x9_64Xor_end: RET +// func mulAvx2GFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x9Xor(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x9Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 120(R12), 
R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and 
process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, 
(R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x9Xor_end: + RET + // func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x10_64(SB), $0-88 @@ -22866,6 +46646,334 @@ mulGFNI_8x10_64_loop: mulGFNI_8x10_64_end: RET +// func mulAvx2GFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x10(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + 
// Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x10_loop + VZEROUPPER + +mulAvx2GFNI_8x10_end: + RET + // func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 @@ -23166,6 +47274,366 @@ mulGFNI_8x10_64Xor_loop: mulGFNI_8x10_64Xor_end: RET +// func mulAvx2GFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_8x10Xor(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_8x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start 
offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvx2GFNI_8x10Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y4 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, 
Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvx2GFNI_8x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_8x10Xor_end: + RET + // func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x1_64(SB), $0-88 @@ -23281,6 +47749,121 @@ mulGFNI_9x1_64_loop: mulGFNI_9x1_64_end: RET +// func mulAvx2GFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvx2GFNI_9x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB 
$0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x1_loop + VZEROUPPER + +mulAvx2GFNI_9x1_end: + RET + // func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 @@ -23400,6 +47983,125 @@ mulGFNI_9x1_64Xor_loop: mulGFNI_9x1_64Xor_end: RET +// func mulAvx2GFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvx2GFNI_9x1Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y9 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + 
VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x1Xor_end: + RET + // func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x2_64(SB), $0-88 @@ -23545,6 +48247,151 @@ mulGFNI_9x2_64_loop: mulGFNI_9x2_64_end: RET +// func mulAvx2GFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x2(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvx2GFNI_9x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x2_loop + VZEROUPPER + +mulAvx2GFNI_9x2_end: + RET + // func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 @@ -23696,6 +48543,157 @@ mulGFNI_9x2_64Xor_loop: mulGFNI_9x2_64Xor_end: RET +// func mulAvx2GFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x2Xor(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvx2GFNI_9x2Xor_loop: + // Load 2 outputs + VMOVDQU (R14), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 
+ VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x2Xor_end: + RET + // func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x3_64(SB), $0-88 @@ -23767,13 +48765,371 @@ TEXT ·mulGFNI_9x3_64(SB), $0-88 ADDQ R15, R11 ADDQ R15, CX -mulGFNI_9x3_64_loop: +mulGFNI_9x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, 
Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64_loop + VZEROUPPER + +mulGFNI_9x3_64_end: + RET + +// func mulAvx2GFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x3(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvx2GFNI_9x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_9x3_loop + VZEROUPPER + +mulAvx2GFNI_9x3_end: + RET + +// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R12), Z29 + // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + VGF2P8AFFINEQB $0x00, Z0, 
Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z30 @@ -23865,193 +49221,193 @@ mulGFNI_9x3_64_loop: // Prepare for next loop DECQ AX - JNZ mulGFNI_9x3_64_loop + JNZ mulGFNI_9x3_64Xor_loop VZEROUPPER -mulGFNI_9x3_64_end: +mulGFNI_9x3_64Xor_end: RET -// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 - // Loading all tables to registers +// func mulAvx2GFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x3Xor(SB), $8-88 + // Loading 11 of 27 tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_9x3_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - VBROADCASTF32X2 80(CX), Z10 - VBROADCASTF32X2 88(CX), Z11 - VBROADCASTF32X2 96(CX), Z12 - VBROADCASTF32X2 104(CX), Z13 - VBROADCASTF32X2 112(CX), Z14 - VBROADCASTF32X2 120(CX), Z15 - VBROADCASTF32X2 128(CX), Z16 - VBROADCASTF32X2 136(CX), Z17 - VBROADCASTF32X2 144(CX), Z18 - VBROADCASTF32X2 152(CX), Z19 - VBROADCASTF32X2 160(CX), Z20 - VBROADCASTF32X2 168(CX), Z21 - VBROADCASTF32X2 176(CX), Z22 - VBROADCASTF32X2 184(CX), Z23 - VBROADCASTF32X2 192(CX), Z24 - VBROADCASTF32X2 200(CX), Z25 - VBROADCASTF32X2 208(CX), Z26 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), R9 - MOVQ 144(CX), R10 - MOVQ 168(CX), R11 - MOVQ 192(CX), CX - MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R12 - MOVQ start+72(FP), R15 + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP // Add start offset to output - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R12 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 // Add start offset to input - ADDQ R15, DX - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, CX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX -mulGFNI_9x3_64Xor_loop: +mulAvx2GFNI_9x3Xor_loop: // Load 3 outputs - VMOVDQU64 (R13), Z27 - VMOVDQU64 (R14), Z28 - VMOVDQU64 (R12), Z29 - - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU64 (DX), Z30 - ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, 
Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU64 (BX), Z30 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU64 (SI), Z30 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU64 (DI), Z30 - ADDQ $0x40, DI - VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU64 (R8), Z30 - ADDQ $0x40, R8 - VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU64 (R9), Z30 - ADDQ $0x40, R9 - VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU64 (R10), Z30 - ADDQ $0x40, R10 - VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 7 to 3 outputs - VMOVDQU64 (R11), Z30 - ADDQ $0x40, R11 - VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 8 to 3 outputs - VMOVDQU64 (CX), Z30 - ADDQ $0x40, CX - VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 - VXORPD Z29, Z31, Z29 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU64 Z27, (R13) - ADDQ $0x40, R13 - VMOVDQU64 Z28, (R14) - ADDQ $0x40, R14 - VMOVDQU64 Z29, (R12) - ADDQ $0x40, R12 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 // Prepare for next loop DECQ AX - JNZ mulGFNI_9x3_64Xor_loop + JNZ mulAvx2GFNI_9x3Xor_loop VZEROUPPER -mulGFNI_9x3_64Xor_end: +mulAvx2GFNI_9x3Xor_end: RET // func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -24253,6 +49609,215 @@ mulGFNI_9x4_64_loop: mulGFNI_9x4_64_end: RET +// func mulAvx2GFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x4(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, 
BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 64(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_9x4_loop + VZEROUPPER + +mulAvx2GFNI_9x4_end: + RET + // func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 @@ -24462,6 +50027,225 @@ mulGFNI_9x4_64Xor_loop: mulGFNI_9x4_64Xor_end: RET +// func mulAvx2GFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x4Xor(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_9x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, 
Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_9x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x4Xor_end: + RET + // func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x5_64(SB), $0-88 @@ -24666,6 +50450,230 @@ mulGFNI_9x5_64_loop: mulGFNI_9x5_64_end: RET +// func mulAvx2GFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x5(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), 
Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x5_loop + VZEROUPPER + +mulAvx2GFNI_9x5_end: + RET + // func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x5_64Xor(SB), $0-88 @@ -24887,6 +50895,247 @@ mulGFNI_9x5_64Xor_loop: mulGFNI_9x5_64Xor_end: RET +// func mulAvx2GFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x5Xor(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x5Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, 
Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x5Xor_end: + RET + // func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x6_64(SB), $0-88 @@ -25109,6 +51358,258 @@ mulGFNI_9x6_64_loop: mulGFNI_9x6_64_end: RET +// func mulAvx2GFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x6(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x6_loop + VZEROUPPER + +mulAvx2GFNI_9x6_end: + RET + // func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 @@ -25351,6 +51852,278 @@ mulGFNI_9x6_64Xor_loop: mulGFNI_9x6_64Xor_end: RET +// func mulAvx2GFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x6Xor(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x6Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x6Xor_end: + RET + // func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x7_64(SB), $0-88 @@ -25591,6 +52364,286 @@ mulGFNI_9x7_64_loop: mulGFNI_9x7_64_end: RET +// func mulAvx2GFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x7(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, 
Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x7_loop + VZEROUPPER + +mulAvx2GFNI_9x7_end: + RET + // func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 @@ -25854,6 +52907,309 @@ mulGFNI_9x7_64Xor_loop: mulGFNI_9x7_64Xor_end: RET +// func mulAvx2GFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x7Xor(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x7Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 120(R13), R15 + VMOVDQU 
(R15)(R14*1), Y12 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x7Xor_end: + RET + // func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x8_64(SB), $0-88 @@ -26112,6 +53468,314 @@ mulGFNI_9x8_64_loop: mulGFNI_9x8_64_end: RET +// func mulAvx2GFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x8(SB), $0-88 + // Loading 6 of 
72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x8_loop + VZEROUPPER + +mulAvx2GFNI_9x8_end: + RET + // func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 @@ -26396,6 +54060,340 @@ mulGFNI_9x8_64Xor_loop: mulGFNI_9x8_64Xor_end: RET +// func mulAvx2GFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x8Xor(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x8Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, 
Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x8Xor_end: + RET + // func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x9_64(SB), $0-88 @@ -26672,6 +54670,342 @@ mulGFNI_9x9_64_loop: mulGFNI_9x9_64_end: RET +// func mulAvx2GFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x9(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used 
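+	// Descriptive note on the loop setup below: AX is computed as n >> 5, the
+	// number of 32-byte blocks to process, since each iteration consumes one YMM
+	// register (32 bytes) from every input shard and advances all offsets by 0x20.
+	// Only 5 of the 81 matrix tables stay resident in YMM registers (Y0-Y4); the
+	// remaining tables are re-broadcast from the matrix base pointer in CX inside
+	// the loop, and the 9 output pointers are fetched from the out_base slice on
+	// each store because they do not fit in general-purpose registers.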
+ MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 
+ VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x9_loop + VZEROUPPER + +mulAvx2GFNI_9x9_end: + RET + // func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 @@ -26977,6 +55311,371 @@ mulGFNI_9x9_64Xor_loop: mulGFNI_9x9_64Xor_end: RET +// func mulAvx2GFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x9Xor(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x9Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + 
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + 
VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x9Xor_end: + RET + // func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x10_64(SB), $0-88 @@ -27271,6 +55970,370 @@ mulGFNI_9x10_64_loop: mulGFNI_9x10_64_end: RET +// func mulAvx2GFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x10(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, 
Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + 
MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x10_loop + VZEROUPPER + +mulAvx2GFNI_9x10_end: + RET + // func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 @@ -27597,6 +56660,402 @@ mulGFNI_9x10_64Xor_loop: mulGFNI_9x10_64Xor_end: RET +// func mulAvx2GFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_9x10Xor(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_9x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvx2GFNI_9x10Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y4 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 
8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvx2GFNI_9x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_9x10Xor_end: + RET + // func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x1_64(SB), $0-88 @@ -27721,6 +57180,130 @@ mulGFNI_10x1_64_loop: mulGFNI_10x1_64_end: RET +// func mulAvx2GFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvx2GFNI_10x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, 
R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_10x1_loop + VZEROUPPER + +mulAvx2GFNI_10x1_end: + RET + // func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 @@ -27849,6 +57432,134 @@ mulGFNI_10x1_64Xor_loop: mulGFNI_10x1_64Xor_end: RET +// func mulAvx2GFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvx2GFNI_10x1Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y10 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + 
DECQ AX + JNZ mulAvx2GFNI_10x1Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x1Xor_end: + RET + // func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x2_64(SB), $0-88 @@ -28006,6 +57717,163 @@ mulGFNI_10x2_64_loop: mulGFNI_10x2_64_end: RET +// func mulAvx2GFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x2(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvx2GFNI_10x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_10x2_loop + VZEROUPPER + +mulAvx2GFNI_10x2_end: + RET + // func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 @@ -28169,6 +58037,169 @@ mulGFNI_10x2_64Xor_loop: mulGFNI_10x2_64Xor_end: RET +// func mulAvx2GFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x2Xor(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvx2GFNI_10x2Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y12 + VMOVDQU (R14), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 
bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvx2GFNI_10x2Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x2Xor_end: + RET + // func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x3_64(SB), $8-88 @@ -28246,13 +58277,406 @@ TEXT ·mulGFNI_10x3_64(SB), $8-88 MOVQ n+80(FP), BP SHRQ $0x06, BP -mulGFNI_10x3_64_loop: +mulGFNI_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), 
Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64_loop + VZEROUPPER + +mulGFNI_10x3_64_end: + RET + +// func mulAvx2GFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x3(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvx2GFNI_10x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU 
(R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvx2GFNI_10x3_loop + VZEROUPPER + +mulAvx2GFNI_10x3_end: + RET + +// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 
208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R13), Z29 + // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z30 @@ -28354,67 +58778,51 @@ mulGFNI_10x3_64_loop: // Prepare for next loop DECQ BP - JNZ mulGFNI_10x3_64_loop + JNZ mulGFNI_10x3_64Xor_loop VZEROUPPER -mulGFNI_10x3_64_end: +mulGFNI_10x3_64Xor_end: RET -// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 - // Loading 27 of 30 tables to registers +// func mulAvx2GFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x3Xor(SB), $8-88 + // Loading 11 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_10x3_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - VBROADCASTF32X2 80(CX), Z10 - VBROADCASTF32X2 88(CX), Z11 - VBROADCASTF32X2 96(CX), Z12 - VBROADCASTF32X2 104(CX), Z13 - VBROADCASTF32X2 112(CX), Z14 - VBROADCASTF32X2 120(CX), Z15 - VBROADCASTF32X2 128(CX), Z16 - VBROADCASTF32X2 136(CX), Z17 - VBROADCASTF32X2 144(CX), Z18 - VBROADCASTF32X2 152(CX), Z19 - VBROADCASTF32X2 160(CX), Z20 - VBROADCASTF32X2 168(CX), Z21 - VBROADCASTF32X2 176(CX), Z22 - VBROADCASTF32X2 184(CX), Z23 - VBROADCASTF32X2 192(CX), Z24 - VBROADCASTF32X2 200(CX), Z25 - VBROADCASTF32X2 208(CX), Z26 - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 
64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 @@ -28435,128 +58843,147 @@ TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 // Reload length to save a register MOVQ n+80(FP), BP - SHRQ $0x06, BP + SHRQ $0x05, BP -mulGFNI_10x3_64Xor_loop: +mulAvx2GFNI_10x3Xor_loop: // Load 3 outputs - VMOVDQU64 (R14), Z27 - VMOVDQU64 (R15), Z28 - VMOVDQU64 (R13), Z29 - - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU64 (DX), Z30 - ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU64 (BX), Z30 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU64 (SI), Z30 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU64 (DI), Z30 - ADDQ $0x40, DI - VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU64 (R8), Z30 - ADDQ $0x40, R8 - VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU64 (R9), Z30 - ADDQ $0x40, R9 - VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU64 (R10), Z30 - ADDQ $0x40, R10 - VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 7 to 3 outputs - VMOVDQU64 (R11), Z30 - ADDQ $0x40, R11 - VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 8 to 3 outputs - VMOVDQU64 (R12), Z30 - ADDQ $0x40, R12 - VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 9 to 3 outputs - VMOVDQU64 (AX), Z30 - ADDQ $0x40, AX - VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, 
Z31 - VXORPD Z29, Z31, Z29 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU64 Z27, (R14) - ADDQ $0x40, R14 - VMOVDQU64 Z28, (R15) - ADDQ $0x40, R15 - VMOVDQU64 Z29, (R13) - ADDQ $0x40, R13 + VMOVDQU Y11, (R14) + 
ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 // Prepare for next loop DECQ BP - JNZ mulGFNI_10x3_64Xor_loop + JNZ mulAvx2GFNI_10x3Xor_loop VZEROUPPER -mulGFNI_10x3_64Xor_end: +mulAvx2GFNI_10x3Xor_end: RET // func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -28759,6 +59186,220 @@ mulGFNI_10x4_64_loop: mulGFNI_10x4_64_end: RET +// func mulAvx2GFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x4(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x4_loop + VZEROUPPER + +mulAvx2GFNI_10x4_end: + RET + // func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 @@ -28973,6 +59614,234 @@ mulGFNI_10x4_64Xor_loop: mulGFNI_10x4_64Xor_end: RET +// func mulAvx2GFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x4Xor(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ 
out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x4Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 
56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x4Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x4Xor_end: + RET + // func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x5_64(SB), $8-88 @@ -29193,6 +60062,251 @@ mulGFNI_10x5_64_loop: mulGFNI_10x5_64_end: RET +// func mulAvx2GFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x5(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 
bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x5_loop + VZEROUPPER + +mulAvx2GFNI_10x5_end: + RET + // func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 @@ -29430,6 +60544,268 @@ mulGFNI_10x5_64Xor_loop: mulGFNI_10x5_64Xor_end: RET +// func mulAvx2GFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x5Xor(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x5Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from 
input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x5Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x5Xor_end: + RET + // func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x6_64(SB), $8-88 @@ -29670,6 +61046,282 @@ mulGFNI_10x6_64_loop: mulGFNI_10x6_64_end: RET +// func mulAvx2GFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x6(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 
64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x6_loop + VZEROUPPER + +mulAvx2GFNI_10x6_end: + RET + // func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 @@ -29930,6 +61582,302 @@ mulGFNI_10x6_64Xor_loop: mulGFNI_10x6_64Xor_end: RET +// func mulAvx2GFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x6Xor(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x6Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, 
Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x6Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x6Xor_end: + RET + // func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x7_64(SB), $8-88 @@ -30190,6 +62138,313 @@ mulGFNI_10x7_64_loop: mulGFNI_10x7_64_end: RET +// func mulAvx2GFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x7(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ 
start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 
72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x7_loop + VZEROUPPER + +mulAvx2GFNI_10x7_end: + RET + // func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 @@ -30473,6 +62728,336 @@ mulGFNI_10x7_64Xor_loop: mulGFNI_10x7_64Xor_end: RET +// func mulAvx2GFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x7Xor(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x7Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x7Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x7Xor_end: + RET + // func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x8_64(SB), $8-88 @@ -30753,6 +63338,344 @@ mulGFNI_10x8_64_loop: mulGFNI_10x8_64_end: RET +// func mulAvx2GFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x8(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, 
BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 
to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x8_loop + VZEROUPPER + +mulAvx2GFNI_10x8_end: + RET + // func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 @@ -31059,6 +63982,370 @@ mulGFNI_10x8_64Xor_loop: mulGFNI_10x8_64Xor_end: RET +// func mulAvx2GFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x8Xor(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x8Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 
168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x8Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x8Xor_end: + RET + // func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x9_64(SB), $8-88 @@ -31359,6 +64646,375 @@ mulGFNI_10x9_64_loop: mulGFNI_10x9_64_end: RET +// func mulAvx2GFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x9(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, 
Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x9_loop + VZEROUPPER + +mulAvx2GFNI_10x9_end: + RET + // func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 @@ -31688,6 +65344,404 @@ mulGFNI_10x9_64Xor_loop: mulGFNI_10x9_64Xor_end: RET +// func 
mulAvx2GFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x9Xor(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x9Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, 
Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x9Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x9Xor_end: + RET + // func 
mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x10_64(SB), $8-88 @@ -32008,6 +66062,406 @@ mulGFNI_10x10_64_loop: mulGFNI_10x10_64_end: RET +// func mulAvx2GFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x10(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD (CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD (CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD (CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD (CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD (CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD (CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, 
(BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x10_loop + VZEROUPPER + +mulAvx2GFNI_10x10_end: + RET + // func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 @@ -32360,6 +66814,438 @@ mulGFNI_10x10_64Xor_loop: mulGFNI_10x10_64Xor_end: RET +// func mulAvx2GFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvx2GFNI_10x10Xor(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvx2GFNI_10x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvx2GFNI_10x10Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y4 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD (CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 
+ VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 8(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 16(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 24(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 40(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvx2GFNI_10x10Xor_loop + VZEROUPPER + +mulAvx2GFNI_10x10Xor_end: + RET + // func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56 diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go index 28c50658..2829969b 100644 --- a/galois_gen_switch_amd64.go +++ b/galois_gen_switch_amd64.go @@ -14,687 +14,686 @@ const ( maxAvx2Inputs = 10 maxAvx2Outputs = 10 minAvx2Size = 64 - avxSizeMask = maxInt - (minAvx2Size - 1) ) func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := stop - start switch len(in) { case 1: switch len(out) { case 1: mulAvxTwo_1x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_1x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_1x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_1x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_1x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_1x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_1x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_1x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_1x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_1x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 2: switch len(out) { case 1: mulAvxTwo_2x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_2x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_2x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_2x4(matrix, in, out, 
start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_2x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_2x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_2x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_2x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_2x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_2x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 3: switch len(out) { case 1: mulAvxTwo_3x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_3x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_3x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_3x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_3x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_3x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_3x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_3x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_3x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_3x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 4: switch len(out) { case 1: mulAvxTwo_4x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_4x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_4x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_4x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_4x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_4x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_4x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_4x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_4x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_4x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 5: switch len(out) { case 1: mulAvxTwo_5x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_5x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_5x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_5x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_5x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_5x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_5x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_5x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_5x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_5x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 6: switch len(out) { case 1: mulAvxTwo_6x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_6x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_6x3_64(matrix, in, out, 
start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_6x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_6x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_6x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_6x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_6x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_6x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_6x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 7: switch len(out) { case 1: mulAvxTwo_7x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_7x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_7x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_7x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_7x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_7x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_7x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_7x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_7x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_7x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 8: switch len(out) { case 1: mulAvxTwo_8x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_8x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_8x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_8x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_8x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_8x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_8x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_8x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_8x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_8x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 9: switch len(out) { case 1: mulAvxTwo_9x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_9x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_9x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_9x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_9x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_9x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_9x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_9x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_9x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_9x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 10: switch len(out) { case 1: mulAvxTwo_10x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_10x2_64(matrix, in, out, 
start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_10x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_10x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_10x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_10x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_10x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_10x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_10x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_10x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) switch len(in) { case 1: switch len(out) { case 1: mulAvxTwo_1x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_1x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_1x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_1x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_1x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_1x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_1x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_1x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_1x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_1x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 2: switch len(out) { case 1: mulAvxTwo_2x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_2x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_2x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_2x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_2x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_2x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_2x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_2x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_2x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_2x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 3: switch len(out) { case 1: mulAvxTwo_3x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_3x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_3x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_3x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_3x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_3x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_3x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: 
mulAvxTwo_3x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_3x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_3x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 4: switch len(out) { case 1: mulAvxTwo_4x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_4x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_4x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_4x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_4x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_4x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_4x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_4x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_4x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_4x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 5: switch len(out) { case 1: mulAvxTwo_5x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_5x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_5x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_5x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_5x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_5x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_5x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_5x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_5x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_5x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 6: switch len(out) { case 1: mulAvxTwo_6x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_6x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_6x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_6x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_6x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_6x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_6x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_6x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_6x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_6x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 7: switch len(out) { case 1: mulAvxTwo_7x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_7x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_7x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_7x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_7x5Xor(matrix, in, out, start, n) - return n + return n & 
(maxInt - 31) case 6: mulAvxTwo_7x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_7x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_7x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_7x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_7x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 8: switch len(out) { case 1: mulAvxTwo_8x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_8x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_8x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_8x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_8x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_8x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_8x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_8x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_8x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_8x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 9: switch len(out) { case 1: mulAvxTwo_9x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_9x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_9x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_9x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_9x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_9x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_9x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_9x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_9x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_9x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 10: switch len(out) { case 1: mulAvxTwo_10x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_10x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_10x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_10x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_10x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_10x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_10x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_10x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_10x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_10x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) switch len(in) { case 1: @@ 
-1032,7 +1031,7 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { } func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) switch len(in) { case 1: @@ -1368,3 +1367,679 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } + +func galMulSlicesAvx2GFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvx2GFNI_1x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_1x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_1x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_1x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_1x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_1x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_1x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_1x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_1x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_1x10(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvx2GFNI_2x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_2x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_2x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_2x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_2x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_2x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_2x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_2x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_2x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_2x10(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvx2GFNI_3x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_3x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_3x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_3x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_3x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_3x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_3x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_3x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_3x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_3x10(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvx2GFNI_4x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_4x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_4x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_4x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_4x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_4x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_4x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_4x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_4x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_4x10(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvx2GFNI_5x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_5x2(matrix, in, out, start, n) + 
return n + case 3: + mulAvx2GFNI_5x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_5x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_5x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_5x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_5x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_5x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_5x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_5x10(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvx2GFNI_6x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_6x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_6x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_6x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_6x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_6x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_6x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_6x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_6x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_6x10(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvx2GFNI_7x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_7x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_7x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_7x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_7x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_7x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_7x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_7x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_7x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_7x10(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvx2GFNI_8x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_8x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_8x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_8x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_8x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_8x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_8x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_8x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_8x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_8x10(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvx2GFNI_9x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_9x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_9x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_9x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_9x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_9x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_9x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_9x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_9x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_9x10(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvx2GFNI_10x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_10x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_10x3(matrix, in, out, start, n) + 
return n + case 4: + mulAvx2GFNI_10x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_10x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_10x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_10x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_10x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_10x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_10x10(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvx2GFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvx2GFNI_1x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_1x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_1x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvx2GFNI_2x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_2x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_2x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_2x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_2x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvx2GFNI_3x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_3x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_3x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_3x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvx2GFNI_4x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_4x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_4x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_4x9Xor(matrix, in, out, start, n) + return n + case 10: + 
mulAvx2GFNI_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvx2GFNI_5x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_5x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_5x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_5x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvx2GFNI_6x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_6x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_6x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvx2GFNI_7x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_7x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_7x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_7x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_7x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvx2GFNI_8x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_8x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_8x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_8x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvx2GFNI_9x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_9x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_9x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_9x8Xor(matrix, in, out, start, n) + return n + case 
9: + mulAvx2GFNI_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvx2GFNI_10x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_10x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_10x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_10x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/galois_gen_switch_nopshufb_amd64.go b/galois_gen_switch_nopshufb_amd64.go index 888df307..d9b7026d 100644 --- a/galois_gen_switch_nopshufb_amd64.go +++ b/galois_gen_switch_nopshufb_amd64.go @@ -14,14 +14,13 @@ const ( maxAvx2Inputs = 10 maxAvx2Outputs = 10 minAvx2Size = 64 - avxSizeMask = maxInt - (minAvx2Size - 1) ) func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) switch len(in) { case 1: @@ -359,7 +358,7 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { } func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) switch len(in) { case 1: @@ -695,3 +694,679 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } + +func galMulSlicesAvx2GFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvx2GFNI_1x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_1x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_1x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_1x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_1x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_1x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_1x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_1x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_1x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_1x10(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvx2GFNI_2x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_2x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_2x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_2x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_2x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_2x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_2x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_2x8(matrix, in, out, start, n) + return n + case 9: + 
mulAvx2GFNI_2x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_2x10(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvx2GFNI_3x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_3x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_3x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_3x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_3x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_3x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_3x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_3x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_3x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_3x10(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvx2GFNI_4x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_4x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_4x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_4x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_4x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_4x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_4x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_4x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_4x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_4x10(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvx2GFNI_5x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_5x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_5x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_5x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_5x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_5x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_5x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_5x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_5x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_5x10(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvx2GFNI_6x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_6x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_6x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_6x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_6x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_6x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_6x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_6x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_6x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_6x10(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvx2GFNI_7x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_7x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_7x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_7x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_7x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_7x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_7x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_7x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_7x9(matrix, in, out, start, n) + return n + case 10: + 
mulAvx2GFNI_7x10(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvx2GFNI_8x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_8x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_8x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_8x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_8x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_8x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_8x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_8x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_8x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_8x10(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvx2GFNI_9x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_9x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_9x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_9x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_9x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_9x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_9x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_9x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_9x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_9x10(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvx2GFNI_10x1(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_10x2(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_10x3(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_10x4(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_10x5(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_10x6(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_10x7(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_10x8(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_10x9(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_10x10(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvx2GFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvx2GFNI_1x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_1x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_1x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvx2GFNI_2x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_2x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_2x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_2x6Xor(matrix, in, out, start, n) + return n 
+ case 7: + mulAvx2GFNI_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_2x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvx2GFNI_3x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_3x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_3x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_3x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvx2GFNI_4x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_4x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_4x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_4x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvx2GFNI_5x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_5x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_5x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_5x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvx2GFNI_6x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_6x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_6x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvx2GFNI_7x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_7x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_7x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_7x5Xor(matrix, in, out, start, n) + 
return n + case 6: + mulAvx2GFNI_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_7x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvx2GFNI_8x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_8x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_8x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_8x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvx2GFNI_9x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_9x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_9x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_9x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvx2GFNI_10x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvx2GFNI_10x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvx2GFNI_10x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvx2GFNI_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvx2GFNI_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvx2GFNI_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvx2GFNI_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvx2GFNI_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvx2GFNI_10x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvx2GFNI_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/options.go b/options.go index f74fe00f..2a2e5204 100644 --- a/options.go +++ b/options.go @@ -2,6 +2,7 @@ package reedsolomon import ( "runtime" + "strings" "github.com/klauspost/cpuid/v2" ) @@ -15,15 +16,21 @@ type options struct { shardSize int perRound int - useGFNI, useAVX512, useAVX2, useSSSE3, useSSE2 bool - useJerasureMatrix bool - usePAR1Matrix bool - useCauchy bool - fastOneParity bool - inversionCache bool - forcedInversionCache bool - customMatrix [][]byte - withLeopard leopardMode + useAvx2GNFI, + useAvx512GFNI, + useAVX512, + useAVX2, + useSSSE3, + useSSE2 bool + + useJerasureMatrix bool + usePAR1Matrix bool + useCauchy bool + fastOneParity bool + inversionCache bool + forcedInversionCache bool + customMatrix [][]byte + withLeopard leopardMode // stream options concReads bool @@ -38,11 +45,12 @@ var defaultOptions = options{ inversionCache: true, // Detect CPU capabilities. 
-	useSSSE3: cpuid.CPU.Supports(cpuid.SSSE3),
-	useSSE2: cpuid.CPU.Supports(cpuid.SSE2),
-	useAVX2: cpuid.CPU.Supports(cpuid.AVX2),
-	useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL),
-	useGFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ),
+	useSSSE3: cpuid.CPU.Supports(cpuid.SSSE3),
+	useSSE2: cpuid.CPU.Supports(cpuid.SSE2),
+	useAVX2: cpuid.CPU.Supports(cpuid.AVX2),
+	useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL),
+	useAvx512GFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ),
+	useAvx2GNFI: cpuid.CPU.Supports(cpuid.AVX2, cpuid.GFNI),
 }
 
 // leopardMode controls the use of leopard GF in encoding and decoding.
@@ -163,6 +171,9 @@ func WithSSSE3(enabled bool) Option {
 func WithAVX2(enabled bool) Option {
 	return func(o *options) {
 		o.useAVX2 = enabled
+		if o.useAvx2GNFI {
+			o.useAvx2GNFI = enabled
+		}
 	}
 }
 
@@ -178,7 +189,7 @@ func WithSSE2(enabled bool) Option {
 func WithAVX512(enabled bool) Option {
 	return func(o *options) {
 		o.useAVX512 = enabled
-		o.useGFNI = enabled
+		o.useAvx512GFNI = enabled
 	}
 }
 
@@ -186,7 +197,15 @@ func WithAVX512(enabled bool) Option {
 // If not set, GFNI will be turned on or off automatically based on CPU ID information.
 func WithGFNI(enabled bool) Option {
 	return func(o *options) {
-		o.useGFNI = enabled
+		o.useAvx512GFNI = enabled
+	}
+}
+
+// WithAVX2GFNI allows to enable/disable GFNI with AVX2 instructions.
+// If not set, GFNI will be turned on or off automatically based on CPU ID information.
+func WithAVX2GFNI(enabled bool) Option {
+	return func(o *options) {
+		o.useAvx2GNFI = enabled
 	}
 }
 
@@ -275,3 +294,26 @@ func WithLeopardGF(enabled bool) Option {
 		}
 	}
 }
+
+func (o *options) cpuOptions() string {
+	var res []string
+	if o.useSSE2 {
+		res = append(res, "SSE2")
+	}
+	if o.useAVX2 {
+		res = append(res, "AVX2")
+	}
+	if o.useSSSE3 {
+		res = append(res, "SSSE3")
+	}
+	if o.useAVX512 {
+		res = append(res, "AVX512")
+	}
+	if o.useAvx512GFNI {
+		res = append(res, "AVX512+GFNI")
+	}
+	if o.useAvx2GNFI {
+		res = append(res, "AVX2+GFNI")
+	}
+	return strings.Join(res, ",")
+}
diff --git a/reedsolomon.go b/reedsolomon.go
index 55b56650..81dcd4d8 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -653,12 +653,12 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro
 		return ErrShardSize
 	}
 
-	if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useGFNI) {
+	if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useAvx512GFNI || r.o.useAvx2GNFI) {
 		m := make([][]byte, r.parityShards)
 		for iRow := range m {
 			m[iRow] = r.parity[iRow][idx : idx+1]
 		}
-		if r.o.useGFNI {
+		if r.o.useAvx512GFNI || r.o.useAvx2GNFI {
 			r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false)
 		} else {
 			r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false)
@@ -810,7 +810,7 @@ func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
 }
 
 func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool {
-	return avx2CodeGen && r.o.useGFNI &&
+	return avx2CodeGen && (r.o.useAvx512GFNI || r.o.useAvx2GNFI) &&
 		byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
 		inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
 }
@@ -841,7 +841,11 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 	if r.canGFNI(byteCount, len(inputs), len(outputs)) {
 		var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64
 		m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:])
-		start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount)
+		if r.o.useAvx512GFNI {
+			start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount)
+		} else {
+			start += galMulSlicesAvx2GFNI(m, inputs, outputs, 0, byteCount)
+		}
 		end = len(inputs[0])
 	} else if r.canAVX2C(byteCount, len(inputs), len(outputs)) {
 		m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
@@ -867,22 +871,28 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 			if len(outPer) > maxAvx2Outputs {
 				outPer = outPer[:maxAvx2Outputs]
 			}
-			if r.o.useGFNI {
+			if r.o.useAvx512GFNI {
 				m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
 				if inIdx == 0 {
-					galMulSlicesGFNI(m, inPer, outPer, 0, byteCount)
+					start = galMulSlicesGFNI(m, inPer, outPer, 0, byteCount)
 				} else {
-					galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount)
+					start = galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount)
+				}
+			} else if r.o.useAvx2GNFI {
+				m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
+				if inIdx == 0 {
+					start = galMulSlicesAvx2GFNI(m, inPer, outPer, 0, byteCount)
+				} else {
+					start = galMulSlicesAvx2GFNIXor(m, inPer, outPer, 0, byteCount)
 				}
 			} else {
 				m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
 				if inIdx == 0 {
-					galMulSlicesAvx2(m, inPer, outPer, 0, byteCount)
+					start = galMulSlicesAvx2(m, inPer, outPer, 0, byteCount)
 				} else {
-					galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount)
+					start = galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount)
 				}
 			}
-			start = byteCount & avxSizeMask
 			outIdx += len(outPer)
 			outs = outs[len(outPer):]
 		}
@@ -928,7 +938,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 	} else if useAvx2 {
 		avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
 		defer r.putTmpSlice(avx2Matrix)
-	} else if r.o.useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
+	} else if (r.o.useAvx512GFNI || r.o.useAvx2GNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
 		r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
 		// It appears there is a switchover point at around 10MB where
 		// Regular processing is faster...
@@ -950,7 +960,11 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 	exec := func(start, stop int) {
 		if stop-start >= 64 {
 			if useGFNI {
-				start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop)
+				if r.o.useAvx512GFNI {
+					start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop)
+				} else {
+					start += galMulSlicesAvx2GFNI(gfniMatrix, inputs, outputs, start, stop)
+				}
 			} else if useAvx2 {
 				start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
 			}
@@ -1099,14 +1113,15 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 		for lstart < stop {
 			if lstop-lstart >= minAvx2Size {
 				// Execute plan...
+				var n int
 				for _, p := range plan {
 					if p.first {
-						galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop)
+						n = galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop)
 					} else {
-						galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop)
+						n = galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop)
 					}
 				}
-				lstart += (lstop - lstart) & avxSizeMask
+				lstart += n
 				if lstart == lstop {
 					lstop += r.o.perRound
 					if lstop > stop {
@@ -1248,14 +1263,25 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 		for lstart < stop {
 			if lstop-lstart >= minAvx2Size {
 				// Execute plan...
-				for _, p := range plan {
-					if p.first {
-						galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop)
-					} else {
-						galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop)
+				var n int
+				if r.o.useAvx512GFNI {
+					for _, p := range plan {
+						if p.first {
+							n = galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop)
+						} else {
+							n = galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop)
+						}
+					}
+				} else {
+					for _, p := range plan {
+						if p.first {
+							n = galMulSlicesAvx2GFNI(p.m, p.input, p.output, lstart, lstop)
+						} else {
+							n = galMulSlicesAvx2GFNIXor(p.m, p.input, p.output, lstart, lstop)
+						}
 					}
 				}
-				lstart += (lstop - lstart) & avxSizeMask
+				lstart += n
 				if lstart == lstop {
 					lstop += r.o.perRound
 					if lstop > stop {
diff --git a/reedsolomon_test.go b/reedsolomon_test.go
index a1745a35..625fb977 100644
--- a/reedsolomon_test.go
+++ b/reedsolomon_test.go
@@ -24,10 +24,17 @@ var noSSE2 = flag.Bool("no-sse2", !defaultOptions.useSSE2, "Disable SSE2")
 var noSSSE3 = flag.Bool("no-ssse3", !defaultOptions.useSSSE3, "Disable SSSE3")
 var noAVX2 = flag.Bool("no-avx2", !defaultOptions.useAVX2, "Disable AVX2")
 var noAVX512 = flag.Bool("no-avx512", !defaultOptions.useAVX512, "Disable AVX512")
-var noGNFI = flag.Bool("no-gfni", !defaultOptions.useGFNI, "Disable AVX512+GFNI")
+var noGNFI = flag.Bool("no-gfni", !defaultOptions.useAvx512GFNI, "Disable AVX512+GFNI")
+var noAVX2GNFI = flag.Bool("no-avx2-gfni", !defaultOptions.useAvx2GNFI, "Disable AVX2+GFNI")
 
 func TestMain(m *testing.M) {
 	flag.Parse()
+	rs, _ := New(10, 3, testOptions()...)
+	if rs != nil {
+		if rst, ok := rs.(*reedSolomon); ok {
+			fmt.Println("Using", rst.o.cpuOptions())
+		}
+	}
 	os.Exit(m.Run())
 }
 
@@ -48,6 +55,9 @@ func testOptions(o ...Option) []Option {
 	if *noGNFI {
 		o = append(o, WithGFNI(false))
 	}
+	if *noAVX2GNFI {
+		o = append(o, WithAVX2GFNI(false))
+	}
 	return o
 }
 
@@ -204,7 +214,7 @@ func testOpts() [][]Option {
 		n = append(n, WithAVX512(true))
 		opts = append(opts, n)
 	}
-	if defaultOptions.useGFNI {
+	if defaultOptions.useAvx512GFNI {
 		n := make([]Option, len(o), len(o)+1)
 		copy(n, o)
 		n = append(n, WithGFNI(false))
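
Note on the size masks used throughout the new switch code: every galMulSlicesAvx2GFNI/galMulSlicesAvx2GFNIXor wrapper starts with n := (stop - start) & (maxInt - (32 - 1)), rounding the processed range down to a multiple of 32 bytes (one YMM register), while the AVX512 GFNI wrappers keep the 64-byte mask; the returned n lets the caller finish the unaligned tail with generic code. The standalone Go sketch below (not part of the patch; alignDown32 is an illustrative helper name) only demonstrates that masking arithmetic:

package main

import "fmt"

const maxInt = int(^uint(0) >> 1)

// alignDown32 mirrors n := (stop - start) & (maxInt - (32 - 1)) from the
// wrappers above: clearing the low 5 bits rounds a non-negative length
// down to a multiple of 32.
func alignDown32(start, stop int) int {
	return (stop - start) & (maxInt - (32 - 1))
}

func main() {
	for _, stop := range []int{31, 32, 100, 4113} {
		n := alignDown32(0, stop)
		fmt.Printf("range %4d -> SIMD bytes %4d, tail %2d\n", stop, n, stop-n)
	}
}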
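
The options.go changes split the old useGFNI flag into useAvx512GFNI (AVX512F+GFNI+AVX512DQ) and useAvx2GNFI (AVX2+GFNI), so GFNI-capable CPUs without AVX512 can still use the 256-bit kernels. A minimal standalone probe of the same cpuid/v2 feature combinations (the printed strings are illustrative only, not library output):

package main

import (
	"fmt"

	"github.com/klauspost/cpuid/v2"
)

func main() {
	// Same feature checks as the defaultOptions detection in the patch.
	avx512GFNI := cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ)
	avx2GFNI := cpuid.CPU.Supports(cpuid.AVX2, cpuid.GFNI)

	switch {
	case avx512GFNI:
		fmt.Println("would take the galMulSlicesGFNI (512-bit) path")
	case avx2GFNI:
		fmt.Println("would take the galMulSlicesAvx2GFNI (256-bit) path")
	default:
		fmt.Println("no GFNI kernels; AVX2/SSSE3 or generic fallback")
	}
}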