From d22fb73ab620f3aeb81c3855a6bd39e7b93be182 Mon Sep 17 00:00:00 2001
From: Frank Wessels
Date: Tue, 11 Jun 2024 14:08:53 -0700
Subject: [PATCH] Add code to generate the SVE and NEON routines for ARM

---
 _gen/gen-arm-neon.go |  415 ++++++++++++
 _gen/gen-arm-sve.go  |  293 ++++++++
 _gen/gen.go          |   24 +-
 _gen/go.mod          |    4 +-
 _gen/go.sum          |    4 +
 galois_gen_amd64.s   | 1512 ++++++++++++++++++++++--------------------
 galois_gen_arm64.go  |    3 +
 galois_gen_arm64.s   |  175 +++--
 8 files changed, 1641 insertions(+), 789 deletions(-)
 create mode 100644 _gen/gen-arm-neon.go
 create mode 100644 _gen/gen-arm-sve.go

diff --git a/_gen/gen-arm-neon.go b/_gen/gen-arm-neon.go
new file mode 100644
index 00000000..00fa7817
--- /dev/null
+++ b/_gen/gen-arm-neon.go
@@ -0,0 +1,415 @@
+// Copyright 2024, Klaus Post/Minio Inc. See LICENSE for details.
+
+package main
+
+import (
+    "bytes"
+    "fmt"
+    "log"
+    "os"
+    "regexp"
+    "strconv"
+    "strings"
+)
+
+func convert2Neon(asmBuf *bytes.Buffer, lines []string) {
+
+    asmF := func(format string, args ...interface{}) {
+        (*asmBuf).WriteString(fmt.Sprintf(format, args...))
+    }
+
+    reAddrMode := regexp.MustCompile(`\[(.*?)\]`) // regexp to match content between square brackets
+
+    getZregister := func(reg string) int {
+        if reg[0] == 'z' {
+            reg = strings.NewReplacer(",", "", ".d", "", ".b", "", "[0]", "").Replace(reg[1:])
+            num, err := strconv.Atoi(reg)
+            if err != nil {
+                panic(err)
+            }
+            return num
+        }
+        return -1
+    }
+
+    getXregister := func(reg string) int {
+        if reg[0] == 'x' {
+            reg = strings.ReplaceAll(reg, ",", "")
+            num, err := strconv.Atoi(reg[1:])
+            if err != nil {
+                panic(err)
+            }
+            return num
+        }
+        return -1
+    }
+
+    getHashImm := func(imm string) int {
+        if imm[0] == '#' {
+            num, err := strconv.Atoi(imm[1:])
+            if err != nil {
+                panic(err)
+            }
+            return num
+        } else {
+            panic("bad immediate")
+        }
+    }
+
+    parseAddrModeMulVl := func(addrMode string) string {
+        addrMode = strings.NewReplacer("[", "", "]", "").Replace(addrMode)
+        f := strings.Fields(addrMode)
+        xn, offset := getXregister(f[0]), ""
+        if len(f) > 1 {
+            if len(f) == 4 && f[2] == "MUL" && f[3] == "VL" {
+                num, err := strconv.Atoi(strings.NewReplacer("#", "", ",", "").Replace(f[1]))
+                if err != nil {
+                    panic(err)
+                }
+                offset = fmt.Sprintf("%d", num*32)
+            } else {
+                panic("bad addressing mode")
+            }
+        }
+
+        return fmt.Sprintf("%s(R%d)", offset, xn)
+    }
+
+    parseAddrModeIndexed := func(addrMode string) (string, string) {
+        addrMode = strings.NewReplacer("[", "", "]", "").Replace(addrMode)
+        f := strings.Fields(addrMode)
+        regbase := getXregister(f[0])
+        regshifted := getXregister(f[1])
+        shift := getHashImm(f[3])
+        return fmt.Sprintf("(R%d)", regbase), fmt.Sprintf("R%d<<%d", regshifted, shift)
+    }
+
+    for _, line := range lines {
+        if strings.Contains(line, " // ldr z") {
+            if matches := reAddrMode.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+                line = strings.ReplaceAll(line, matches[0][0], "ADDRMODE")
+                f := strings.Fields(line)
+                zd := getZregister(f[len(f)-2])
+                var am string
+                { // HACK: ignore offset since we're fixing it at 32
+                    baseReg := strings.Split(matches[0][0], ",")[0]
+                    am = parseAddrModeMulVl(baseReg)
+                }
+                asmF("    VLD1.P 32%s, [V%d.B16, V%d.B16]\n", am, zd*2, zd*2+1)
+            } else {
+                panic("bad 'ldr' instruction")
+            }
+        } else if strings.Contains(line, " // str z") {
+            if matches := reAddrMode.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+                line = strings.ReplaceAll(line, matches[0][0], "ADDRMODE")
+                f := strings.Fields(line)
+                zd := getZregister(f[len(f)-2])
+                var am string
+                { // HACK: ignore offset since we're fixing it at 32
+                    baseReg := strings.Split(matches[0][0], ",")[0]
+                    am = parseAddrModeMulVl(baseReg)
+                }
+                asmF("    VST1.P [V%d.D2, V%d.D2], 32%s\n", zd*2, zd*2+1, am)
+            } else {
+                panic("bad 'str' instruction")
+            }
+        } else if strings.Contains(line, " // ld1d { z") {
+            if matches := reAddrMode.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+                line = strings.ReplaceAll(line, matches[0][0], "ADDRMODE")
+                f := strings.Fields(line)
+                zd := getZregister(f[5])
+                base, shifted := parseAddrModeIndexed(matches[0][0])
+                asmF("    ADD %s, %s\n", shifted, strings.NewReplacer("(", "", ")", "").Replace(base))
+                asmF("    VLD1 %s, [V%d.B16, V%d.B16]\n", base, zd*2, zd*2+1)
+            } else {
+                panic("bad 'ld1d' instruction")
+            }
+        } else if strings.Contains(line, " // st1d { z") {
+            if matches := reAddrMode.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+                line = strings.ReplaceAll(line, matches[0][0], "ADDRMODE")
+                f := strings.Fields(line)
+                zd := getZregister(f[5])
+                base, shifted := parseAddrModeIndexed(matches[0][0])
+                asmF("    ADD %s, %s\n", shifted, strings.NewReplacer("(", "", ")", "").Replace(base))
+                asmF("    VST1 [V%d.D2, V%d.D2], %s\n", zd*2, zd*2+1, base)
+            } else {
+                panic("bad 'st1d' instruction")
+            }
+        } else if strings.Contains(line, " // lsr z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-3])
+            zn := getZregister(f[len(f)-2])
+            imm := getHashImm(f[len(f)-1])
+            asmF("    VUSHR $%d, V%d.B16, V%d.B16\n", imm, zn*2, zd*2)
+            asmF("    VUSHR $%d, V%d.B16, V%d.B16\n", imm, zn*2+1, zd*2+1)
+        } else if strings.Contains(line, " // and z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-3])
+            zn := getZregister(f[len(f)-2])
+            zn2 := getZregister(f[len(f)-1])
+            asmF("    VAND V%d.B16, V%d.B16, V%d.B16\n", zn2*2, zn*2, zd*2)
+            asmF("    VAND V%d.B16, V%d.B16, V%d.B16\n", zn2*2 /*+1*/, zn*2+1, zd*2+1)
+        } else if strings.Contains(line, " // tbl z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-3])
+            zn := getZregister(f[len(f)-2])
+            zn2 := getZregister(f[len(f)-1])
+            asmF("    VTBL V%d.B16, [V%d.B16], V%d.B16\n", zn2*2, zn*2, zd*2)
+            asmF("    VTBL V%d.B16, [V%d.B16], V%d.B16\n", zn2*2+1, zn*2+1, zd*2+1)
+        } else if strings.Contains(line, " // eor z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-3])
+            zn := getZregister(f[len(f)-2])
+            zn2 := getZregister(f[len(f)-1])
+            asmF("    VEOR V%d.B16, V%d.B16, V%d.B16\n", zn2*2, zn*2, zd*2)
+            asmF("    VEOR V%d.B16, V%d.B16, V%d.B16\n", zn2*2+1, zn*2+1, zd*2+1)
+        } else if strings.Contains(line, " // mov z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-2])
+            xn := getXregister(f[len(f)-1])
+            asmF("    VMOV R%d, V%d.B[0]\n", xn, zd*2)
+        } else if strings.Contains(line, " // dup z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-2])
+            zn := getZregister(f[len(f)-1])
+            asmF("    VDUP V%d.B[0], V%d.B16\n", zn*2, zd*2)
+        } else if strings.Contains(line, " // add x") {
+            f := strings.Fields(line)
+            xd := getXregister(f[len(f)-3])
+            if xd != getXregister(f[len(f)-2]) {
+                panic("registers don't match")
+            }
+            if f[len(f)-1][0] == '#' {
+                imm := getHashImm(f[len(f)-1])
+                asmF("    ADD $%d, R%d\n", imm, xd)
+            } else {
+                xn := getXregister(f[len(f)-1])
+                asmF("    ADD R%d, R%d\n", xn, xd)
+            }
+        } else if strings.Contains(line, " // subs x") {
+            f := strings.Fields(line)
+            xd := getXregister(f[len(f)-3])
+            if xd != getXregister(f[len(f)-2]) {
+                panic("registers don't match")
+            }
+            imm := getHashImm(f[len(f)-1])
+            asmF("    SUBS $%d, R%d\n", imm, xd)
+        } else if strings.Contains(line, " // lsr x") {
+            f :=
strings.Fields(line) + xd := getXregister(f[len(f)-3]) + if xd != getXregister(f[len(f)-2]) { + panic("registers don't match") + } + imm := getHashImm(f[len(f)-1]) + asmF(" LSR $%d, R%d\n", imm, xd) + } else if strings.Contains(line, " // tst x") { + f := strings.Fields(line) + xd := getXregister(f[len(f)-2]) + xn := getXregister(f[len(f)-1]) + asmF(" TST R%d, R%d\n", xn, xd) + } else if strings.Contains(line, " // mov x") { + f := strings.Fields(line) + xd := getXregister(f[len(f)-2]) + imm := getHashImm(f[len(f)-1]) + asmF(" MOVD $%d, R%d\n", imm, xd) + } else if strings.HasSuffix(line, ":") || + strings.HasPrefix(line, " BEQ") || + strings.HasPrefix(line, " BNE") || + strings.HasPrefix(line, "TEXT ·mulSve") || + strings.HasPrefix(line, "// func mulSve") { + line = strings.ReplaceAll(line, "Sve", "Neon") + asmF("%s\n", line) + } else if strings.Contains(line, "Requires: SVE") { + line = strings.ReplaceAll(line, "SVE", "NEON") + asmF("%s\n", line) + } else if strings.Contains(line, " // ptrue p") { + // intentionally drop line + } else if strings.HasPrefix(line, " // ") || + strings.HasPrefix(line, " MOVD ") || + strings.HasPrefix(line, " CMP ") || + strings.HasPrefix(line, " RET") || + len(line) == 0 { + asmF("%s\n", line) + } else { + panic(fmt.Sprintf("convert2Neon unsupported: `%s`", line)) + } + } +} + +func fixPostIncrementNeon(asmBuf *bytes.Buffer, lines []string) { + + asmF := func(format string, args ...interface{}) { + (*asmBuf).WriteString(fmt.Sprintf(format, args...)) + } + + const MATRIX_BASE = "matrix_base" + + skipResetMatrixBase := false + { + routine := strings.Join(lines, "\n") + reFramePtr := regexp.MustCompile(`MOVD\s*` + MATRIX_BASE + `\+\d*(\(FP\),\s*R\d*)`) + + if matches := reFramePtr.FindAllStringSubmatch(routine, -1); len(matches) == 1 { + framePtrToDest := matches[0][1] + + // check if we're loading into register + // more than once from the stack frame + // (meaning we overwrite the 'matrix_base' value) + escaped := strings.NewReplacer("(", `\(`, ")", `\)`).Replace(framePtrToDest) + reSameDest := regexp.MustCompile(`MOVD\s*\w*\+\d*` + escaped) + if m := reSameDest.FindAllStringSubmatch(routine, -1); len(m) == 2 { + skipResetMatrixBase = true + } + } + } + + isXor := false + { + routine := strings.Join(lines, "\n") + isXor = strings.Count(routine, "Xor(SB)") > 0 + } + + resetMatrixBaseAtStartOfLoop := "" + for i := 0; i < len(lines); i++ { + + if !skipResetMatrixBase { + // + // Since we are loading with post-increment, + // reset register holding matrix array at + // start of each loop + // + if strings.Contains(lines[i], MATRIX_BASE) { + resetMatrixBaseAtStartOfLoop = lines[i] + continue + } else if strings.HasSuffix(lines[i], "_loop:") { + asmF("%s\n", lines[i]) + asmF("%s\n", resetMatrixBaseAtStartOfLoop) + resetMatrixBaseAtStartOfLoop = "" + continue + } + } + + // + // Remove the explicit ADDition of the + // pointer to the shard (since we are already + // using post-increments for the loads/stores) + // + if i < len(lines)-1 && + strings.Contains(lines[i], "32(R") && + strings.Contains(lines[i+1], "ADD") && strings.Contains(lines[i+1], "$32, R") { + + storing := strings.Contains(lines[i], "VST1.P") + if storing && isXor { + // move post-increment into a "pre-decrement" to offset + // post-increment for loading of existing content in case of Xor-case + asmF("%s\n", strings.ReplaceAll(lines[i+1], "ADD", "SUB")) + asmF("%s\n", lines[i]) + } else { + asmF("%s\n", lines[i]) + // intentionally skip line with ADD + } + i += 1 + continue + } + if i < len(lines)-2 
&& + strings.Contains(lines[i], "32(R") && + strings.Contains(lines[i+1], "32(R") && + strings.Contains(lines[i+2], "ADD") && strings.Contains(lines[i+2], "$64, R") { + + storing := strings.Contains(lines[i], "VST1.P") && strings.Contains(lines[i+1], "VST1.P") + if storing && isXor { + // move post-increment into a "pre-decrement" to offset + // post-increment for loading of existing content in case of Xor-case + asmF("%s\n", strings.ReplaceAll(lines[i+2], "ADD", "SUB")) + asmF("%s\n", lines[i]) + asmF("%s\n", lines[i+1]) + } else { + asmF("%s\n", lines[i]) + asmF("%s\n", lines[i+1]) + // intentionally skip line with ADD + } + i += 2 + continue + } + + asmF("%s\n", lines[i]) + } +} + +func genArmNeon() { + const SVE_CODE = "../galois_gen_arm64.s" + + asmOut, goOut := &bytes.Buffer{}, &bytes.Buffer{} + + if asmSve, err := os.ReadFile(SVE_CODE); err != nil { + log.Fatalf("Failed to read %s: %v", SVE_CODE, err) + } else { + // start with SVE code + asmOut.WriteString(string(asmSve)) + } + if goSve, err := os.ReadFile(strings.ReplaceAll(SVE_CODE, ".s", ".go")); err != nil { + log.Fatalf("Failed to read %s: %v", SVE_CODE, err) + } else { + goOut.WriteString(string(goSve)) + } + + const input = 10 + + // Processing 64 bytes variants + for output := 1; output <= 3; output++ { + for op := ""; len(op) <= 3; op += "Xor" { + templName := fmt.Sprintf("mulSve_%dx%d_64%s", input, output, op) + funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", strings.ReplaceAll(templName, "Sve", "Neon")) + + lines, err := extractRoutine(SVE_CODE, fmt.Sprintf("TEXT ·%s(SB)", templName)) + if err != nil { + log.Fatal(err) + } + + // prepend output with commented out function definition and comment + asmOut.WriteString(fmt.Sprintf("// %s\n", funcDef)) + asmOut.WriteString("// Requires: NEON\n") + + { + asmTemp := &bytes.Buffer{} + convert2Neon(asmTemp, lines) + fixPostIncrementNeon(asmOut, strings.Split(string(asmTemp.Bytes()), "\n")) + } + + // golang declaration + goOut.WriteString(fmt.Sprintf("//go:noescape\n%s\n\n", funcDef)) + } + } + + // Processing 32 bytes variants + for output := 4; output <= 10; output++ { + for op := ""; len(op) <= 3; op += "Xor" { + templName := fmt.Sprintf("mulSve_%dx%d%s", input, output, op) + funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", strings.ReplaceAll(templName, "Sve", "Neon")) + + lines, err := extractRoutine(SVE_CODE, fmt.Sprintf("TEXT ·%s(SB)", templName)) + if err != nil { + log.Fatal(err) + } + + // prepend output with commented out function definition and comment + asmOut.WriteString(fmt.Sprintf("// %s\n", funcDef)) + asmOut.WriteString("// Requires: NEON\n") + + { + asmTemp := &bytes.Buffer{} + convert2Neon(asmTemp, lines) + fixPostIncrementNeon(asmOut, strings.Split(string(asmTemp.Bytes()), "\n")) + } + + // golang declaration + goOut.WriteString(fmt.Sprintf("//go:noescape\n%s\n\n", funcDef)) + } + } + if err := os.WriteFile("../galois_gen_arm64.s", asmOut.Bytes(), 0644); err != nil { + log.Fatal(err) + } + if err := os.WriteFile("../galois_gen_arm64.go", goOut.Bytes(), 0644); err != nil { + log.Fatal(err) + } +} diff --git a/_gen/gen-arm-sve.go b/_gen/gen-arm-sve.go new file mode 100644 index 00000000..1c1e2217 --- /dev/null +++ b/_gen/gen-arm-sve.go @@ -0,0 +1,293 @@ +// Copyright 2024, Klaus Post/Minio Inc. See LICENSE for details. 
+ +package main + +import ( + "bufio" + "bytes" + "fmt" + "log" + "os" + "regexp" + "strconv" + "strings" + + avxtwo2sve "github.com/fwessels/avxTwo2sve" + sve_as "github.com/fwessels/sve-as" +) + +func patchLabel(line string) string { + return strings.ReplaceAll(line, "AvxTwo", "Sve") +} + +func extractRoutine(filename, routine string) (lines []string, err error) { + file, err := os.Open(filename) + if err != nil { + return + } + defer file.Close() + + // Create a scanner to read the file line by line + scanner := bufio.NewScanner(file) + + // Iterate over each line + collect := false + for scanner.Scan() { + line := scanner.Text() + if strings.HasPrefix(line, routine) { + collect = true + } + if collect { + lines = append(lines, line) + } + if collect && strings.HasSuffix(line, "RET") { + collect = false + } + } + + // Check for any errors that occurred during scanning + err = scanner.Err() + return +} + +func addArmInitializations(instructions []string) (processed []string) { + for _, instr := range instructions { + processed = append(processed, instr) + if strings.HasPrefix(instr, "TEXT ·") { + sve := "ptrue p0.d" + opcode, err := sve_as.Assemble(sve) + if err != nil { + processed = append(processed, fmt.Sprintf(" WORD $0x00000000 // %-44s\n", sve)) + } else { + processed = append(processed, fmt.Sprintf(" WORD $0x%08x // %-44s\n", opcode, sve)) + } + } + } + return +} + +// Expand #defines +func expandHashDefines(instructions []string) (processed []string) { + for _, instr := range instructions { + if strings.Contains(instr, "XOR3WAY") { + f := strings.Fields(instr) + if len(f) >= 3 { + dst := strings.ReplaceAll(f[len(f)-1], ")", "") + b := strings.ReplaceAll(f[len(f)-2], ",", "") + a := strings.ReplaceAll(f[len(f)-3], ",", "") + + processed = append(processed, fmt.Sprintf("VPXOR %s, %s, %s", a, dst, dst)) + processed = append(processed, fmt.Sprintf("VPXOR %s, %s, %s", b, dst, dst)) + } else { + log.Fatalf("Not enough arguments for 'XOR3WAY' macro: %d", len(f)) + } + } else if !strings.Contains(instr, "VZEROUPPER") { + processed = append(processed, instr) + } + } + return +} + +func convertRoutine(asmBuf *bytes.Buffer, instructions []string) { + + asmF := func(format string, args ...interface{}) { + (*asmBuf).WriteString(fmt.Sprintf(format, args...)) + } + + wordOpcode := regexp.MustCompile(`WORD \$0x[0-9a-f]{8}`) + + for _, instr := range instructions { + instr = strings.TrimSpace(instr) + if instr == "" { + asmF("\n") + } else if strings.HasPrefix(instr, "TEXT ") { // function header + asmF("%s\n", patchLabel(instr)) + } else if wordOpcode.MatchString(instr) { // arm code + asmF(" %s\n", instr) + } else if strings.HasPrefix(instr, "//") { // comment + asmF(" %s\n", instr) + } else if strings.HasSuffix(instr, ":") { // label + asmF("%s\n", patchLabel(instr)) + } else { + sve, plan9, err := avxtwo2sve.AvxTwo2Sve(instr, patchLabel) + if err != nil { + panic(err) + } else if !plan9 { + opcode, err := sve_as.Assemble(sve) + if err != nil { + asmF(" WORD $0x00000000 // %-44s\n", sve) + } else { + asmF(" WORD $0x%08x // %-44s\n", opcode, sve) + } + } else { + asmF(" %s\n", sve) + } + } + } +} + +func fromAvx2ToSve() { + asmOut, goOut := &bytes.Buffer{}, &bytes.Buffer{} + + goOut.WriteString(`// Code generated by command: go generate ` + os.Getenv("GOFILE") + `. 
DO NOT EDIT.` + "\n\n") + goOut.WriteString("//go:build !noasm && !appengine && !gccgo && !nopshufb\n\n") + goOut.WriteString("package reedsolomon\n\n") + + const input = 10 + const AVX2_CODE = "../galois_gen_amd64.s" + + // Processing 64 bytes variants + for output := 1; output <= 3; output++ { + for op := ""; len(op) <= 3; op += "Xor" { + templName := fmt.Sprintf("mulAvxTwo_%dx%d_64%s", input, output, op) + funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", strings.ReplaceAll(templName, "AvxTwo", "Sve")) + + // asm first + lines, err := extractRoutine(AVX2_CODE, fmt.Sprintf("TEXT ·%s(SB)", templName)) + if err != nil { + log.Fatal(err) + } + lines = expandHashDefines(lines) + + convertRoutine(asmOut, lines) + + // add newline after RET + asmOut.WriteString("\n") + + // golang declaration + goOut.WriteString(fmt.Sprintf("//go:noescape\n%s\n\n", funcDef)) + } + } + + // Processing 32 bytes variants + for output := 4; output <= 10; output++ { + for op := ""; len(op) <= 3; op += "Xor" { + templName := fmt.Sprintf("mulAvxTwo_%dx%d%s", input, output, op) + funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", strings.ReplaceAll(templName, "AvxTwo", "Sve")) + + // asm first + lines, err := extractRoutine(AVX2_CODE, fmt.Sprintf("TEXT ·%s(SB)", templName)) + if err != nil { + log.Fatal(err) + } + lines = expandHashDefines(lines) + + // add additional initialization for SVE + // (for predicated loads and stores in + // case of register shortage) + lines = addArmInitializations(lines) + + convertRoutine(asmOut, lines) + + // add newline after RET + asmOut.WriteString("\n") + + // golang declaration + goOut.WriteString(fmt.Sprintf("//go:noescape\n%s\n\n", funcDef)) + } + } + + if err := os.WriteFile("../galois_gen_arm64.s", asmOut.Bytes(), 0644); err != nil { + log.Fatal(err) + } + if err := os.WriteFile("../galois_gen_arm64.go", goOut.Bytes(), 0644); err != nil { + log.Fatal(err) + } +} + +func insertEarlyExit(lines []string, funcName string, outputs int) (processed []string) { + + const reg = "R16" + label := funcName + "_store" + + reComment := regexp.MustCompile(fmt.Sprintf(`// Load and process \d* bytes from input (\d*) to %d outputs`, outputs)) + reLoop := regexp.MustCompile(`^` + strings.ReplaceAll(label, "store", "loop") + `:`) + reStore := regexp.MustCompile(fmt.Sprintf(`// Store %d outputs`, outputs)) + + for _, line := range lines { + if matches := reLoop.FindAllStringSubmatch(line, -1); len(matches) == 1 { + lastline := processed[len(processed)-1] + processed = processed[:len(processed)-1] + processed = append(processed, "") + processed = append(processed, fmt.Sprintf(" // Load number of input shards")) + processed = append(processed, fmt.Sprintf(" MOVD in_len+32(FP), %s", reg)) + processed = append(processed, lastline) + } + + if matches := reComment.FindAllStringSubmatch(line, -1); len(matches) == 1 { + if inputs, err := strconv.Atoi(matches[0][1]); err != nil { + panic(err) + } else { + if inputs > 0 && inputs < 10 { + lastline := processed[len(processed)-1] + processed = processed[:len(processed)-1] + processed = append(processed, fmt.Sprintf(" // Check for early termination")) + processed = append(processed, fmt.Sprintf(" CMP $%d, %s", inputs, reg)) + processed = append(processed, fmt.Sprintf(" BEQ %s", label)) + processed = append(processed, lastline) + } + } + } + + if matches := reStore.FindAllStringSubmatch(line, -1); len(matches) == 1 { + processed = append(processed, fmt.Sprintf("%s:", label)) + 
} + + processed = append(processed, line) + } + return +} + +func addEarlyExit(arch string) { + const filename = "../galois_gen_arm64.s" + asmOut := &bytes.Buffer{} + + asmOut.WriteString(`// Code generated by command: go generate ` + os.Getenv("GOFILE") + `. DO NOT EDIT.` + "\n\n") + asmOut.WriteString("//go:build !appengine && !noasm && !nogen && !nopshufb && gc\n\n") + asmOut.WriteString(`#include "textflag.h"` + "\n\n") + + input := 10 + for outputs := 1; outputs <= 3; outputs++ { + for op := ""; len(op) <= 3; op += "Xor" { + funcName := fmt.Sprintf("mul%s_%dx%d_64%s", arch, input, outputs, op) + funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", funcName) + + lines, _ := extractRoutine(filename, fmt.Sprintf("TEXT ·%s(SB)", funcName)) + + // prepend output with commented out function definition and comment + asmOut.WriteString(fmt.Sprintf("// %s\n", funcDef)) + asmOut.WriteString("// Requires: SVE\n") + + lines = insertEarlyExit(lines, funcName, outputs) + + asmOut.WriteString(strings.Join(lines, "\n")) + asmOut.WriteString("\n\n") + } + } + + for outputs := 4; outputs <= 10; outputs++ { + for op := ""; len(op) <= 3; op += "Xor" { + funcName := fmt.Sprintf("mul%s_%dx%d%s", arch, input, outputs, op) + funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", funcName) + + lines, _ := extractRoutine(filename, fmt.Sprintf("TEXT ·%s(SB)", funcName)) + + // prepend output with commented out function definition and comment + asmOut.WriteString(fmt.Sprintf("// %s\n", funcDef)) + asmOut.WriteString("// Requires: SVE\n") + + lines = insertEarlyExit(lines, funcName, outputs) + asmOut.WriteString(strings.Join(lines, "\n")) + asmOut.WriteString("\n\n") + } + } + + if err := os.WriteFile("../galois_gen_arm64.s", asmOut.Bytes(), 0644); err != nil { + log.Fatal(err) + } +} + +func genArmSve() { + fromAvx2ToSve() + addEarlyExit("Sve") +} diff --git a/_gen/gen.go b/_gen/gen.go index f42cdec4..c1b92b8d 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -89,6 +89,11 @@ func main() { genSwitch() genGF16() genGF8() + + if pshufb { + genArmSve() + genArmNeon() + } Generate() } @@ -449,7 +454,10 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { for _, ptr := range inPtrs { ADDQ(offset, ptr) } - // Offset no longer needed unless not regdst + // Offset no longer needed unless not regDst + if !regDst { + SHRQ(U8(3), offset) // divide by 8 since we'll be scaling it up when loading or storing + } tmpMask := GP64() MOVQ(U32(15), tmpMask) @@ -478,9 +486,9 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { } ptr := GP64() MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) - VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) + VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 8}, dst[i]) if prefetchDst > 0 { - PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 8}) } } } @@ -508,9 +516,9 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { } else { ptr := GP64() MOVQ(Mem{Base: outSlicePtr, Disp: j * 24}, ptr) - VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[j]) + VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 8}, dst[j]) if prefetchDst > 0 { - PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 8}) } } } @@ -543,14 +551,14 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { } ptr := GP64() 
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) - VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 1}) + VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 8}) if prefetchDst > 0 && !xor { - PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 8}) } } Comment("Prepare for next loop") if !regDst { - ADDQ(U8(perLoop), offset) + ADDQ(U8(perLoop>>3), offset) } DECQ(length) JNZ(LabelRef(name + "_loop")) diff --git a/_gen/go.mod b/_gen/go.mod index f6d1dca4..d1406bf1 100644 --- a/_gen/go.mod +++ b/_gen/go.mod @@ -1,6 +1,6 @@ module github.com/klauspost/reedsolomon/_gen -go 1.19 +go 1.21.5 require ( github.com/klauspost/asmfmt v1.3.1 @@ -8,6 +8,8 @@ require ( ) require ( + github.com/fwessels/avxTwo2sve v0.0.0-20240611172111-6b8528700471 // indirect + github.com/fwessels/sve-as v0.0.0-20240611015707-daffc010447f // indirect golang.org/x/mod v0.6.0 // indirect golang.org/x/sys v0.1.0 // indirect golang.org/x/tools v0.2.0 // indirect diff --git a/_gen/go.sum b/_gen/go.sum index 5aa25310..4938f100 100644 --- a/_gen/go.sum +++ b/_gen/go.sum @@ -1,3 +1,7 @@ +github.com/fwessels/avxTwo2sve v0.0.0-20240611172111-6b8528700471 h1:omdgAKxePZxbMC7HZPw99QMPeH7fKh3t2QRSZ0YFA/0= +github.com/fwessels/avxTwo2sve v0.0.0-20240611172111-6b8528700471/go.mod h1:9+ibRsEIs0vLXkalKCGEbZfVS4fafeIvMvM9GvIsdeQ= +github.com/fwessels/sve-as v0.0.0-20240611015707-daffc010447f h1:HQud3yIU82LdkQzHEYiSJs73wCHjprIqeZE9JvSjKbQ= +github.com/fwessels/sve-as v0.0.0-20240611015707-daffc010447f/go.mod h1:j3s7EY79XxNMyjx/54Vo6asZafWU4yijB+KIfj4hrh8= github.com/klauspost/asmfmt v1.3.1 h1:7xZi1N7s9gTLbqiM8KUv8TLyysavbTRGBT5/ly0bRtw= github.com/klauspost/asmfmt v1.3.1/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= github.com/mmcloughlin/avo v0.5.1-0.20221128045730-bf1d05562091 h1:C2c8ttOBeyhs1SvyCXVPCFd0EqtPiTKGnMWQ+JkM0Lc= diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index 8ff74bf4..c99d3c49 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -28842,6 +28842,7 @@ TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88 ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX + SHRQ $0x03, R9 MOVQ $0x0000000f, R10 MOVQ R10, X10 VPBROADCASTB X10, Y10 @@ -29077,28 +29078,28 @@ mulAvxTwo_4x10_loop: // Store 10 outputs MOVQ (R8), R10 - VMOVDQU Y0, (R10)(R9*1) + VMOVDQU Y0, (R10)(R9*8) MOVQ 24(R8), R10 - VMOVDQU Y1, (R10)(R9*1) + VMOVDQU Y1, (R10)(R9*8) MOVQ 48(R8), R10 - VMOVDQU Y2, (R10)(R9*1) + VMOVDQU Y2, (R10)(R9*8) MOVQ 72(R8), R10 - VMOVDQU Y3, (R10)(R9*1) + VMOVDQU Y3, (R10)(R9*8) MOVQ 96(R8), R10 - VMOVDQU Y4, (R10)(R9*1) + VMOVDQU Y4, (R10)(R9*8) MOVQ 120(R8), R10 - VMOVDQU Y5, (R10)(R9*1) + VMOVDQU Y5, (R10)(R9*8) MOVQ 144(R8), R10 - VMOVDQU Y6, (R10)(R9*1) + VMOVDQU Y6, (R10)(R9*8) MOVQ 168(R8), R10 - VMOVDQU Y7, (R10)(R9*1) + VMOVDQU Y7, (R10)(R9*8) MOVQ 192(R8), R10 - VMOVDQU Y8, (R10)(R9*1) + VMOVDQU Y8, (R10)(R9*8) MOVQ 216(R8), R10 - VMOVDQU Y9, (R10)(R9*1) + VMOVDQU Y9, (R10)(R9*8) // Prepare for next loop - ADDQ $0x20, R9 + ADDQ $0x04, R9 DECQ AX JNZ mulAvxTwo_4x10_loop VZEROUPPER @@ -29890,6 +29891,7 @@ TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 ADDQ R9, SI ADDQ R9, DI ADDQ R9, DX + SHRQ $0x03, R9 MOVQ $0x0000000f, R10 MOVQ R10, X10 VPBROADCASTB X10, Y10 @@ -29902,70 +29904,70 @@ mulAvxTwo_4x10Xor_loop: VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R8), R10 - VMOVDQU (R10)(R9*1), Y0 + VMOVDQU (R10)(R9*8), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R8), R10 - VMOVDQU 
(R10)(R9*1), Y1 + VMOVDQU (R10)(R9*8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R8), R10 - VMOVDQU (R10)(R9*1), Y2 + VMOVDQU (R10)(R9*8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R8), R10 - VMOVDQU (R10)(R9*1), Y3 + VMOVDQU (R10)(R9*8), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R8), R10 - VMOVDQU (R10)(R9*1), Y4 + VMOVDQU (R10)(R9*8), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R8), R10 - VMOVDQU (R10)(R9*1), Y5 + VMOVDQU (R10)(R9*8), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R8), R10 - VMOVDQU (R10)(R9*1), Y6 + VMOVDQU (R10)(R9*8), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R8), R10 - VMOVDQU (R10)(R9*1), Y7 + VMOVDQU (R10)(R9*8), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R8), R10 - VMOVDQU (R10)(R9*1), Y8 + VMOVDQU (R10)(R9*8), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R8), R10 - VMOVDQU (R10)(R9*1), Y9 + VMOVDQU (R10)(R9*8), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -30145,28 +30147,28 @@ mulAvxTwo_4x10Xor_loop: // Store 10 outputs MOVQ (R8), R10 - VMOVDQU Y0, (R10)(R9*1) + VMOVDQU Y0, (R10)(R9*8) MOVQ 24(R8), R10 - VMOVDQU Y1, (R10)(R9*1) + VMOVDQU Y1, (R10)(R9*8) MOVQ 48(R8), R10 - VMOVDQU Y2, (R10)(R9*1) + VMOVDQU Y2, (R10)(R9*8) MOVQ 72(R8), R10 - VMOVDQU Y3, (R10)(R9*1) + VMOVDQU Y3, (R10)(R9*8) MOVQ 96(R8), R10 - VMOVDQU Y4, (R10)(R9*1) + VMOVDQU Y4, (R10)(R9*8) MOVQ 120(R8), R10 - VMOVDQU Y5, (R10)(R9*1) + VMOVDQU Y5, (R10)(R9*8) MOVQ 144(R8), R10 - VMOVDQU Y6, (R10)(R9*1) + VMOVDQU Y6, (R10)(R9*8) MOVQ 168(R8), R10 - VMOVDQU Y7, (R10)(R9*1) + VMOVDQU Y7, (R10)(R9*8) MOVQ 192(R8), R10 - VMOVDQU Y8, (R10)(R9*1) + VMOVDQU Y8, (R10)(R9*8) MOVQ 216(R8), R10 - VMOVDQU Y9, (R10)(R9*1) + VMOVDQU Y9, (R10)(R9*8) // Prepare for next loop - ADDQ $0x20, R9 + ADDQ $0x04, R9 DECQ AX JNZ mulAvxTwo_4x10Xor_loop VZEROUPPER @@ -38564,6 +38566,7 @@ TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88 ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX + SHRQ $0x03, R10 MOVQ $0x0000000f, R11 MOVQ R11, X9 VPBROADCASTB X9, Y9 @@ -38831,26 +38834,26 @@ mulAvxTwo_5x9_loop: // Store 9 outputs MOVQ (R9), R11 - VMOVDQU Y0, (R11)(R10*1) + VMOVDQU Y0, (R11)(R10*8) MOVQ 24(R9), R11 - VMOVDQU Y1, (R11)(R10*1) + VMOVDQU Y1, (R11)(R10*8) MOVQ 48(R9), R11 - VMOVDQU Y2, (R11)(R10*1) + VMOVDQU Y2, (R11)(R10*8) MOVQ 72(R9), R11 - VMOVDQU Y3, (R11)(R10*1) + VMOVDQU Y3, (R11)(R10*8) MOVQ 96(R9), R11 - VMOVDQU Y4, (R11)(R10*1) + VMOVDQU Y4, (R11)(R10*8) MOVQ 120(R9), R11 - VMOVDQU Y5, (R11)(R10*1) + VMOVDQU Y5, (R11)(R10*8) MOVQ 144(R9), R11 - VMOVDQU Y6, (R11)(R10*1) + VMOVDQU Y6, (R11)(R10*8) MOVQ 168(R9), R11 - VMOVDQU Y7, (R11)(R10*1) + VMOVDQU Y7, (R11)(R10*8) MOVQ 192(R9), R11 - VMOVDQU Y8, (R11)(R10*1) + VMOVDQU Y8, (R11)(R10*8) // Prepare for next loop - ADDQ $0x20, R10 + ADDQ $0x04, R10 DECQ AX JNZ mulAvxTwo_5x9_loop VZEROUPPER @@ -39710,6 +39713,7 @@ TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX + SHRQ 
$0x03, R10 MOVQ $0x0000000f, R11 MOVQ R11, X9 VPBROADCASTB X9, Y9 @@ -39722,63 +39726,63 @@ mulAvxTwo_5x9Xor_loop: VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R9), R11 - VMOVDQU (R11)(R10*1), Y0 + VMOVDQU (R11)(R10*8), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R9), R11 - VMOVDQU (R11)(R10*1), Y1 + VMOVDQU (R11)(R10*8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R9), R11 - VMOVDQU (R11)(R10*1), Y2 + VMOVDQU (R11)(R10*8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R9), R11 - VMOVDQU (R11)(R10*1), Y3 + VMOVDQU (R11)(R10*8), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R9), R11 - VMOVDQU (R11)(R10*1), Y4 + VMOVDQU (R11)(R10*8), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R9), R11 - VMOVDQU (R11)(R10*1), Y5 + VMOVDQU (R11)(R10*8), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R9), R11 - VMOVDQU (R11)(R10*1), Y6 + VMOVDQU (R11)(R10*8), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R9), R11 - VMOVDQU (R11)(R10*1), Y7 + VMOVDQU (R11)(R10*8), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R9), R11 - VMOVDQU (R11)(R10*1), Y8 + VMOVDQU (R11)(R10*8), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 @@ -39995,26 +39999,26 @@ mulAvxTwo_5x9Xor_loop: // Store 9 outputs MOVQ (R9), R11 - VMOVDQU Y0, (R11)(R10*1) + VMOVDQU Y0, (R11)(R10*8) MOVQ 24(R9), R11 - VMOVDQU Y1, (R11)(R10*1) + VMOVDQU Y1, (R11)(R10*8) MOVQ 48(R9), R11 - VMOVDQU Y2, (R11)(R10*1) + VMOVDQU Y2, (R11)(R10*8) MOVQ 72(R9), R11 - VMOVDQU Y3, (R11)(R10*1) + VMOVDQU Y3, (R11)(R10*8) MOVQ 96(R9), R11 - VMOVDQU Y4, (R11)(R10*1) + VMOVDQU Y4, (R11)(R10*8) MOVQ 120(R9), R11 - VMOVDQU Y5, (R11)(R10*1) + VMOVDQU Y5, (R11)(R10*8) MOVQ 144(R9), R11 - VMOVDQU Y6, (R11)(R10*1) + VMOVDQU Y6, (R11)(R10*8) MOVQ 168(R9), R11 - VMOVDQU Y7, (R11)(R10*1) + VMOVDQU Y7, (R11)(R10*8) MOVQ 192(R9), R11 - VMOVDQU Y8, (R11)(R10*1) + VMOVDQU Y8, (R11)(R10*8) // Prepare for next loop - ADDQ $0x20, R10 + ADDQ $0x04, R10 DECQ AX JNZ mulAvxTwo_5x9Xor_loop VZEROUPPER @@ -40048,6 +40052,7 @@ TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX + SHRQ $0x03, R10 MOVQ $0x0000000f, R11 MOVQ R11, X10 VPBROADCASTB X10, Y10 @@ -40340,28 +40345,28 @@ mulAvxTwo_5x10_loop: // Store 10 outputs MOVQ (R9), R11 - VMOVDQU Y0, (R11)(R10*1) + VMOVDQU Y0, (R11)(R10*8) MOVQ 24(R9), R11 - VMOVDQU Y1, (R11)(R10*1) + VMOVDQU Y1, (R11)(R10*8) MOVQ 48(R9), R11 - VMOVDQU Y2, (R11)(R10*1) + VMOVDQU Y2, (R11)(R10*8) MOVQ 72(R9), R11 - VMOVDQU Y3, (R11)(R10*1) + VMOVDQU Y3, (R11)(R10*8) MOVQ 96(R9), R11 - VMOVDQU Y4, (R11)(R10*1) + VMOVDQU Y4, (R11)(R10*8) MOVQ 120(R9), R11 - VMOVDQU Y5, (R11)(R10*1) + VMOVDQU Y5, (R11)(R10*8) MOVQ 144(R9), R11 - VMOVDQU Y6, (R11)(R10*1) + VMOVDQU Y6, (R11)(R10*8) MOVQ 168(R9), R11 - VMOVDQU Y7, (R11)(R10*1) + VMOVDQU Y7, (R11)(R10*8) MOVQ 192(R9), R11 - VMOVDQU Y8, (R11)(R10*1) + VMOVDQU Y8, (R11)(R10*8) MOVQ 216(R9), R11 - VMOVDQU Y9, (R11)(R10*1) + VMOVDQU Y9, (R11)(R10*8) 
// Prepare for next loop - ADDQ $0x20, R10 + ADDQ $0x04, R10 DECQ AX JNZ mulAvxTwo_5x10_loop VZEROUPPER @@ -41279,6 +41284,7 @@ TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 ADDQ R10, DI ADDQ R10, R8 ADDQ R10, DX + SHRQ $0x03, R10 MOVQ $0x0000000f, R11 MOVQ R11, X10 VPBROADCASTB X10, Y10 @@ -41291,70 +41297,70 @@ mulAvxTwo_5x10Xor_loop: VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R9), R11 - VMOVDQU (R11)(R10*1), Y0 + VMOVDQU (R11)(R10*8), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R9), R11 - VMOVDQU (R11)(R10*1), Y1 + VMOVDQU (R11)(R10*8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R9), R11 - VMOVDQU (R11)(R10*1), Y2 + VMOVDQU (R11)(R10*8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R9), R11 - VMOVDQU (R11)(R10*1), Y3 + VMOVDQU (R11)(R10*8), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R9), R11 - VMOVDQU (R11)(R10*1), Y4 + VMOVDQU (R11)(R10*8), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R9), R11 - VMOVDQU (R11)(R10*1), Y5 + VMOVDQU (R11)(R10*8), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R9), R11 - VMOVDQU (R11)(R10*1), Y6 + VMOVDQU (R11)(R10*8), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R9), R11 - VMOVDQU (R11)(R10*1), Y7 + VMOVDQU (R11)(R10*8), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R9), R11 - VMOVDQU (R11)(R10*1), Y8 + VMOVDQU (R11)(R10*8), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R9), R11 - VMOVDQU (R11)(R10*1), Y9 + VMOVDQU (R11)(R10*8), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -41591,28 +41597,28 @@ mulAvxTwo_5x10Xor_loop: // Store 10 outputs MOVQ (R9), R11 - VMOVDQU Y0, (R11)(R10*1) + VMOVDQU Y0, (R11)(R10*8) MOVQ 24(R9), R11 - VMOVDQU Y1, (R11)(R10*1) + VMOVDQU Y1, (R11)(R10*8) MOVQ 48(R9), R11 - VMOVDQU Y2, (R11)(R10*1) + VMOVDQU Y2, (R11)(R10*8) MOVQ 72(R9), R11 - VMOVDQU Y3, (R11)(R10*1) + VMOVDQU Y3, (R11)(R10*8) MOVQ 96(R9), R11 - VMOVDQU Y4, (R11)(R10*1) + VMOVDQU Y4, (R11)(R10*8) MOVQ 120(R9), R11 - VMOVDQU Y5, (R11)(R10*1) + VMOVDQU Y5, (R11)(R10*8) MOVQ 144(R9), R11 - VMOVDQU Y6, (R11)(R10*1) + VMOVDQU Y6, (R11)(R10*8) MOVQ 168(R9), R11 - VMOVDQU Y7, (R11)(R10*1) + VMOVDQU Y7, (R11)(R10*8) MOVQ 192(R9), R11 - VMOVDQU Y8, (R11)(R10*1) + VMOVDQU Y8, (R11)(R10*8) MOVQ 216(R9), R11 - VMOVDQU Y9, (R11)(R10*1) + VMOVDQU Y9, (R11)(R10*8) // Prepare for next loop - ADDQ $0x20, R10 + ADDQ $0x04, R10 DECQ AX JNZ mulAvxTwo_5x10Xor_loop VZEROUPPER @@ -49498,6 +49504,7 @@ TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX + SHRQ $0x03, R11 MOVQ $0x0000000f, R12 MOVQ R12, X8 VPBROADCASTB X8, Y8 @@ -49787,24 +49794,24 @@ mulAvxTwo_6x8_loop: // Store 8 outputs MOVQ (R10), R12 - VMOVDQU Y0, (R12)(R11*1) + VMOVDQU Y0, (R12)(R11*8) MOVQ 24(R10), R12 - VMOVDQU Y1, (R12)(R11*1) + VMOVDQU Y1, (R12)(R11*8) MOVQ 48(R10), R12 - VMOVDQU Y2, (R12)(R11*1) + VMOVDQU Y2, (R12)(R11*8) MOVQ 72(R10), R12 
- VMOVDQU Y3, (R12)(R11*1) + VMOVDQU Y3, (R12)(R11*8) MOVQ 96(R10), R12 - VMOVDQU Y4, (R12)(R11*1) + VMOVDQU Y4, (R12)(R11*8) MOVQ 120(R10), R12 - VMOVDQU Y5, (R12)(R11*1) + VMOVDQU Y5, (R12)(R11*8) MOVQ 144(R10), R12 - VMOVDQU Y6, (R12)(R11*1) + VMOVDQU Y6, (R12)(R11*8) MOVQ 168(R10), R12 - VMOVDQU Y7, (R12)(R11*1) + VMOVDQU Y7, (R12)(R11*8) // Prepare for next loop - ADDQ $0x20, R11 + ADDQ $0x04, R11 DECQ AX JNZ mulAvxTwo_6x8_loop VZEROUPPER @@ -50712,6 +50719,7 @@ TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX + SHRQ $0x03, R11 MOVQ $0x0000000f, R12 MOVQ R12, X8 VPBROADCASTB X8, Y8 @@ -50724,56 +50732,56 @@ mulAvxTwo_6x8Xor_loop: VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R10), R12 - VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (R12)(R11*8), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R10), R12 - VMOVDQU (R12)(R11*1), Y1 + VMOVDQU (R12)(R11*8), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R10), R12 - VMOVDQU (R12)(R11*1), Y2 + VMOVDQU (R12)(R11*8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R10), R12 - VMOVDQU (R12)(R11*1), Y3 + VMOVDQU (R12)(R11*8), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R10), R12 - VMOVDQU (R12)(R11*1), Y4 + VMOVDQU (R12)(R11*8), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R10), R12 - VMOVDQU (R12)(R11*1), Y5 + VMOVDQU (R12)(R11*8), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R10), R12 - VMOVDQU (R12)(R11*1), Y6 + VMOVDQU (R12)(R11*8), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R10), R12 - VMOVDQU (R12)(R11*1), Y7 + VMOVDQU (R12)(R11*8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 @@ -51017,24 +51025,24 @@ mulAvxTwo_6x8Xor_loop: // Store 8 outputs MOVQ (R10), R12 - VMOVDQU Y0, (R12)(R11*1) + VMOVDQU Y0, (R12)(R11*8) MOVQ 24(R10), R12 - VMOVDQU Y1, (R12)(R11*1) + VMOVDQU Y1, (R12)(R11*8) MOVQ 48(R10), R12 - VMOVDQU Y2, (R12)(R11*1) + VMOVDQU Y2, (R12)(R11*8) MOVQ 72(R10), R12 - VMOVDQU Y3, (R12)(R11*1) + VMOVDQU Y3, (R12)(R11*8) MOVQ 96(R10), R12 - VMOVDQU Y4, (R12)(R11*1) + VMOVDQU Y4, (R12)(R11*8) MOVQ 120(R10), R12 - VMOVDQU Y5, (R12)(R11*1) + VMOVDQU Y5, (R12)(R11*8) MOVQ 144(R10), R12 - VMOVDQU Y6, (R12)(R11*1) + VMOVDQU Y6, (R12)(R11*8) MOVQ 168(R10), R12 - VMOVDQU Y7, (R12)(R11*1) + VMOVDQU Y7, (R12)(R11*8) // Prepare for next loop - ADDQ $0x20, R11 + ADDQ $0x04, R11 DECQ AX JNZ mulAvxTwo_6x8Xor_loop VZEROUPPER @@ -51070,6 +51078,7 @@ TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX + SHRQ $0x03, R11 MOVQ $0x0000000f, R12 MOVQ R12, X9 VPBROADCASTB X9, Y9 @@ -51389,26 +51398,26 @@ mulAvxTwo_6x9_loop: // Store 9 outputs MOVQ (R10), R12 - VMOVDQU Y0, (R12)(R11*1) + VMOVDQU Y0, (R12)(R11*8) MOVQ 24(R10), R12 - VMOVDQU Y1, (R12)(R11*1) + VMOVDQU Y1, (R12)(R11*8) MOVQ 48(R10), R12 - VMOVDQU Y2, (R12)(R11*1) + VMOVDQU Y2, (R12)(R11*8) MOVQ 72(R10), R12 - VMOVDQU Y3, (R12)(R11*1) + VMOVDQU Y3, (R12)(R11*8) MOVQ 96(R10), R12 - VMOVDQU Y4, (R12)(R11*1) + VMOVDQU Y4, (R12)(R11*8) MOVQ 120(R10), R12 - VMOVDQU Y5, (R12)(R11*1) + VMOVDQU Y5, (R12)(R11*8) 
MOVQ 144(R10), R12 - VMOVDQU Y6, (R12)(R11*1) + VMOVDQU Y6, (R12)(R11*8) MOVQ 168(R10), R12 - VMOVDQU Y7, (R12)(R11*1) + VMOVDQU Y7, (R12)(R11*8) MOVQ 192(R10), R12 - VMOVDQU Y8, (R12)(R11*1) + VMOVDQU Y8, (R12)(R11*8) // Prepare for next loop - ADDQ $0x20, R11 + ADDQ $0x04, R11 DECQ AX JNZ mulAvxTwo_6x9_loop VZEROUPPER @@ -52384,6 +52393,7 @@ TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX + SHRQ $0x03, R11 MOVQ $0x0000000f, R12 MOVQ R12, X9 VPBROADCASTB X9, Y9 @@ -52396,63 +52406,63 @@ mulAvxTwo_6x9Xor_loop: VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R10), R12 - VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (R12)(R11*8), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R10), R12 - VMOVDQU (R12)(R11*1), Y1 + VMOVDQU (R12)(R11*8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R10), R12 - VMOVDQU (R12)(R11*1), Y2 + VMOVDQU (R12)(R11*8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R10), R12 - VMOVDQU (R12)(R11*1), Y3 + VMOVDQU (R12)(R11*8), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R10), R12 - VMOVDQU (R12)(R11*1), Y4 + VMOVDQU (R12)(R11*8), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R10), R12 - VMOVDQU (R12)(R11*1), Y5 + VMOVDQU (R12)(R11*8), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R10), R12 - VMOVDQU (R12)(R11*1), Y6 + VMOVDQU (R12)(R11*8), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R10), R12 - VMOVDQU (R12)(R11*1), Y7 + VMOVDQU (R12)(R11*8), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R10), R12 - VMOVDQU (R12)(R11*1), Y8 + VMOVDQU (R12)(R11*8), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 @@ -52721,26 +52731,26 @@ mulAvxTwo_6x9Xor_loop: // Store 9 outputs MOVQ (R10), R12 - VMOVDQU Y0, (R12)(R11*1) + VMOVDQU Y0, (R12)(R11*8) MOVQ 24(R10), R12 - VMOVDQU Y1, (R12)(R11*1) + VMOVDQU Y1, (R12)(R11*8) MOVQ 48(R10), R12 - VMOVDQU Y2, (R12)(R11*1) + VMOVDQU Y2, (R12)(R11*8) MOVQ 72(R10), R12 - VMOVDQU Y3, (R12)(R11*1) + VMOVDQU Y3, (R12)(R11*8) MOVQ 96(R10), R12 - VMOVDQU Y4, (R12)(R11*1) + VMOVDQU Y4, (R12)(R11*8) MOVQ 120(R10), R12 - VMOVDQU Y5, (R12)(R11*1) + VMOVDQU Y5, (R12)(R11*8) MOVQ 144(R10), R12 - VMOVDQU Y6, (R12)(R11*1) + VMOVDQU Y6, (R12)(R11*8) MOVQ 168(R10), R12 - VMOVDQU Y7, (R12)(R11*1) + VMOVDQU Y7, (R12)(R11*8) MOVQ 192(R10), R12 - VMOVDQU Y8, (R12)(R11*1) + VMOVDQU Y8, (R12)(R11*8) // Prepare for next loop - ADDQ $0x20, R11 + ADDQ $0x04, R11 DECQ AX JNZ mulAvxTwo_6x9Xor_loop VZEROUPPER @@ -52776,6 +52786,7 @@ TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX + SHRQ $0x03, R11 MOVQ $0x0000000f, R12 MOVQ R12, X10 VPBROADCASTB X10, Y10 @@ -53125,28 +53136,28 @@ mulAvxTwo_6x10_loop: // Store 10 outputs MOVQ (R10), R12 - VMOVDQU Y0, (R12)(R11*1) + VMOVDQU Y0, (R12)(R11*8) MOVQ 24(R10), R12 - VMOVDQU Y1, (R12)(R11*1) + VMOVDQU Y1, (R12)(R11*8) MOVQ 48(R10), R12 - VMOVDQU Y2, (R12)(R11*1) + VMOVDQU Y2, (R12)(R11*8) MOVQ 72(R10), R12 - VMOVDQU Y3, (R12)(R11*1) 
+ VMOVDQU Y3, (R12)(R11*8) MOVQ 96(R10), R12 - VMOVDQU Y4, (R12)(R11*1) + VMOVDQU Y4, (R12)(R11*8) MOVQ 120(R10), R12 - VMOVDQU Y5, (R12)(R11*1) + VMOVDQU Y5, (R12)(R11*8) MOVQ 144(R10), R12 - VMOVDQU Y6, (R12)(R11*1) + VMOVDQU Y6, (R12)(R11*8) MOVQ 168(R10), R12 - VMOVDQU Y7, (R12)(R11*1) + VMOVDQU Y7, (R12)(R11*8) MOVQ 192(R10), R12 - VMOVDQU Y8, (R12)(R11*1) + VMOVDQU Y8, (R12)(R11*8) MOVQ 216(R10), R12 - VMOVDQU Y9, (R12)(R11*1) + VMOVDQU Y9, (R12)(R11*8) // Prepare for next loop - ADDQ $0x20, R11 + ADDQ $0x04, R11 DECQ AX JNZ mulAvxTwo_6x10_loop VZEROUPPER @@ -54190,6 +54201,7 @@ TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX + SHRQ $0x03, R11 MOVQ $0x0000000f, R12 MOVQ R12, X10 VPBROADCASTB X10, Y10 @@ -54202,70 +54214,70 @@ mulAvxTwo_6x10Xor_loop: VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R10), R12 - VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (R12)(R11*8), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R10), R12 - VMOVDQU (R12)(R11*1), Y1 + VMOVDQU (R12)(R11*8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R10), R12 - VMOVDQU (R12)(R11*1), Y2 + VMOVDQU (R12)(R11*8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R10), R12 - VMOVDQU (R12)(R11*1), Y3 + VMOVDQU (R12)(R11*8), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R10), R12 - VMOVDQU (R12)(R11*1), Y4 + VMOVDQU (R12)(R11*8), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R10), R12 - VMOVDQU (R12)(R11*1), Y5 + VMOVDQU (R12)(R11*8), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R10), R12 - VMOVDQU (R12)(R11*1), Y6 + VMOVDQU (R12)(R11*8), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R10), R12 - VMOVDQU (R12)(R11*1), Y7 + VMOVDQU (R12)(R11*8), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R10), R12 - VMOVDQU (R12)(R11*1), Y8 + VMOVDQU (R12)(R11*8), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R10), R12 - VMOVDQU (R12)(R11*1), Y9 + VMOVDQU (R12)(R11*8), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -54559,28 +54571,28 @@ mulAvxTwo_6x10Xor_loop: // Store 10 outputs MOVQ (R10), R12 - VMOVDQU Y0, (R12)(R11*1) + VMOVDQU Y0, (R12)(R11*8) MOVQ 24(R10), R12 - VMOVDQU Y1, (R12)(R11*1) + VMOVDQU Y1, (R12)(R11*8) MOVQ 48(R10), R12 - VMOVDQU Y2, (R12)(R11*1) + VMOVDQU Y2, (R12)(R11*8) MOVQ 72(R10), R12 - VMOVDQU Y3, (R12)(R11*1) + VMOVDQU Y3, (R12)(R11*8) MOVQ 96(R10), R12 - VMOVDQU Y4, (R12)(R11*1) + VMOVDQU Y4, (R12)(R11*8) MOVQ 120(R10), R12 - VMOVDQU Y5, (R12)(R11*1) + VMOVDQU Y5, (R12)(R11*8) MOVQ 144(R10), R12 - VMOVDQU Y6, (R12)(R11*1) + VMOVDQU Y6, (R12)(R11*8) MOVQ 168(R10), R12 - VMOVDQU Y7, (R12)(R11*1) + VMOVDQU Y7, (R12)(R11*8) MOVQ 192(R10), R12 - VMOVDQU Y8, (R12)(R11*1) + VMOVDQU Y8, (R12)(R11*8) MOVQ 216(R10), R12 - VMOVDQU Y9, (R12)(R11*1) + VMOVDQU Y9, (R12)(R11*8) // Prepare for next loop - ADDQ $0x20, R11 + ADDQ $0x04, R11 DECQ AX JNZ mulAvxTwo_6x10Xor_loop 
VZEROUPPER @@ -61709,6 +61721,7 @@ TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX + SHRQ $0x03, R12 MOVQ $0x0000000f, R13 MOVQ R13, X7 VPBROADCASTB X7, Y7 @@ -62010,22 +62023,22 @@ mulAvxTwo_7x7_loop: // Store 7 outputs MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) + VMOVDQU Y0, (R13)(R12*8) MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) + VMOVDQU Y1, (R13)(R12*8) MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) + VMOVDQU Y2, (R13)(R12*8) MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) + VMOVDQU Y3, (R13)(R12*8) MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) + VMOVDQU Y4, (R13)(R12*8) MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) + VMOVDQU Y5, (R13)(R12*8) MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) + VMOVDQU Y6, (R13)(R12*8) // Prepare for next loop - ADDQ $0x20, R12 + ADDQ $0x04, R12 DECQ AX JNZ mulAvxTwo_7x7_loop VZEROUPPER @@ -62961,6 +62974,7 @@ TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX + SHRQ $0x03, R12 MOVQ $0x0000000f, R13 MOVQ R13, X7 VPBROADCASTB X7, Y7 @@ -62973,49 +62987,49 @@ mulAvxTwo_7x7Xor_loop: VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 MOVQ (R11), R13 - VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (R13)(R12*8), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R11), R13 - VMOVDQU (R13)(R12*1), Y1 + VMOVDQU (R13)(R12*8), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R11), R13 - VMOVDQU (R13)(R12*1), Y2 + VMOVDQU (R13)(R12*8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R11), R13 - VMOVDQU (R13)(R12*1), Y3 + VMOVDQU (R13)(R12*8), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R11), R13 - VMOVDQU (R13)(R12*1), Y4 + VMOVDQU (R13)(R12*8), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R11), R13 - VMOVDQU (R13)(R12*1), Y5 + VMOVDQU (R13)(R12*8), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R11), R13 - VMOVDQU (R13)(R12*1), Y6 + VMOVDQU (R13)(R12*8), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 @@ -63276,22 +63290,22 @@ mulAvxTwo_7x7Xor_loop: // Store 7 outputs MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) + VMOVDQU Y0, (R13)(R12*8) MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) + VMOVDQU Y1, (R13)(R12*8) MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) + VMOVDQU Y2, (R13)(R12*8) MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) + VMOVDQU Y3, (R13)(R12*8) MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) + VMOVDQU Y4, (R13)(R12*8) MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) + VMOVDQU Y5, (R13)(R12*8) MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) + VMOVDQU Y6, (R13)(R12*8) // Prepare for next loop - ADDQ $0x20, R12 + ADDQ $0x04, R12 DECQ AX JNZ mulAvxTwo_7x7Xor_loop VZEROUPPER @@ -63329,6 +63343,7 @@ TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX + SHRQ $0x03, R12 MOVQ $0x0000000f, R13 MOVQ R13, X8 VPBROADCASTB X8, Y8 @@ -63665,24 +63680,24 @@ mulAvxTwo_7x8_loop: // Store 8 outputs MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) + VMOVDQU Y0, (R13)(R12*8) MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) + VMOVDQU Y1, (R13)(R12*8) MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) + VMOVDQU Y2, (R13)(R12*8) MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) + VMOVDQU Y3, (R13)(R12*8) MOVQ 96(R11), 
R13 - VMOVDQU Y4, (R13)(R12*1) + VMOVDQU Y4, (R13)(R12*8) MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) + VMOVDQU Y5, (R13)(R12*8) MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) + VMOVDQU Y6, (R13)(R12*8) MOVQ 168(R11), R13 - VMOVDQU Y7, (R13)(R12*1) + VMOVDQU Y7, (R13)(R12*8) // Prepare for next loop - ADDQ $0x20, R12 + ADDQ $0x04, R12 DECQ AX JNZ mulAvxTwo_7x8_loop VZEROUPPER @@ -64696,6 +64711,7 @@ TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX + SHRQ $0x03, R12 MOVQ $0x0000000f, R13 MOVQ R13, X8 VPBROADCASTB X8, Y8 @@ -64708,56 +64724,56 @@ mulAvxTwo_7x8Xor_loop: VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R11), R13 - VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (R13)(R12*8), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R11), R13 - VMOVDQU (R13)(R12*1), Y1 + VMOVDQU (R13)(R12*8), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R11), R13 - VMOVDQU (R13)(R12*1), Y2 + VMOVDQU (R13)(R12*8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R11), R13 - VMOVDQU (R13)(R12*1), Y3 + VMOVDQU (R13)(R12*8), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R11), R13 - VMOVDQU (R13)(R12*1), Y4 + VMOVDQU (R13)(R12*8), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R11), R13 - VMOVDQU (R13)(R12*1), Y5 + VMOVDQU (R13)(R12*8), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R11), R13 - VMOVDQU (R13)(R12*1), Y6 + VMOVDQU (R13)(R12*8), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R11), R13 - VMOVDQU (R13)(R12*1), Y7 + VMOVDQU (R13)(R12*8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 @@ -65048,24 +65064,24 @@ mulAvxTwo_7x8Xor_loop: // Store 8 outputs MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) + VMOVDQU Y0, (R13)(R12*8) MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) + VMOVDQU Y1, (R13)(R12*8) MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) + VMOVDQU Y2, (R13)(R12*8) MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) + VMOVDQU Y3, (R13)(R12*8) MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) + VMOVDQU Y4, (R13)(R12*8) MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) + VMOVDQU Y5, (R13)(R12*8) MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) + VMOVDQU Y6, (R13)(R12*8) MOVQ 168(R11), R13 - VMOVDQU Y7, (R13)(R12*1) + VMOVDQU Y7, (R13)(R12*8) // Prepare for next loop - ADDQ $0x20, R12 + ADDQ $0x04, R12 DECQ AX JNZ mulAvxTwo_7x8Xor_loop VZEROUPPER @@ -65103,6 +65119,7 @@ TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX + SHRQ $0x03, R12 MOVQ $0x0000000f, R13 MOVQ R13, X9 VPBROADCASTB X9, Y9 @@ -65474,26 +65491,26 @@ mulAvxTwo_7x9_loop: // Store 9 outputs MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) + VMOVDQU Y0, (R13)(R12*8) MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) + VMOVDQU Y1, (R13)(R12*8) MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) + VMOVDQU Y2, (R13)(R12*8) MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) + VMOVDQU Y3, (R13)(R12*8) MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) + VMOVDQU Y4, (R13)(R12*8) MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) + VMOVDQU Y5, (R13)(R12*8) MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) + VMOVDQU Y6, 
(R13)(R12*8) MOVQ 168(R11), R13 - VMOVDQU Y7, (R13)(R12*1) + VMOVDQU Y7, (R13)(R12*8) MOVQ 192(R11), R13 - VMOVDQU Y8, (R13)(R12*1) + VMOVDQU Y8, (R13)(R12*8) // Prepare for next loop - ADDQ $0x20, R12 + ADDQ $0x04, R12 DECQ AX JNZ mulAvxTwo_7x9_loop VZEROUPPER @@ -66585,6 +66602,7 @@ TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX + SHRQ $0x03, R12 MOVQ $0x0000000f, R13 MOVQ R13, X9 VPBROADCASTB X9, Y9 @@ -66597,63 +66615,63 @@ mulAvxTwo_7x9Xor_loop: VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R11), R13 - VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (R13)(R12*8), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R11), R13 - VMOVDQU (R13)(R12*1), Y1 + VMOVDQU (R13)(R12*8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R11), R13 - VMOVDQU (R13)(R12*1), Y2 + VMOVDQU (R13)(R12*8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R11), R13 - VMOVDQU (R13)(R12*1), Y3 + VMOVDQU (R13)(R12*8), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R11), R13 - VMOVDQU (R13)(R12*1), Y4 + VMOVDQU (R13)(R12*8), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R11), R13 - VMOVDQU (R13)(R12*1), Y5 + VMOVDQU (R13)(R12*8), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R11), R13 - VMOVDQU (R13)(R12*1), Y6 + VMOVDQU (R13)(R12*8), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R11), R13 - VMOVDQU (R13)(R12*1), Y7 + VMOVDQU (R13)(R12*8), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R11), R13 - VMOVDQU (R13)(R12*1), Y8 + VMOVDQU (R13)(R12*8), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 @@ -66974,26 +66992,26 @@ mulAvxTwo_7x9Xor_loop: // Store 9 outputs MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) + VMOVDQU Y0, (R13)(R12*8) MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) + VMOVDQU Y1, (R13)(R12*8) MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) + VMOVDQU Y2, (R13)(R12*8) MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) + VMOVDQU Y3, (R13)(R12*8) MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) + VMOVDQU Y4, (R13)(R12*8) MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) + VMOVDQU Y5, (R13)(R12*8) MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) + VMOVDQU Y6, (R13)(R12*8) MOVQ 168(R11), R13 - VMOVDQU Y7, (R13)(R12*1) + VMOVDQU Y7, (R13)(R12*8) MOVQ 192(R11), R13 - VMOVDQU Y8, (R13)(R12*1) + VMOVDQU Y8, (R13)(R12*8) // Prepare for next loop - ADDQ $0x20, R12 + ADDQ $0x04, R12 DECQ AX JNZ mulAvxTwo_7x9Xor_loop VZEROUPPER @@ -67031,6 +67049,7 @@ TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX + SHRQ $0x03, R12 MOVQ $0x0000000f, R13 MOVQ R13, X10 VPBROADCASTB X10, Y10 @@ -67437,28 +67456,28 @@ mulAvxTwo_7x10_loop: // Store 10 outputs MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) + VMOVDQU Y0, (R13)(R12*8) MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) + VMOVDQU Y1, (R13)(R12*8) MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) + VMOVDQU Y2, (R13)(R12*8) MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) + VMOVDQU Y3, (R13)(R12*8) MOVQ 96(R11), R13 - VMOVDQU Y4, 
(R13)(R12*1) + VMOVDQU Y4, (R13)(R12*8) MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) + VMOVDQU Y5, (R13)(R12*8) MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) + VMOVDQU Y6, (R13)(R12*8) MOVQ 168(R11), R13 - VMOVDQU Y7, (R13)(R12*1) + VMOVDQU Y7, (R13)(R12*8) MOVQ 192(R11), R13 - VMOVDQU Y8, (R13)(R12*1) + VMOVDQU Y8, (R13)(R12*8) MOVQ 216(R11), R13 - VMOVDQU Y9, (R13)(R12*1) + VMOVDQU Y9, (R13)(R12*8) // Prepare for next loop - ADDQ $0x20, R12 + ADDQ $0x04, R12 DECQ AX JNZ mulAvxTwo_7x10_loop VZEROUPPER @@ -68628,6 +68647,7 @@ TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX + SHRQ $0x03, R12 MOVQ $0x0000000f, R13 MOVQ R13, X10 VPBROADCASTB X10, Y10 @@ -68640,70 +68660,70 @@ mulAvxTwo_7x10Xor_loop: VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R11), R13 - VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (R13)(R12*8), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R11), R13 - VMOVDQU (R13)(R12*1), Y1 + VMOVDQU (R13)(R12*8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R11), R13 - VMOVDQU (R13)(R12*1), Y2 + VMOVDQU (R13)(R12*8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R11), R13 - VMOVDQU (R13)(R12*1), Y3 + VMOVDQU (R13)(R12*8), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R11), R13 - VMOVDQU (R13)(R12*1), Y4 + VMOVDQU (R13)(R12*8), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R11), R13 - VMOVDQU (R13)(R12*1), Y5 + VMOVDQU (R13)(R12*8), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R11), R13 - VMOVDQU (R13)(R12*1), Y6 + VMOVDQU (R13)(R12*8), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R11), R13 - VMOVDQU (R13)(R12*1), Y7 + VMOVDQU (R13)(R12*8), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R11), R13 - VMOVDQU (R13)(R12*1), Y8 + VMOVDQU (R13)(R12*8), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R11), R13 - VMOVDQU (R13)(R12*1), Y9 + VMOVDQU (R13)(R12*8), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -69054,28 +69074,28 @@ mulAvxTwo_7x10Xor_loop: // Store 10 outputs MOVQ (R11), R13 - VMOVDQU Y0, (R13)(R12*1) + VMOVDQU Y0, (R13)(R12*8) MOVQ 24(R11), R13 - VMOVDQU Y1, (R13)(R12*1) + VMOVDQU Y1, (R13)(R12*8) MOVQ 48(R11), R13 - VMOVDQU Y2, (R13)(R12*1) + VMOVDQU Y2, (R13)(R12*8) MOVQ 72(R11), R13 - VMOVDQU Y3, (R13)(R12*1) + VMOVDQU Y3, (R13)(R12*8) MOVQ 96(R11), R13 - VMOVDQU Y4, (R13)(R12*1) + VMOVDQU Y4, (R13)(R12*8) MOVQ 120(R11), R13 - VMOVDQU Y5, (R13)(R12*1) + VMOVDQU Y5, (R13)(R12*8) MOVQ 144(R11), R13 - VMOVDQU Y6, (R13)(R12*1) + VMOVDQU Y6, (R13)(R12*8) MOVQ 168(R11), R13 - VMOVDQU Y7, (R13)(R12*1) + VMOVDQU Y7, (R13)(R12*8) MOVQ 192(R11), R13 - VMOVDQU Y8, (R13)(R12*1) + VMOVDQU Y8, (R13)(R12*8) MOVQ 216(R11), R13 - VMOVDQU Y9, (R13)(R12*1) + VMOVDQU Y9, (R13)(R12*8) // Prepare for next loop - ADDQ $0x20, R12 + ADDQ $0x04, R12 DECQ AX JNZ mulAvxTwo_7x10Xor_loop VZEROUPPER @@ -75262,6 +75282,7 @@ TEXT 
·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X6 VPBROADCASTB X6, Y6 @@ -75565,20 +75586,20 @@ mulAvxTwo_8x6_loop: // Store 6 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x6_loop VZEROUPPER @@ -76522,6 +76543,7 @@ TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X6 VPBROADCASTB X6, Y6 @@ -76534,42 +76556,42 @@ mulAvxTwo_8x6Xor_loop: VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 MOVQ (R12), R14 - VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (R14)(R13*8), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) MOVQ 24(R12), R14 - VMOVDQU (R14)(R13*1), Y1 + VMOVDQU (R14)(R13*8), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) MOVQ 48(R12), R14 - VMOVDQU (R14)(R13*1), Y2 + VMOVDQU (R14)(R13*8), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) MOVQ 72(R12), R14 - VMOVDQU (R14)(R13*1), Y3 + VMOVDQU (R14)(R13*8), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) MOVQ 96(R12), R14 - VMOVDQU (R14)(R13*1), Y4 + VMOVDQU (R14)(R13*8), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) MOVQ 120(R12), R14 - VMOVDQU (R14)(R13*1), Y5 + VMOVDQU (R14)(R13*8), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 @@ -76837,20 +76859,20 @@ mulAvxTwo_8x6Xor_loop: // Store 6 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x6Xor_loop VZEROUPPER @@ -76890,6 +76912,7 @@ TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X7 VPBROADCASTB X7, Y7 @@ -77233,22 +77256,22 @@ mulAvxTwo_8x7_loop: // Store 7 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) + VMOVDQU Y6, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x7_loop VZEROUPPER @@ -78280,6 +78303,7 @@ TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 
ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X7 VPBROADCASTB X7, Y7 @@ -78292,49 +78316,49 @@ mulAvxTwo_8x7Xor_loop: VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 MOVQ (R12), R14 - VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (R14)(R13*8), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R12), R14 - VMOVDQU (R14)(R13*1), Y1 + VMOVDQU (R14)(R13*8), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R12), R14 - VMOVDQU (R14)(R13*1), Y2 + VMOVDQU (R14)(R13*8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R12), R14 - VMOVDQU (R14)(R13*1), Y3 + VMOVDQU (R14)(R13*8), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R12), R14 - VMOVDQU (R14)(R13*1), Y4 + VMOVDQU (R14)(R13*8), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R12), R14 - VMOVDQU (R14)(R13*1), Y5 + VMOVDQU (R14)(R13*8), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R12), R14 - VMOVDQU (R14)(R13*1), Y6 + VMOVDQU (R14)(R13*8), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 @@ -78637,22 +78661,22 @@ mulAvxTwo_8x7Xor_loop: // Store 7 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) + VMOVDQU Y6, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x7Xor_loop VZEROUPPER @@ -78692,6 +78716,7 @@ TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X8 VPBROADCASTB X8, Y8 @@ -79075,24 +79100,24 @@ mulAvxTwo_8x8_loop: // Store 8 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) + VMOVDQU Y6, (R14)(R13*8) MOVQ 168(R12), R14 - VMOVDQU Y7, (R14)(R13*1) + VMOVDQU Y7, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x8_loop VZEROUPPER @@ -80212,6 +80237,7 @@ TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X8 VPBROADCASTB X8, Y8 @@ -80224,56 +80250,56 @@ mulAvxTwo_8x8Xor_loop: VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R12), R14 - VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (R14)(R13*8), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R12), R14 - VMOVDQU (R14)(R13*1), Y1 + VMOVDQU (R14)(R13*8), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 
96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R12), R14 - VMOVDQU (R14)(R13*1), Y2 + VMOVDQU (R14)(R13*8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R12), R14 - VMOVDQU (R14)(R13*1), Y3 + VMOVDQU (R14)(R13*8), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R12), R14 - VMOVDQU (R14)(R13*1), Y4 + VMOVDQU (R14)(R13*8), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R12), R14 - VMOVDQU (R14)(R13*1), Y5 + VMOVDQU (R14)(R13*8), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R12), R14 - VMOVDQU (R14)(R13*1), Y6 + VMOVDQU (R14)(R13*8), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R12), R14 - VMOVDQU (R14)(R13*1), Y7 + VMOVDQU (R14)(R13*8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 @@ -80611,24 +80637,24 @@ mulAvxTwo_8x8Xor_loop: // Store 8 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) + VMOVDQU Y6, (R14)(R13*8) MOVQ 168(R12), R14 - VMOVDQU Y7, (R14)(R13*1) + VMOVDQU Y7, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x8Xor_loop VZEROUPPER @@ -80668,6 +80694,7 @@ TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X9 VPBROADCASTB X9, Y9 @@ -81091,26 +81118,26 @@ mulAvxTwo_8x9_loop: // Store 9 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) + VMOVDQU Y6, (R14)(R13*8) MOVQ 168(R12), R14 - VMOVDQU Y7, (R14)(R13*1) + VMOVDQU Y7, (R14)(R13*8) MOVQ 192(R12), R14 - VMOVDQU Y8, (R14)(R13*1) + VMOVDQU Y8, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x9_loop VZEROUPPER @@ -82318,6 +82345,7 @@ TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X9 VPBROADCASTB X9, Y9 @@ -82330,63 +82358,63 @@ mulAvxTwo_8x9Xor_loop: VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R12), R14 - VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (R14)(R13*8), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R12), R14 - VMOVDQU (R14)(R13*1), Y1 + VMOVDQU (R14)(R13*8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R12), R14 - VMOVDQU 
(R14)(R13*1), Y2 + VMOVDQU (R14)(R13*8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R12), R14 - VMOVDQU (R14)(R13*1), Y3 + VMOVDQU (R14)(R13*8), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R12), R14 - VMOVDQU (R14)(R13*1), Y4 + VMOVDQU (R14)(R13*8), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R12), R14 - VMOVDQU (R14)(R13*1), Y5 + VMOVDQU (R14)(R13*8), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R12), R14 - VMOVDQU (R14)(R13*1), Y6 + VMOVDQU (R14)(R13*8), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R12), R14 - VMOVDQU (R14)(R13*1), Y7 + VMOVDQU (R14)(R13*8), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R12), R14 - VMOVDQU (R14)(R13*1), Y8 + VMOVDQU (R14)(R13*8), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 @@ -82759,26 +82787,26 @@ mulAvxTwo_8x9Xor_loop: // Store 9 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) + VMOVDQU Y6, (R14)(R13*8) MOVQ 168(R12), R14 - VMOVDQU Y7, (R14)(R13*1) + VMOVDQU Y7, (R14)(R13*8) MOVQ 192(R12), R14 - VMOVDQU Y8, (R14)(R13*1) + VMOVDQU Y8, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x9Xor_loop VZEROUPPER @@ -82818,6 +82846,7 @@ TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X10 VPBROADCASTB X10, Y10 @@ -83281,28 +83310,28 @@ mulAvxTwo_8x10_loop: // Store 10 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) + VMOVDQU Y6, (R14)(R13*8) MOVQ 168(R12), R14 - VMOVDQU Y7, (R14)(R13*1) + VMOVDQU Y7, (R14)(R13*8) MOVQ 192(R12), R14 - VMOVDQU Y8, (R14)(R13*1) + VMOVDQU Y8, (R14)(R13*8) MOVQ 216(R12), R14 - VMOVDQU Y9, (R14)(R13*1) + VMOVDQU Y9, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x10_loop VZEROUPPER @@ -84598,6 +84627,7 @@ TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX + SHRQ $0x03, R13 MOVQ $0x0000000f, R14 MOVQ R14, X10 VPBROADCASTB X10, Y10 @@ -84610,70 +84640,70 @@ mulAvxTwo_8x10Xor_loop: VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R12), R14 - VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (R14)(R13*8), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB 
Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R12), R14 - VMOVDQU (R14)(R13*1), Y1 + VMOVDQU (R14)(R13*8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R12), R14 - VMOVDQU (R14)(R13*1), Y2 + VMOVDQU (R14)(R13*8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R12), R14 - VMOVDQU (R14)(R13*1), Y3 + VMOVDQU (R14)(R13*8), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R12), R14 - VMOVDQU (R14)(R13*1), Y4 + VMOVDQU (R14)(R13*8), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R12), R14 - VMOVDQU (R14)(R13*1), Y5 + VMOVDQU (R14)(R13*8), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R12), R14 - VMOVDQU (R14)(R13*1), Y6 + VMOVDQU (R14)(R13*8), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R12), R14 - VMOVDQU (R14)(R13*1), Y7 + VMOVDQU (R14)(R13*8), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R12), R14 - VMOVDQU (R14)(R13*1), Y8 + VMOVDQU (R14)(R13*8), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R12), R14 - VMOVDQU (R14)(R13*1), Y9 + VMOVDQU (R14)(R13*8), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -85081,28 +85111,28 @@ mulAvxTwo_8x10Xor_loop: // Store 10 outputs MOVQ (R12), R14 - VMOVDQU Y0, (R14)(R13*1) + VMOVDQU Y0, (R14)(R13*8) MOVQ 24(R12), R14 - VMOVDQU Y1, (R14)(R13*1) + VMOVDQU Y1, (R14)(R13*8) MOVQ 48(R12), R14 - VMOVDQU Y2, (R14)(R13*1) + VMOVDQU Y2, (R14)(R13*8) MOVQ 72(R12), R14 - VMOVDQU Y3, (R14)(R13*1) + VMOVDQU Y3, (R14)(R13*8) MOVQ 96(R12), R14 - VMOVDQU Y4, (R14)(R13*1) + VMOVDQU Y4, (R14)(R13*8) MOVQ 120(R12), R14 - VMOVDQU Y5, (R14)(R13*1) + VMOVDQU Y5, (R14)(R13*8) MOVQ 144(R12), R14 - VMOVDQU Y6, (R14)(R13*1) + VMOVDQU Y6, (R14)(R13*8) MOVQ 168(R12), R14 - VMOVDQU Y7, (R14)(R13*1) + VMOVDQU Y7, (R14)(R13*8) MOVQ 192(R12), R14 - VMOVDQU Y8, (R14)(R13*1) + VMOVDQU Y8, (R14)(R13*8) MOVQ 216(R12), R14 - VMOVDQU Y9, (R14)(R13*1) + VMOVDQU Y9, (R14)(R13*8) // Prepare for next loop - ADDQ $0x20, R13 + ADDQ $0x04, R13 DECQ AX JNZ mulAvxTwo_8x10Xor_loop VZEROUPPER @@ -90226,6 +90256,7 @@ TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X5 VPBROADCASTB X5, Y5 @@ -90521,18 +90552,18 @@ mulAvxTwo_9x5_loop: // Store 5 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x5_loop VZEROUPPER @@ -91464,6 +91495,7 @@ TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X5 VPBROADCASTB X5, Y5 @@ -91476,35 +91508,35 @@ mulAvxTwo_9x5Xor_loop: VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 
MOVQ (R13), R15 - VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (R15)(R14*8), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y0) MOVQ 24(R13), R15 - VMOVDQU (R15)(R14*1), Y1 + VMOVDQU (R15)(R14*8), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) MOVQ 48(R13), R15 - VMOVDQU (R15)(R14*1), Y2 + VMOVDQU (R15)(R14*8), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) MOVQ 72(R13), R15 - VMOVDQU (R15)(R14*1), Y3 + VMOVDQU (R15)(R14*8), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) MOVQ 96(R13), R15 - VMOVDQU (R15)(R14*1), Y4 + VMOVDQU (R15)(R14*8), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 @@ -91769,18 +91801,18 @@ mulAvxTwo_9x5Xor_loop: // Store 5 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x5Xor_loop VZEROUPPER @@ -91822,6 +91854,7 @@ TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X6 VPBROADCASTB X6, Y6 @@ -92162,20 +92195,20 @@ mulAvxTwo_9x6_loop: // Store 6 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x6_loop VZEROUPPER @@ -93205,6 +93238,7 @@ TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X6 VPBROADCASTB X6, Y6 @@ -93217,42 +93251,42 @@ mulAvxTwo_9x6Xor_loop: VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 MOVQ (R13), R15 - VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (R15)(R14*8), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) MOVQ 24(R13), R15 - VMOVDQU (R15)(R14*1), Y1 + VMOVDQU (R15)(R14*8), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) MOVQ 48(R13), R15 - VMOVDQU (R15)(R14*1), Y2 + VMOVDQU (R15)(R14*8), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) MOVQ 72(R13), R15 - VMOVDQU (R15)(R14*1), Y3 + VMOVDQU (R15)(R14*8), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) MOVQ 96(R13), R15 - VMOVDQU (R15)(R14*1), Y4 + VMOVDQU (R15)(R14*8), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) MOVQ 120(R13), R15 - VMOVDQU (R15)(R14*1), Y5 + VMOVDQU (R15)(R14*8), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 @@ -93557,20 +93591,20 @@ mulAvxTwo_9x6Xor_loop: // Store 6 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, 
(R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x6Xor_loop VZEROUPPER @@ -93612,6 +93646,7 @@ TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X7 VPBROADCASTB X7, Y7 @@ -93997,22 +94032,22 @@ mulAvxTwo_9x7_loop: // Store 7 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) + VMOVDQU Y6, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x7_loop VZEROUPPER @@ -95140,6 +95175,7 @@ TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X7 VPBROADCASTB X7, Y7 @@ -95152,49 +95188,49 @@ mulAvxTwo_9x7Xor_loop: VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 MOVQ (R13), R15 - VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (R15)(R14*8), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R13), R15 - VMOVDQU (R15)(R14*1), Y1 + VMOVDQU (R15)(R14*8), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R13), R15 - VMOVDQU (R15)(R14*1), Y2 + VMOVDQU (R15)(R14*8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R13), R15 - VMOVDQU (R15)(R14*1), Y3 + VMOVDQU (R15)(R14*8), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R13), R15 - VMOVDQU (R15)(R14*1), Y4 + VMOVDQU (R15)(R14*8), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R13), R15 - VMOVDQU (R15)(R14*1), Y5 + VMOVDQU (R15)(R14*8), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R13), R15 - VMOVDQU (R15)(R14*1), Y6 + VMOVDQU (R15)(R14*8), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 @@ -95539,22 +95575,22 @@ mulAvxTwo_9x7Xor_loop: // Store 7 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) + VMOVDQU Y6, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x7Xor_loop VZEROUPPER @@ -95596,6 +95632,7 @@ TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 ADDQ 
R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X8 VPBROADCASTB X8, Y8 @@ -96026,24 +96063,24 @@ mulAvxTwo_9x8_loop: // Store 8 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) + VMOVDQU Y6, (R15)(R14*8) MOVQ 168(R13), R15 - VMOVDQU Y7, (R15)(R14*1) + VMOVDQU Y7, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x8_loop VZEROUPPER @@ -97269,6 +97306,7 @@ TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X8 VPBROADCASTB X8, Y8 @@ -97281,56 +97319,56 @@ mulAvxTwo_9x8Xor_loop: VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R13), R15 - VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (R15)(R14*8), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R13), R15 - VMOVDQU (R15)(R14*1), Y1 + VMOVDQU (R15)(R14*8), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R13), R15 - VMOVDQU (R15)(R14*1), Y2 + VMOVDQU (R15)(R14*8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R13), R15 - VMOVDQU (R15)(R14*1), Y3 + VMOVDQU (R15)(R14*8), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R13), R15 - VMOVDQU (R15)(R14*1), Y4 + VMOVDQU (R15)(R14*8), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R13), R15 - VMOVDQU (R15)(R14*1), Y5 + VMOVDQU (R15)(R14*8), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R13), R15 - VMOVDQU (R15)(R14*1), Y6 + VMOVDQU (R15)(R14*8), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R13), R15 - VMOVDQU (R15)(R14*1), Y7 + VMOVDQU (R15)(R14*8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 @@ -97715,24 +97753,24 @@ mulAvxTwo_9x8Xor_loop: // Store 8 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) + VMOVDQU Y6, (R15)(R14*8) MOVQ 168(R13), R15 - VMOVDQU Y7, (R15)(R14*1) + VMOVDQU Y7, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x8Xor_loop VZEROUPPER @@ -97774,6 +97812,7 @@ TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X9 VPBROADCASTB X9, Y9 @@ -98249,26 +98288,26 @@ mulAvxTwo_9x9_loop: // Store 9 outputs MOVQ (R13), R15 - 
VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) + VMOVDQU Y6, (R15)(R14*8) MOVQ 168(R13), R15 - VMOVDQU Y7, (R15)(R14*1) + VMOVDQU Y7, (R15)(R14*8) MOVQ 192(R13), R15 - VMOVDQU Y8, (R15)(R14*1) + VMOVDQU Y8, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x9_loop VZEROUPPER @@ -99592,6 +99631,7 @@ TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X9 VPBROADCASTB X9, Y9 @@ -99604,63 +99644,63 @@ mulAvxTwo_9x9Xor_loop: VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R13), R15 - VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (R15)(R14*8), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R13), R15 - VMOVDQU (R15)(R14*1), Y1 + VMOVDQU (R15)(R14*8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R13), R15 - VMOVDQU (R15)(R14*1), Y2 + VMOVDQU (R15)(R14*8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R13), R15 - VMOVDQU (R15)(R14*1), Y3 + VMOVDQU (R15)(R14*8), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R13), R15 - VMOVDQU (R15)(R14*1), Y4 + VMOVDQU (R15)(R14*8), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R13), R15 - VMOVDQU (R15)(R14*1), Y5 + VMOVDQU (R15)(R14*8), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R13), R15 - VMOVDQU (R15)(R14*1), Y6 + VMOVDQU (R15)(R14*8), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R13), R15 - VMOVDQU (R15)(R14*1), Y7 + VMOVDQU (R15)(R14*8), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R13), R15 - VMOVDQU (R15)(R14*1), Y8 + VMOVDQU (R15)(R14*8), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 @@ -100085,26 +100125,26 @@ mulAvxTwo_9x9Xor_loop: // Store 9 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) + VMOVDQU Y6, (R15)(R14*8) MOVQ 168(R13), R15 - VMOVDQU Y7, (R15)(R14*1) + VMOVDQU Y7, (R15)(R14*8) MOVQ 192(R13), R15 - VMOVDQU Y8, (R15)(R14*1) + VMOVDQU Y8, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x9Xor_loop VZEROUPPER @@ -100146,6 +100186,7 @@ TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ 
R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X10 VPBROADCASTB X10, Y10 @@ -100666,28 +100707,28 @@ mulAvxTwo_9x10_loop: // Store 10 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) + VMOVDQU Y6, (R15)(R14*8) MOVQ 168(R13), R15 - VMOVDQU Y7, (R15)(R14*1) + VMOVDQU Y7, (R15)(R14*8) MOVQ 192(R13), R15 - VMOVDQU Y8, (R15)(R14*1) + VMOVDQU Y8, (R15)(R14*8) MOVQ 216(R13), R15 - VMOVDQU Y9, (R15)(R14*1) + VMOVDQU Y9, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x10_loop VZEROUPPER @@ -102109,6 +102150,7 @@ TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX + SHRQ $0x03, R14 MOVQ $0x0000000f, R15 MOVQ R15, X10 VPBROADCASTB X10, Y10 @@ -102121,70 +102163,70 @@ mulAvxTwo_9x10Xor_loop: VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R13), R15 - VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (R15)(R14*8), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R13), R15 - VMOVDQU (R15)(R14*1), Y1 + VMOVDQU (R15)(R14*8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R13), R15 - VMOVDQU (R15)(R14*1), Y2 + VMOVDQU (R15)(R14*8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R13), R15 - VMOVDQU (R15)(R14*1), Y3 + VMOVDQU (R15)(R14*8), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R13), R15 - VMOVDQU (R15)(R14*1), Y4 + VMOVDQU (R15)(R14*8), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R13), R15 - VMOVDQU (R15)(R14*1), Y5 + VMOVDQU (R15)(R14*8), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R13), R15 - VMOVDQU (R15)(R14*1), Y6 + VMOVDQU (R15)(R14*8), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R13), R15 - VMOVDQU (R15)(R14*1), Y7 + VMOVDQU (R15)(R14*8), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R13), R15 - VMOVDQU (R15)(R14*1), Y8 + VMOVDQU (R15)(R14*8), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R13), R15 - VMOVDQU (R15)(R14*1), Y9 + VMOVDQU (R15)(R14*8), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -102649,28 +102691,28 @@ mulAvxTwo_9x10Xor_loop: // Store 10 outputs MOVQ (R13), R15 - VMOVDQU Y0, (R15)(R14*1) + VMOVDQU Y0, (R15)(R14*8) MOVQ 24(R13), R15 - VMOVDQU Y1, (R15)(R14*1) + VMOVDQU Y1, (R15)(R14*8) MOVQ 48(R13), R15 - VMOVDQU Y2, (R15)(R14*1) + VMOVDQU Y2, (R15)(R14*8) MOVQ 72(R13), R15 - VMOVDQU Y3, (R15)(R14*1) + VMOVDQU Y3, (R15)(R14*8) MOVQ 96(R13), R15 - VMOVDQU Y4, (R15)(R14*1) + VMOVDQU Y4, (R15)(R14*8) MOVQ 120(R13), R15 - 
VMOVDQU Y5, (R15)(R14*1) + VMOVDQU Y5, (R15)(R14*8) MOVQ 144(R13), R15 - VMOVDQU Y6, (R15)(R14*1) + VMOVDQU Y6, (R15)(R14*8) MOVQ 168(R13), R15 - VMOVDQU Y7, (R15)(R14*1) + VMOVDQU Y7, (R15)(R14*8) MOVQ 192(R13), R15 - VMOVDQU Y8, (R15)(R14*1) + VMOVDQU Y8, (R15)(R14*8) MOVQ 216(R13), R15 - VMOVDQU Y9, (R15)(R14*1) + VMOVDQU Y9, (R15)(R14*8) // Prepare for next loop - ADDQ $0x20, R14 + ADDQ $0x04, R14 DECQ AX JNZ mulAvxTwo_9x10Xor_loop VZEROUPPER @@ -106668,6 +106710,7 @@ TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 @@ -106945,16 +106988,16 @@ mulAvxTwo_10x4_loop: // Store 4 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x4_loop VZEROUPPER @@ -107854,6 +107897,7 @@ TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 @@ -107866,28 +107910,28 @@ mulAvxTwo_10x4Xor_loop: VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 MOVQ (R14), BP - VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (BP)(R15*8), Y0 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y0) MOVQ 24(R14), BP - VMOVDQU (BP)(R15*1), Y1 + VMOVDQU (BP)(R15*8), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y1) MOVQ 48(R14), BP - VMOVDQU (BP)(R15*1), Y2 + VMOVDQU (BP)(R15*8), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 XOR3WAY( $0x00, Y5, Y6, Y2) MOVQ 72(R14), BP - VMOVDQU (BP)(R15*1), Y3 + VMOVDQU (BP)(R15*8), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 @@ -108139,16 +108183,16 @@ mulAvxTwo_10x4Xor_loop: // Store 4 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x4Xor_loop VZEROUPPER @@ -108192,6 +108236,7 @@ TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 @@ -108519,18 +108564,18 @@ mulAvxTwo_10x5_loop: // Store 5 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x5_loop VZEROUPPER @@ -109538,6 +109583,7 @@ TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 @@ -109550,35 +109596,35 @@ mulAvxTwo_10x5Xor_loop: VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 MOVQ (R14), BP - VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (BP)(R15*8), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( 
$0x00, Y6, Y7, Y0) MOVQ 24(R14), BP - VMOVDQU (BP)(R15*1), Y1 + VMOVDQU (BP)(R15*8), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y1) MOVQ 48(R14), BP - VMOVDQU (BP)(R15*1), Y2 + VMOVDQU (BP)(R15*8), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y2) MOVQ 72(R14), BP - VMOVDQU (BP)(R15*1), Y3 + VMOVDQU (BP)(R15*8), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 XOR3WAY( $0x00, Y6, Y7, Y3) MOVQ 96(R14), BP - VMOVDQU (BP)(R15*1), Y4 + VMOVDQU (BP)(R15*8), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 @@ -109875,18 +109921,18 @@ mulAvxTwo_10x5Xor_loop: // Store 5 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x5Xor_loop VZEROUPPER @@ -109930,6 +109976,7 @@ TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 @@ -110307,20 +110354,20 @@ mulAvxTwo_10x6_loop: // Store 6 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x6_loop VZEROUPPER @@ -111436,6 +111483,7 @@ TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 @@ -111448,42 +111496,42 @@ mulAvxTwo_10x6Xor_loop: VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 MOVQ (R14), BP - VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (BP)(R15*8), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y0) MOVQ 24(R14), BP - VMOVDQU (BP)(R15*1), Y1 + VMOVDQU (BP)(R15*8), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y1) MOVQ 48(R14), BP - VMOVDQU (BP)(R15*1), Y2 + VMOVDQU (BP)(R15*8), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y2) MOVQ 72(R14), BP - VMOVDQU (BP)(R15*1), Y3 + VMOVDQU (BP)(R15*8), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y3) MOVQ 96(R14), BP - VMOVDQU (BP)(R15*1), Y4 + VMOVDQU (BP)(R15*8), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 XOR3WAY( $0x00, Y7, Y8, Y4) MOVQ 120(R14), BP - VMOVDQU (BP)(R15*1), Y5 + VMOVDQU (BP)(R15*8), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 @@ -111825,20 +111873,20 @@ mulAvxTwo_10x6Xor_loop: // Store 6 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU 
Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x6Xor_loop VZEROUPPER @@ -111882,6 +111930,7 @@ TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X7 VPBROADCASTB X7, Y7 @@ -112309,22 +112358,22 @@ mulAvxTwo_10x7_loop: // Store 7 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) MOVQ 144(R14), BP - VMOVDQU Y6, (BP)(R15*1) + VMOVDQU Y6, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x7_loop VZEROUPPER @@ -113548,6 +113597,7 @@ TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X7 VPBROADCASTB X7, Y7 @@ -113560,49 +113610,49 @@ mulAvxTwo_10x7Xor_loop: VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 MOVQ (R14), BP - VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (BP)(R15*8), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R14), BP - VMOVDQU (BP)(R15*1), Y1 + VMOVDQU (BP)(R15*8), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R14), BP - VMOVDQU (BP)(R15*1), Y2 + VMOVDQU (BP)(R15*8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R14), BP - VMOVDQU (BP)(R15*1), Y3 + VMOVDQU (BP)(R15*8), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R14), BP - VMOVDQU (BP)(R15*1), Y4 + VMOVDQU (BP)(R15*8), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R14), BP - VMOVDQU (BP)(R15*1), Y5 + VMOVDQU (BP)(R15*8), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R14), BP - VMOVDQU (BP)(R15*1), Y6 + VMOVDQU (BP)(R15*8), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 @@ -113989,22 +114039,22 @@ mulAvxTwo_10x7Xor_loop: // Store 7 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) MOVQ 144(R14), BP - VMOVDQU Y6, (BP)(R15*1) + VMOVDQU Y6, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x7Xor_loop VZEROUPPER @@ -114048,6 +114098,7 @@ TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X8 VPBROADCASTB X8, Y8 @@ -114525,24 +114576,24 @@ mulAvxTwo_10x8_loop: // Store 8 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU 
Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) MOVQ 144(R14), BP - VMOVDQU Y6, (BP)(R15*1) + VMOVDQU Y6, (BP)(R15*8) MOVQ 168(R14), BP - VMOVDQU Y7, (BP)(R15*1) + VMOVDQU Y7, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x8_loop VZEROUPPER @@ -115874,6 +115925,7 @@ TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X8 VPBROADCASTB X8, Y8 @@ -115886,56 +115938,56 @@ mulAvxTwo_10x8Xor_loop: VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 MOVQ (R14), BP - VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (BP)(R15*8), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R14), BP - VMOVDQU (BP)(R15*1), Y1 + VMOVDQU (BP)(R15*8), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R14), BP - VMOVDQU (BP)(R15*1), Y2 + VMOVDQU (BP)(R15*8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R14), BP - VMOVDQU (BP)(R15*1), Y3 + VMOVDQU (BP)(R15*8), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R14), BP - VMOVDQU (BP)(R15*1), Y4 + VMOVDQU (BP)(R15*8), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R14), BP - VMOVDQU (BP)(R15*1), Y5 + VMOVDQU (BP)(R15*8), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R14), BP - VMOVDQU (BP)(R15*1), Y6 + VMOVDQU (BP)(R15*8), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R14), BP - VMOVDQU (BP)(R15*1), Y7 + VMOVDQU (BP)(R15*8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 @@ -116367,24 +116419,24 @@ mulAvxTwo_10x8Xor_loop: // Store 8 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) MOVQ 144(R14), BP - VMOVDQU Y6, (BP)(R15*1) + VMOVDQU Y6, (BP)(R15*8) MOVQ 168(R14), BP - VMOVDQU Y7, (BP)(R15*1) + VMOVDQU Y7, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x8Xor_loop VZEROUPPER @@ -116428,6 +116480,7 @@ TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 @@ -116955,26 +117008,26 @@ mulAvxTwo_10x9_loop: // Store 9 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 
120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) MOVQ 144(R14), BP - VMOVDQU Y6, (BP)(R15*1) + VMOVDQU Y6, (BP)(R15*8) MOVQ 168(R14), BP - VMOVDQU Y7, (BP)(R15*1) + VMOVDQU Y7, (BP)(R15*8) MOVQ 192(R14), BP - VMOVDQU Y8, (BP)(R15*1) + VMOVDQU Y8, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x9_loop VZEROUPPER @@ -118414,6 +118467,7 @@ TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 @@ -118426,63 +118480,63 @@ mulAvxTwo_10x9Xor_loop: VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 MOVQ (R14), BP - VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (BP)(R15*8), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R14), BP - VMOVDQU (BP)(R15*1), Y1 + VMOVDQU (BP)(R15*8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R14), BP - VMOVDQU (BP)(R15*1), Y2 + VMOVDQU (BP)(R15*8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R14), BP - VMOVDQU (BP)(R15*1), Y3 + VMOVDQU (BP)(R15*8), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R14), BP - VMOVDQU (BP)(R15*1), Y4 + VMOVDQU (BP)(R15*8), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R14), BP - VMOVDQU (BP)(R15*1), Y5 + VMOVDQU (BP)(R15*8), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R14), BP - VMOVDQU (BP)(R15*1), Y6 + VMOVDQU (BP)(R15*8), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R14), BP - VMOVDQU (BP)(R15*1), Y7 + VMOVDQU (BP)(R15*8), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R14), BP - VMOVDQU (BP)(R15*1), Y8 + VMOVDQU (BP)(R15*8), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 @@ -118959,26 +119013,26 @@ mulAvxTwo_10x9Xor_loop: // Store 9 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) MOVQ 144(R14), BP - VMOVDQU Y6, (BP)(R15*1) + VMOVDQU Y6, (BP)(R15*8) MOVQ 168(R14), BP - VMOVDQU Y7, (BP)(R15*1) + VMOVDQU Y7, (BP)(R15*8) MOVQ 192(R14), BP - VMOVDQU Y8, (BP)(R15*1) + VMOVDQU Y8, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x9Xor_loop VZEROUPPER @@ -119022,6 +119076,7 @@ TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 @@ -119599,28 +119654,28 @@ mulAvxTwo_10x10_loop: // Store 10 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, 
(BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) MOVQ 144(R14), BP - VMOVDQU Y6, (BP)(R15*1) + VMOVDQU Y6, (BP)(R15*8) MOVQ 168(R14), BP - VMOVDQU Y7, (BP)(R15*1) + VMOVDQU Y7, (BP)(R15*8) MOVQ 192(R14), BP - VMOVDQU Y8, (BP)(R15*1) + VMOVDQU Y8, (BP)(R15*8) MOVQ 216(R14), BP - VMOVDQU Y9, (BP)(R15*1) + VMOVDQU Y9, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x10_loop VZEROUPPER @@ -121168,6 +121223,7 @@ TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX + SHRQ $0x03, R15 MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 @@ -121180,70 +121236,70 @@ mulAvxTwo_10x10Xor_loop: VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 MOVQ (R14), BP - VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (BP)(R15*8), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R14), BP - VMOVDQU (BP)(R15*1), Y1 + VMOVDQU (BP)(R15*8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R14), BP - VMOVDQU (BP)(R15*1), Y2 + VMOVDQU (BP)(R15*8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R14), BP - VMOVDQU (BP)(R15*1), Y3 + VMOVDQU (BP)(R15*8), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R14), BP - VMOVDQU (BP)(R15*1), Y4 + VMOVDQU (BP)(R15*8), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R14), BP - VMOVDQU (BP)(R15*1), Y5 + VMOVDQU (BP)(R15*8), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R14), BP - VMOVDQU (BP)(R15*1), Y6 + VMOVDQU (BP)(R15*8), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R14), BP - VMOVDQU (BP)(R15*1), Y7 + VMOVDQU (BP)(R15*8), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R14), BP - VMOVDQU (BP)(R15*1), Y8 + VMOVDQU (BP)(R15*8), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R14), BP - VMOVDQU (BP)(R15*1), Y9 + VMOVDQU (BP)(R15*8), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -121765,28 +121821,28 @@ mulAvxTwo_10x10Xor_loop: // Store 10 outputs MOVQ (R14), BP - VMOVDQU Y0, (BP)(R15*1) + VMOVDQU Y0, (BP)(R15*8) MOVQ 24(R14), BP - VMOVDQU Y1, (BP)(R15*1) + VMOVDQU Y1, (BP)(R15*8) MOVQ 48(R14), BP - VMOVDQU Y2, (BP)(R15*1) + VMOVDQU Y2, (BP)(R15*8) MOVQ 72(R14), BP - VMOVDQU Y3, (BP)(R15*1) + VMOVDQU Y3, (BP)(R15*8) MOVQ 96(R14), BP - VMOVDQU Y4, (BP)(R15*1) + VMOVDQU Y4, (BP)(R15*8) MOVQ 120(R14), BP - VMOVDQU Y5, (BP)(R15*1) + VMOVDQU Y5, (BP)(R15*8) MOVQ 144(R14), BP - VMOVDQU Y6, (BP)(R15*1) + VMOVDQU Y6, (BP)(R15*8) MOVQ 168(R14), BP - VMOVDQU Y7, (BP)(R15*1) + VMOVDQU Y7, (BP)(R15*8) MOVQ 192(R14), BP - VMOVDQU Y8, (BP)(R15*1) + VMOVDQU Y8, (BP)(R15*8) MOVQ 216(R14), BP - VMOVDQU Y9, (BP)(R15*1) + VMOVDQU Y9, (BP)(R15*8) // Prepare for next loop - ADDQ $0x20, R15 + ADDQ $0x04, R15 DECQ AX JNZ mulAvxTwo_10x10Xor_loop VZEROUPPER diff --git a/galois_gen_arm64.go 
b/galois_gen_arm64.go
index 26045f59..a74788bf 100644
--- a/galois_gen_arm64.go
+++ b/galois_gen_arm64.go
@@ -1,3 +1,5 @@
+// Code generated by command: go generate gen.go. DO NOT EDIT.
+
 //go:build !noasm && !appengine && !gccgo && !nopshufb
 
 package reedsolomon
@@ -121,3 +123,4 @@ func mulNeon_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
 
 //go:noescape
 func mulNeon_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+
diff --git a/galois_gen_arm64.s b/galois_gen_arm64.s
index 57984614..335b94c3 100644
--- a/galois_gen_arm64.s
+++ b/galois_gen_arm64.s
@@ -1,7 +1,11 @@
+// Code generated by command: go generate gen.go. DO NOT EDIT.
+
 //go:build !appengine && !noasm && !nogen && !nopshufb && gc
 
 #include "textflag.h"
 
+// func mulSve_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
 TEXT ·mulSve_10x1_64(SB), $0-88
 	// Loading no tables to registers
 	// Destination kept in GP registers
@@ -23,7 +27,6 @@ TEXT ·mulSve_10x1_64(SB), $0-88
 	MOVD 192(R3), R13
 	MOVD 216(R3), R3
 	MOVD out_base+48(FP), R14
-	MOVD out_base+48(FP), R14
 	MOVD (R14), R14
 	MOVD start+72(FP), R15
 
@@ -45,7 +48,7 @@ TEXT ·mulSve_10x1_64(SB), $0-88
 	WORD $0x05e039e2 // mov z2.d, x15
 	WORD $0x05212042 // dup z2.b, z2.b[0]
 
-	// Load number of inputs shards
+	// Load number of input shards
 	MOVD in_len+32(FP), R16
 
 mulSve_10x1_64_loop:
@@ -297,6 +300,8 @@ mulSve_10x1_64_store:
 mulSve_10x1_64_end:
 	RET
 
+// func mulSve_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
 TEXT ·mulSve_10x1_64Xor(SB), $0-88
 	// Loading no tables to registers
 	// Destination kept in GP registers
@@ -318,7 +323,6 @@ TEXT ·mulSve_10x1_64Xor(SB), $0-88
 	MOVD 192(R3), R13
 	MOVD 216(R3), R3
 	MOVD out_base+48(FP), R14
-	MOVD out_base+48(FP), R14
 	MOVD (R14), R14
 	MOVD start+72(FP), R15
 
@@ -340,7 +344,7 @@ TEXT ·mulSve_10x1_64Xor(SB), $0-88
 	WORD $0x05e039e2 // mov z2.d, x15
 	WORD $0x05212042 // dup z2.b, z2.b[0]
 
-	// Load number of inputs shards
+	// Load number of input shards
 	MOVD in_len+32(FP), R16
 
 mulSve_10x1_64Xor_loop:
@@ -598,6 +602,8 @@ mulSve_10x1_64Xor_store:
 mulSve_10x1_64Xor_end:
 	RET
 
+// func mulSve_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
 TEXT ·mulSve_10x2_64(SB), $8-88
 	// Loading no tables to registers
 	// Destination kept in GP registers
@@ -619,7 +625,6 @@ TEXT ·mulSve_10x2_64(SB), $8-88
 	MOVD 192(R3), R13
 	MOVD 216(R3), R3
 	MOVD out_base+48(FP), R14
-	MOVD out_base+48(FP), R14
 	MOVD (R14), R15
 	MOVD 24(R14), R14
 	MOVD start+72(FP), R6
@@ -643,7 +648,7 @@ TEXT ·mulSve_10x2_64(SB), $8-88
 	WORD $0x05e038c4 // mov z4.d, x6
 	WORD $0x05212084 // dup z4.b, z4.b[0]
 
-	// Load number of inputs shards
+	// Load number of input shards
 	MOVD in_len+32(FP), R16
 
 mulSve_10x2_64_loop:
@@ -996,6 +1001,8 @@ mulSve_10x2_64_store:
 mulSve_10x2_64_end:
 	RET
 
+// func mulSve_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
 TEXT ·mulSve_10x2_64Xor(SB), $8-88
 	// Loading no tables to registers
 	// Destination kept in GP registers
@@ -1017,7 +1024,6 @@ TEXT ·mulSve_10x2_64Xor(SB), $8-88
 	MOVD 192(R3), R13
 	MOVD 216(R3), R3
 	MOVD out_base+48(FP), R14
-	MOVD out_base+48(FP), R14
 	MOVD (R14), R15
 	MOVD 24(R14), R14
 	MOVD start+72(FP), R6
@@ -1041,7 +1047,7 @@ TEXT ·mulSve_10x2_64Xor(SB), $8-88
 	WORD $0x05e038c4 // mov z4.d, x6
 	WORD $0x05212084 // dup z4.b, z4.b[0]
 
-	// Load number of inputs shards
+	// Load number of input shards
 	MOVD in_len+32(FP), R16
 
 mulSve_10x2_64Xor_loop:
@@ -1404,6 +1410,8 @@ mulSve_10x2_64Xor_store:
mulSve_10x2_64Xor_end: RET +// func mulSve_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x3_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -1425,7 +1433,6 @@ TEXT ·mulSve_10x3_64(SB), $8-88 MOVD 192(R0), R12 MOVD 216(R0), R0 MOVD out_base+48(FP), R13 - MOVD out_base+48(FP), R13 MOVD (R13), R14 MOVD 24(R13), R15 MOVD 48(R13), R13 @@ -1455,7 +1462,7 @@ TEXT ·mulSve_10x3_64(SB), $8-88 MOVD n+80(FP), R6 WORD $0xd346fcc6 // lsr x6, x6, #6 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x3_64_loop: @@ -1909,6 +1916,8 @@ mulSve_10x3_64_store: mulSve_10x3_64_end: RET +// func mulSve_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x3_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -1930,7 +1939,6 @@ TEXT ·mulSve_10x3_64Xor(SB), $8-88 MOVD 192(R0), R12 MOVD 216(R0), R0 MOVD out_base+48(FP), R13 - MOVD out_base+48(FP), R13 MOVD (R13), R14 MOVD 24(R13), R15 MOVD 48(R13), R13 @@ -1960,7 +1968,7 @@ TEXT ·mulSve_10x3_64Xor(SB), $8-88 MOVD n+80(FP), R6 WORD $0xd346fcc6 // lsr x6, x6, #6 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x3_64Xor_loop: @@ -2428,6 +2436,8 @@ mulSve_10x3_64Xor_store: mulSve_10x3_64Xor_end: RET +// func mulSve_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -2468,7 +2478,7 @@ TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88 WORD $0x05e038c4 // mov z4.d, x6 WORD $0x05212084 // dup z4.b, z4.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x4_loop: @@ -2824,6 +2834,8 @@ mulSve_10x4_store: mulSve_10x4_end: RET +// func mulSve_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -2864,7 +2876,7 @@ TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88 WORD $0x05e038c4 // mov z4.d, x6 WORD $0x05212084 // dup z4.b, z4.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x4Xor_loop: @@ -3232,6 +3244,8 @@ mulSve_10x4Xor_store: mulSve_10x4Xor_end: RET +// func mulSve_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -3272,7 +3286,7 @@ TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88 WORD $0x05e038c5 // mov z5.d, x6 WORD $0x052120a5 // dup z5.b, z5.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x5_loop: @@ -3689,6 +3703,8 @@ mulSve_10x5_store: mulSve_10x5_end: RET +// func mulSve_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -3729,7 +3745,7 @@ TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88 WORD $0x05e038c5 // mov z5.d, x6 WORD $0x052120a5 // dup z5.b, z5.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x5Xor_loop: @@ -4161,6 +4177,8 @@ mulSve_10x5Xor_store: mulSve_10x5Xor_end: RET +// func mulSve_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) 
+// Requires: SVE TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -4201,7 +4219,7 @@ TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88 WORD $0x05e038c6 // mov z6.d, x6 WORD $0x052120c6 // dup z6.b, z6.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x6_loop: @@ -4679,6 +4697,8 @@ mulSve_10x6_store: mulSve_10x6_end: RET +// func mulSve_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -4719,7 +4739,7 @@ TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88 WORD $0x05e038c6 // mov z6.d, x6 WORD $0x052120c6 // dup z6.b, z6.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x6Xor_loop: @@ -5215,6 +5235,8 @@ mulSve_10x6Xor_store: mulSve_10x6Xor_end: RET +// func mulSve_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -5255,7 +5277,7 @@ TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88 WORD $0x05e038c7 // mov z7.d, x6 WORD $0x052120e7 // dup z7.b, z7.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x7_loop: @@ -5794,6 +5816,8 @@ mulSve_10x7_store: mulSve_10x7_end: RET +// func mulSve_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -5834,7 +5858,7 @@ TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88 WORD $0x05e038c7 // mov z7.d, x6 WORD $0x052120e7 // dup z7.b, z7.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x7Xor_loop: @@ -6394,6 +6418,8 @@ mulSve_10x7Xor_store: mulSve_10x7Xor_end: RET +// func mulSve_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -6434,7 +6460,7 @@ TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88 WORD $0x05e038c8 // mov z8.d, x6 WORD $0x05212108 // dup z8.b, z8.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x8_loop: @@ -7034,6 +7060,8 @@ mulSve_10x8_store: mulSve_10x8_end: RET +// func mulSve_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -7074,7 +7102,7 @@ TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88 WORD $0x05e038c8 // mov z8.d, x6 WORD $0x05212108 // dup z8.b, z8.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x8Xor_loop: @@ -7698,6 +7726,8 @@ mulSve_10x8Xor_store: mulSve_10x8Xor_end: RET +// func mulSve_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -7738,7 +7768,7 @@ TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88 WORD $0x05e038c9 // mov z9.d, x6 WORD $0x05212129 // dup z9.b, z9.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x9_loop: @@ -8399,6 +8429,8 @@ mulSve_10x9_store: mulSve_10x9_end: RET +// func mulSve_10x9Xor(matrix []byte, in [][]byte, 
out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -8439,7 +8471,7 @@ TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88 WORD $0x05e038c9 // mov z9.d, x6 WORD $0x05212129 // dup z9.b, z9.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x9Xor_loop: @@ -9127,6 +9159,8 @@ mulSve_10x9Xor_store: mulSve_10x9Xor_end: RET +// func mulSve_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -9167,7 +9201,7 @@ TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88 WORD $0x05e038ca // mov z10.d, x6 WORD $0x0521214a // dup z10.b, z10.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x10_loop: @@ -9889,6 +9923,8 @@ mulSve_10x10_store: mulSve_10x10_end: RET +// func mulSve_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88 WORD $0x25d8e3e0 // ptrue p0.d // Loading no tables to registers @@ -9929,7 +9965,7 @@ TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88 WORD $0x05e038ca // mov z10.d, x6 WORD $0x0521214a // dup z10.b, z10.b[0] - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulSve_10x10Xor_loop: @@ -10681,6 +10717,8 @@ mulSve_10x10Xor_store: mulSve_10x10Xor_end: RET +// func mulNeon_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10701,7 +10739,6 @@ TEXT ·mulNeon_10x1_64(SB), $0-88 MOVD 192(R3), R13 MOVD 216(R3), R3 MOVD out_base+48(FP), R14 - MOVD out_base+48(FP), R14 MOVD (R14), R14 MOVD start+72(FP), R15 @@ -10723,7 +10760,7 @@ TEXT ·mulNeon_10x1_64(SB), $0-88 VMOV R15, V4.B[0] VDUP V4.B[0], V4.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x1_64_loop: @@ -11103,6 +11140,8 @@ mulNeon_10x1_64_store: mulNeon_10x1_64_end: RET +// func mulNeon_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11123,7 +11162,6 @@ TEXT ·mulNeon_10x1_64Xor(SB), $0-88 MOVD 192(R3), R13 MOVD 216(R3), R3 MOVD out_base+48(FP), R14 - MOVD out_base+48(FP), R14 MOVD (R14), R14 MOVD start+72(FP), R15 @@ -11145,7 +11183,7 @@ TEXT ·mulNeon_10x1_64Xor(SB), $0-88 VMOV R15, V4.B[0] VDUP V4.B[0], V4.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x1_64Xor_loop: @@ -11534,6 +11572,8 @@ mulNeon_10x1_64Xor_store: mulNeon_10x1_64Xor_end: RET +// func mulNeon_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x2_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -11554,7 +11594,6 @@ TEXT ·mulNeon_10x2_64(SB), $8-88 MOVD 192(R3), R13 MOVD 216(R3), R3 MOVD out_base+48(FP), R14 - MOVD out_base+48(FP), R14 MOVD (R14), R15 MOVD 24(R14), R14 MOVD start+72(FP), R6 @@ -11578,7 +11617,7 @@ TEXT ·mulNeon_10x2_64(SB), $8-88 VMOV R6, V8.B[0] VDUP V8.B[0], V8.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x2_64_loop: @@ -12136,6 +12175,8 @@ mulNeon_10x2_64_store: 
mulNeon_10x2_64_end: RET +// func mulNeon_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x2_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -12156,7 +12197,6 @@ TEXT ·mulNeon_10x2_64Xor(SB), $8-88 MOVD 192(R3), R13 MOVD 216(R3), R3 MOVD out_base+48(FP), R14 - MOVD out_base+48(FP), R14 MOVD (R14), R15 MOVD 24(R14), R14 MOVD start+72(FP), R6 @@ -12180,7 +12220,7 @@ TEXT ·mulNeon_10x2_64Xor(SB), $8-88 VMOV R6, V8.B[0] VDUP V8.B[0], V8.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x2_64Xor_loop: @@ -12754,6 +12794,8 @@ mulNeon_10x2_64Xor_store: mulNeon_10x2_64Xor_end: RET +// func mulNeon_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x3_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -12774,7 +12816,6 @@ TEXT ·mulNeon_10x3_64(SB), $8-88 MOVD 192(R0), R12 MOVD 216(R0), R0 MOVD out_base+48(FP), R13 - MOVD out_base+48(FP), R13 MOVD (R13), R14 MOVD 24(R13), R15 MOVD 48(R13), R13 @@ -12804,7 +12845,7 @@ TEXT ·mulNeon_10x3_64(SB), $8-88 MOVD n+80(FP), R6 LSR $6, R6 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x3_64_loop: @@ -13540,6 +13581,8 @@ mulNeon_10x3_64_store: mulNeon_10x3_64_end: RET +// func mulNeon_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x3_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -13560,7 +13603,6 @@ TEXT ·mulNeon_10x3_64Xor(SB), $8-88 MOVD 192(R0), R12 MOVD 216(R0), R0 MOVD out_base+48(FP), R13 - MOVD out_base+48(FP), R13 MOVD (R13), R14 MOVD 24(R13), R15 MOVD 48(R13), R13 @@ -13590,7 +13632,7 @@ TEXT ·mulNeon_10x3_64Xor(SB), $8-88 MOVD n+80(FP), R6 LSR $6, R6 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x3_64Xor_loop: @@ -14349,6 +14391,8 @@ mulNeon_10x3_64Xor_store: mulNeon_10x3_64Xor_end: RET +// func mulNeon_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -14387,7 +14431,7 @@ TEXT ·mulNeon_10x4(SB), NOSPLIT, $8-88 VMOV R6, V8.B[0] VDUP V8.B[0], V8.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x4_loop: @@ -14924,6 +14968,8 @@ mulNeon_10x4_store: mulNeon_10x4_end: RET +// func mulNeon_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -14962,7 +15008,7 @@ TEXT ·mulNeon_10x4Xor(SB), NOSPLIT, $8-88 VMOV R6, V8.B[0] VDUP V8.B[0], V8.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x4Xor_loop: @@ -15519,6 +15565,8 @@ mulNeon_10x4Xor_store: mulNeon_10x4Xor_end: RET +// func mulNeon_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -15557,7 +15605,7 @@ TEXT ·mulNeon_10x5(SB), NOSPLIT, $8-88 VMOV R6, V10.B[0] VDUP V10.B[0], V10.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x5_loop: @@ -16195,6 +16243,8 @@ mulNeon_10x5_store: 
mulNeon_10x5_end: RET +// func mulNeon_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -16233,7 +16283,7 @@ TEXT ·mulNeon_10x5Xor(SB), NOSPLIT, $8-88 VMOV R6, V10.B[0] VDUP V10.B[0], V10.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x5Xor_loop: @@ -16896,6 +16946,8 @@ mulNeon_10x5Xor_store: mulNeon_10x5Xor_end: RET +// func mulNeon_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -16934,7 +16986,7 @@ TEXT ·mulNeon_10x6(SB), NOSPLIT, $8-88 VMOV R6, V12.B[0] VDUP V12.B[0], V12.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x6_loop: @@ -17673,6 +17725,8 @@ mulNeon_10x6_store: mulNeon_10x6_end: RET +// func mulNeon_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -17711,7 +17765,7 @@ TEXT ·mulNeon_10x6Xor(SB), NOSPLIT, $8-88 VMOV R6, V12.B[0] VDUP V12.B[0], V12.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x6Xor_loop: @@ -18480,6 +18534,8 @@ mulNeon_10x6Xor_store: mulNeon_10x6Xor_end: RET +// func mulNeon_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -18518,7 +18574,7 @@ TEXT ·mulNeon_10x7(SB), NOSPLIT, $8-88 VMOV R6, V14.B[0] VDUP V14.B[0], V14.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x7_loop: @@ -19358,6 +19414,8 @@ mulNeon_10x7_store: mulNeon_10x7_end: RET +// func mulNeon_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -19396,7 +19454,7 @@ TEXT ·mulNeon_10x7Xor(SB), NOSPLIT, $8-88 VMOV R6, V14.B[0] VDUP V14.B[0], V14.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x7Xor_loop: @@ -20271,6 +20329,8 @@ mulNeon_10x7Xor_store: mulNeon_10x7Xor_end: RET +// func mulNeon_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -20309,7 +20369,7 @@ TEXT ·mulNeon_10x8(SB), NOSPLIT, $8-88 VMOV R6, V16.B[0] VDUP V16.B[0], V16.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x8_loop: @@ -21250,6 +21310,8 @@ mulNeon_10x8_store: mulNeon_10x8_end: RET +// func mulNeon_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -21288,7 +21350,7 @@ TEXT ·mulNeon_10x8Xor(SB), NOSPLIT, $8-88 VMOV R6, V16.B[0] VDUP V16.B[0], V16.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x8Xor_loop: @@ -22269,6 +22331,8 @@ mulNeon_10x8Xor_store: mulNeon_10x8Xor_end: RET +// func mulNeon_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n 
int) +// Requires: NEON TEXT ·mulNeon_10x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -22307,7 +22371,7 @@ TEXT ·mulNeon_10x9(SB), NOSPLIT, $8-88 VMOV R6, V18.B[0] VDUP V18.B[0], V18.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x9_loop: @@ -23349,6 +23413,8 @@ mulNeon_10x9_store: mulNeon_10x9_end: RET +// func mulNeon_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -23387,7 +23453,7 @@ TEXT ·mulNeon_10x9Xor(SB), NOSPLIT, $8-88 VMOV R6, V18.B[0] VDUP V18.B[0], V18.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x9Xor_loop: @@ -24474,6 +24540,8 @@ mulNeon_10x9Xor_store: mulNeon_10x9Xor_end: RET +// func mulNeon_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -24512,7 +24580,7 @@ TEXT ·mulNeon_10x10(SB), NOSPLIT, $8-88 VMOV R6, V20.B[0] VDUP V20.B[0], V20.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x10_loop: @@ -25655,6 +25723,8 @@ mulNeon_10x10_store: mulNeon_10x10_end: RET +// func mulNeon_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON TEXT ·mulNeon_10x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -25693,7 +25763,7 @@ TEXT ·mulNeon_10x10Xor(SB), NOSPLIT, $8-88 VMOV R6, V20.B[0] VDUP V20.B[0], V20.B16 - // Load number of inputs shards + // Load number of input shards MOVD in_len+32(FP), R16 mulNeon_10x10Xor_loop: @@ -26885,3 +26955,4 @@ mulNeon_10x10Xor_store: mulNeon_10x10Xor_end: RET +