From 8b6d68a23e83e7b79b6e34e1d3ec3a2d388bad63 Mon Sep 17 00:00:00 2001 From: Frank Wessels Date: Wed, 5 Jun 2024 14:57:01 -0700 Subject: [PATCH] Refactor canGFNI into architecture specific versions and minor changes --- galois_gen_none.go | 18 ++------ galois_gen_switch_amd64.go | 19 +++++++- galois_gen_switch_arm64.go | 20 ++------- galois_gen_switch_nopshufb_amd64.go | 43 +++++++++--------- galois_gen_switch_nopshufb_arm64.go | 22 +++++++++ reedsolomon.go | 69 ++++++++--------------------- 6 files changed, 88 insertions(+), 103 deletions(-) create mode 100644 galois_gen_switch_nopshufb_arm64.go diff --git a/galois_gen_none.go b/galois_gen_none.go index 20323f0..3e25898 100644 --- a/galois_gen_none.go +++ b/galois_gen_none.go @@ -10,22 +10,10 @@ const ( minCodeGenSize = 1 ) -func (r *reedSolomon) hasCodeGen(_ int, _, _ int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { +func (r *reedSolomon) hasCodeGen(int, int, int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { return nil, nil, false } -func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") +func (r *reedSolomon) canGFNI(int, int, int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false } diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go index c9035a9..d4f46ea 100644 --- a/galois_gen_switch_amd64.go +++ b/galois_gen_switch_amd64.go @@ -18,8 +18,12 @@ const ( ) var ( - fAvx2 = galMulSlicesAvx2 - fAvx2Xor = galMulSlicesAvx2Xor + fAvx2 = galMulSlicesAvx2 + fAvx2Xor = galMulSlicesAvx2Xor + fGFNI = galMulSlicesGFNI + fGFNIXor = galMulSlicesGFNIXor + fAvxGFNI = galMulSlicesAvxGFNI + fAvxGFNIXor = galMulSlicesAvxGFNIXor ) func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { @@ -28,6 +32,17 @@ func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs } +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useAvx512GFNI { + return &fGFNI, &fGFNIXor, codeGen && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fAvxGFNI, &fAvxGFNIXor, codeGen && r.o.useAvxGNFI && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { n := stop - start diff --git a/galois_gen_switch_arm64.go b/galois_gen_switch_arm64.go index e4cc5db..ff2541b 100644 --- a/galois_gen_switch_arm64.go +++ b/galois_gen_switch_arm64.go @@ -33,6 +33,10 @@ func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs } +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} + // galMulSlicesSve func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int { n := stop - start @@ -189,19 +193,3 @@ func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int { } panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out))) } - -func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} diff --git a/galois_gen_switch_nopshufb_amd64.go b/galois_gen_switch_nopshufb_amd64.go index de35877..75ecddf 100644 --- a/galois_gen_switch_nopshufb_amd64.go +++ b/galois_gen_switch_nopshufb_amd64.go @@ -10,16 +10,35 @@ import ( ) const ( - codeGen = true - codeGenMaxInputs = 10 - codeGenMinOutputs = 10 - minCodeGenSize = 64 + codeGen = true + codeGenMaxGoroutines = 4 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 +) + +var ( + fGFNI = galMulSlicesGFNI + fGFNIXor = galMulSlicesGFNIXor + fAvxGFNI = galMulSlicesAvxGFNI + fAvxGFNIXor = galMulSlicesAvxGFNIXor ) func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { return nil, nil, false // no code generation for generic case (only GFNI cases) } +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useAvx512GFNI { + return &fGFNI, &fGFNIXor, codeGen && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fAvxGFNI, &fAvxGFNIXor, codeGen && r.o.useAvxGNFI && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { n := (stop - start) & (maxInt - (64 - 1)) @@ -1371,19 +1390,3 @@ func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } - -func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} diff --git a/galois_gen_switch_nopshufb_arm64.go b/galois_gen_switch_nopshufb_arm64.go new file mode 100644 index 0000000..db2aaa6 --- /dev/null +++ b/galois_gen_switch_nopshufb_arm64.go @@ -0,0 +1,22 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !nogen && nopshufb +// +build !appengine,!noasm,gc,!nogen,nopshufb + +package reedsolomon + +const ( + codeGen = false + codeGenMaxGoroutines = 16 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} diff --git a/reedsolomon.go b/reedsolomon.go index 4c274f8..3b6f5b7 100644 --- a/reedsolomon.go +++ b/reedsolomon.go @@ -560,7 +560,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { r.o.maxGoroutines = codeGenMaxGoroutines } - if r.canGFNI(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs) && r.o.maxGoroutines > gfniCodeGenMaxGoroutines { + if _, _, useGFNI := r.canGFNI(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && r.o.maxGoroutines > gfniCodeGenMaxGoroutines { r.o.maxGoroutines = gfniCodeGenMaxGoroutines } @@ -660,7 +660,7 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro m[iRow] = r.parity[iRow][idx : idx+1] } if r.o.useAvx512GFNI || r.o.useAvxGNFI { - r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false) + r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil) } else { r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil) } @@ -804,12 +804,6 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { return r.checkSomeShards(r.parity, shards[:r.dataShards], toCheck[:r.parityShards], len(shards[0])), nil } -func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool { - return codeGen && (r.o.useAvx512GFNI || r.o.useAvxGNFI) && - byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && - inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs -} - // Multiplies a subset of rows from a coding matrix by a full set of // input totalShards to produce some output totalShards. // 'matrixRows' is The rows from the matrix to use. @@ -833,14 +827,10 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC if end > len(inputs[0]) { end = len(inputs[0]) } - if r.canGFNI(byteCount, len(inputs), len(outputs)) { + if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)); useGFNI { var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64 m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:]) - if r.o.useAvx512GFNI { - start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount) - } else { - start += galMulSlicesAvxGFNI(m, inputs, outputs, 0, byteCount) - } + start += (*galMulGFNI)(m, inputs, outputs, 0, byteCount) end = len(inputs[0]) } else if galMulGen, _, ok := r.hasCodeGen(byteCount, len(inputs), len(outputs)); ok { m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) @@ -866,19 +856,12 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC if len(outPer) > codeGenMaxOutputs { outPer = outPer[:codeGenMaxOutputs] } - if r.o.useAvx512GFNI { - m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:]) - if inIdx == 0 { - start = galMulSlicesGFNI(m, inPer, outPer, 0, byteCount) - } else { - start = galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount) - } - } else if r.o.useAvxGNFI { + if useGFNI { m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:]) if inIdx == 0 { - start = galMulSlicesAvxGFNI(m, inPer, outPer, 0, byteCount) + start = (*galMulGFNI)(m, inPer, outPer, 0, byteCount) } else { - start = galMulSlicesAvxGFNIXor(m, inPer, outPer, 0, byteCount) + start = (*galMulGFNIXor)(m, inPer, outPer, 0, byteCount) } } else { m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) @@ -926,18 +909,18 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte var genMatrix []byte var gfniMatrix []uint64 galMulGen, _, useCodeGen := r.hasCodeGen(byteCount, len(inputs), len(outputs)) - useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)) + galMulGFNI, _, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)) if useGFNI { var tmp [codeGenMaxInputs * codeGenMaxOutputs]uint64 gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:]) } else if useCodeGen { genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) defer r.putTmpSlice(genMatrix) - } else if (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards && - r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs) { + } else if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && + byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards { // It appears there is a switchover point at around 10MB where // Regular processing is faster... - r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true) + r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true, galMulGFNI, galMulGFNIXor) return } else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); ok && byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards { @@ -955,11 +938,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte exec := func(start, stop int) { if stop-start >= 64 { if useGFNI { - if r.o.useAvx512GFNI { - start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop) - } else { - start += galMulSlicesAvxGFNI(gfniMatrix, inputs, outputs, start, stop) - } + start += (*galMulGFNI)(gfniMatrix, inputs, outputs, start, stop) } else if useCodeGen { start += (*galMulGen)(genMatrix, inputs, outputs, start, stop) } @@ -1167,7 +1146,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b // Perform the same as codeSomeShards, but split the workload into // several goroutines. // If clear is set, the first write will overwrite the output. -func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) { +func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool, galMulGFNI, galMulGFNIXor *func(matrix []uint64, in, out [][]byte, start, stop int) int) { var wg sync.WaitGroup gor := r.o.maxGoroutines @@ -1256,24 +1235,14 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b lstop = stop } for lstart < stop { - if lstop-lstart >= minCodeGenSize { + if galMulGFNI != nil && galMulGFNIXor != nil && lstop-lstart >= minCodeGenSize { // Execute plan... var n int - if r.o.useAvx512GFNI { - for _, p := range plan { - if p.first { - n = galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop) - } else { - n = galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop) - } - } - } else { - for _, p := range plan { - if p.first { - n = galMulSlicesAvxGFNI(p.m, p.input, p.output, lstart, lstop) - } else { - n = galMulSlicesAvxGFNIXor(p.m, p.input, p.output, lstart, lstop) - } + for _, p := range plan { + if p.first { + n = (*galMulGFNI)(p.m, p.input, p.output, lstart, lstop) + } else { + n = (*galMulGFNIXor)(p.m, p.input, p.output, lstart, lstop) } } lstart += n