From c28d6bba68f8245000c872b87aa5d0895a170033 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 6 Apr 2023 07:43:04 -0700 Subject: [PATCH] EncodeIdx: Use advanced AVX2/GFNI (#248) * EncodeIdx: Use advanced AVX2/GFNI Speeds up EncodeIdx when there are many outputs. The example in #247 is twice as fast. Fixes #247 #245 --- reedsolomon.go | 58 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/reedsolomon.go b/reedsolomon.go index 9775b10a..20e39748 100644 --- a/reedsolomon.go +++ b/reedsolomon.go @@ -477,11 +477,17 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // Calculate what we want per round r.o.perRound = cpuid.CPU.Cache.L2 + if r.o.perRound < 128<<10 { + r.o.perRound = 128 << 10 + } divide := parityShards + 1 if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) { // Base on L1 cache if we have many inputs. r.o.perRound = cpuid.CPU.Cache.L1D + if r.o.perRound < 32<<10 { + r.o.perRound = 32 << 10 + } divide = 0 if dataShards > maxAvx2Inputs { divide += maxAvx2Inputs @@ -495,11 +501,6 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { } } - if r.o.perRound <= 0 { - // Set to 128K if undetectable. - r.o.perRound = 128 << 10 - } - if cpuid.CPU.ThreadsPerCore > 1 && r.o.maxGoroutines > cpuid.CPU.PhysicalCores { // If multiple threads per core, make sure they don't contend for cache. r.o.perRound /= cpuid.CPU.ThreadsPerCore @@ -510,6 +511,11 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // Align to 64 bytes. r.o.perRound = ((r.o.perRound + 63) / 64) * 64 + // Final sanity check... + if r.o.perRound < 1<<10 { + r.o.perRound = 1 << 10 + } + if r.o.minSplitSize <= 0 { // Set minsplit as high as we can, but still have parity in L1. 
cacheSize := cpuid.CPU.Cache.L1D @@ -646,6 +652,19 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro return ErrShardSize } + if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && (r.o.useAVX2 || r.o.useGFNI) { + m := make([][]byte, r.parityShards) + for iRow := range m { + m[iRow] = r.parity[iRow][idx : idx+1] + } + if r.o.useGFNI { + r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false) + } else { + r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false) + } + return nil + } + // Process using no goroutines for now. start, end := 0, r.o.perRound if end > len(dataShard) { @@ -909,16 +928,16 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) defer r.putTmpSlice(avx2Matrix) } else if r.o.useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && - r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { + r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { // It appears there is a switchover point at around 10MB where // Regular processing is faster... - r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount) + r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true) return } else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { // It appears there is a switchover point at around 10MB where // Regular processing is faster... - r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount) + r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true) return } @@ -982,7 +1001,8 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte // Perform the same as codeSomeShards, but split the workload into // several goroutines. 
-func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int) { +// If clear is set, the first write will overwrite the output. +func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) { var wg sync.WaitGroup gor := r.o.maxGoroutines @@ -1022,7 +1042,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b input: inPer, output: outPer, m: m, - first: inIdx == 0, + first: inIdx == 0 && clear, }) outIdx += len(outPer) outs = outs[len(outPer):] @@ -1054,7 +1074,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b input: inPer, output: outPer, m: m, - first: inIdx == 0, + first: inIdx == 0 && clear, }) inIdx += len(inPer) ins = ins[len(inPer):] @@ -1070,6 +1090,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b } exec := func(start, stop int) { + defer wg.Done() lstart, lstop := start, start+r.o.perRound if lstop > stop { lstop = stop @@ -1097,7 +1118,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b for c := range inputs { in := inputs[c][lstart:lstop] for iRow := 0; iRow < len(outputs); iRow++ { - if c == 0 { + if c == 0 && clear { galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) } else { galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) @@ -1110,7 +1131,6 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b lstop = stop } } - wg.Done() } if gor == 1 { wg.Add(1) @@ -1135,7 +1155,8 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b // Perform the same as codeSomeShards, but split the workload into // several goroutines. -func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int) { +// If clear is set, the first write will overwrite the output. 
+func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) { var wg sync.WaitGroup gor := r.o.maxGoroutines @@ -1171,7 +1192,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b input: inPer, output: outPer, m: m, - first: inIdx == 0, + first: inIdx == 0 && clear, }) outIdx += len(outPer) outs = outs[len(outPer):] @@ -1202,7 +1223,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b input: inPer, output: outPer, m: m, - first: inIdx == 0, + first: inIdx == 0 && clear, }) inIdx += len(inPer) ins = ins[len(inPer):] @@ -1218,6 +1239,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b } exec := func(start, stop int) { + defer wg.Done() lstart, lstop := start, start+r.o.perRound if lstop > stop { lstop = stop @@ -1245,7 +1267,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b for c := range inputs { in := inputs[c][lstart:lstop] for iRow := 0; iRow < len(outputs); iRow++ { - if c == 0 { + if c == 0 && clear { galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) } else { galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) @@ -1258,8 +1280,8 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b lstop = stop } } - wg.Done() } + if gor == 1 { wg.Add(1) exec(0, byteCount)