Skip to content

Commit

Permalink
Refactor canGFNI into architecture specific versions and minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
fwessels committed Jun 5, 2024
1 parent e65e786 commit 8b6d68a
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 103 deletions.
18 changes: 3 additions & 15 deletions galois_gen_none.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,10 @@ const (
minCodeGenSize = 1
)

func (r *reedSolomon) hasCodeGen(_ int, _, _ int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) {
func (r *reedSolomon) hasCodeGen(int, int, int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) {
return nil, nil, false
}

func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
panic("codegen not available")
}

func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
panic("codegen not available")
}

func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
panic("codegen not available")
}

func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
panic("codegen not available")
func (r *reedSolomon) canGFNI(int, int, int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) {
return nil, nil, false
}
19 changes: 17 additions & 2 deletions galois_gen_switch_amd64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 4 additions & 16 deletions galois_gen_switch_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func
inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs
}

// canGFNI is the variant of the GFNI capability check for this build target.
// Callers use the ok result to decide whether to take the GFNI code path and,
// when ok is true, invoke the two returned function pointers (the plain
// multiply and its Xor counterpart).
//
// This implementation ignores byteCount, inputs and outputs and always
// returns nil, nil, false, so the GFNI path is never selected here.
func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) {
return nil, nil, false
}

// galMulSlicesSve
func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int {
n := stop - start
Expand Down Expand Up @@ -189,19 +193,3 @@ func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int {
}
panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out)))
}

func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
panic("codegen not available")
}

func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
panic("codegen not available")
}

func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
panic("codegen not available")
}

func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
panic("codegen not available")
}
43 changes: 23 additions & 20 deletions galois_gen_switch_nopshufb_amd64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions galois_gen_switch_nopshufb_arm64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

69 changes: 19 additions & 50 deletions reedsolomon.go
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
r.o.maxGoroutines = codeGenMaxGoroutines
}

if r.canGFNI(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs) && r.o.maxGoroutines > gfniCodeGenMaxGoroutines {
if _, _, useGFNI := r.canGFNI(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && r.o.maxGoroutines > gfniCodeGenMaxGoroutines {
r.o.maxGoroutines = gfniCodeGenMaxGoroutines
}

Expand Down Expand Up @@ -660,7 +660,7 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro
m[iRow] = r.parity[iRow][idx : idx+1]
}
if r.o.useAvx512GFNI || r.o.useAvxGNFI {
r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false)
r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil)
} else {
r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil)
}
Expand Down Expand Up @@ -804,12 +804,6 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) {
return r.checkSomeShards(r.parity, shards[:r.dataShards], toCheck[:r.parityShards], len(shards[0])), nil
}

func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool {
return codeGen && (r.o.useAvx512GFNI || r.o.useAvxGNFI) &&
byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards &&
inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs
}

// Multiplies a subset of rows from a coding matrix by a full set of
// input shards to produce some output shards.
// 'matrixRows' holds the rows of the matrix to use.
Expand All @@ -833,14 +827,10 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
if end > len(inputs[0]) {
end = len(inputs[0])
}
if r.canGFNI(byteCount, len(inputs), len(outputs)) {
if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)); useGFNI {
var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64
m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:])
if r.o.useAvx512GFNI {
start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount)
} else {
start += galMulSlicesAvxGFNI(m, inputs, outputs, 0, byteCount)
}
start += (*galMulGFNI)(m, inputs, outputs, 0, byteCount)
end = len(inputs[0])
} else if galMulGen, _, ok := r.hasCodeGen(byteCount, len(inputs), len(outputs)); ok {
m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
Expand All @@ -866,19 +856,12 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
if len(outPer) > codeGenMaxOutputs {
outPer = outPer[:codeGenMaxOutputs]
}
if r.o.useAvx512GFNI {
m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
if inIdx == 0 {
start = galMulSlicesGFNI(m, inPer, outPer, 0, byteCount)
} else {
start = galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount)
}
} else if r.o.useAvxGNFI {
if useGFNI {
m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
if inIdx == 0 {
start = galMulSlicesAvxGFNI(m, inPer, outPer, 0, byteCount)
start = (*galMulGFNI)(m, inPer, outPer, 0, byteCount)
} else {
start = galMulSlicesAvxGFNIXor(m, inPer, outPer, 0, byteCount)
start = (*galMulGFNIXor)(m, inPer, outPer, 0, byteCount)
}
} else {
m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
Expand Down Expand Up @@ -926,18 +909,18 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
var genMatrix []byte
var gfniMatrix []uint64
galMulGen, _, useCodeGen := r.hasCodeGen(byteCount, len(inputs), len(outputs))
useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs))
galMulGFNI, _, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs))
if useGFNI {
var tmp [codeGenMaxInputs * codeGenMaxOutputs]uint64
gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:])
} else if useCodeGen {
genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
defer r.putTmpSlice(genMatrix)
} else if (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards &&
r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs) {
} else if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); useGFNI &&
byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards {
// It appears there is a switchover point at around 10MB where
// regular processing is faster...
r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true)
r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true, galMulGFNI, galMulGFNIXor)
return
} else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); ok &&
byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards {
Expand All @@ -955,11 +938,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
exec := func(start, stop int) {
if stop-start >= 64 {
if useGFNI {
if r.o.useAvx512GFNI {
start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop)
} else {
start += galMulSlicesAvxGFNI(gfniMatrix, inputs, outputs, start, stop)
}
start += (*galMulGFNI)(gfniMatrix, inputs, outputs, start, stop)
} else if useCodeGen {
start += (*galMulGen)(genMatrix, inputs, outputs, start, stop)
}
Expand Down Expand Up @@ -1167,7 +1146,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
// Perform the same as codeSomeShards, but split the workload into
// several goroutines.
// If clear is set, the first write will overwrite the output.
func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool, galMulGFNI, galMulGFNIXor *func(matrix []uint64, in, out [][]byte, start, stop int) int) {
var wg sync.WaitGroup
gor := r.o.maxGoroutines

Expand Down Expand Up @@ -1256,24 +1235,14 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
lstop = stop
}
for lstart < stop {
if lstop-lstart >= minCodeGenSize {
if galMulGFNI != nil && galMulGFNIXor != nil && lstop-lstart >= minCodeGenSize {
// Execute plan...
var n int
if r.o.useAvx512GFNI {
for _, p := range plan {
if p.first {
n = galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop)
} else {
n = galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop)
}
}
} else {
for _, p := range plan {
if p.first {
n = galMulSlicesAvxGFNI(p.m, p.input, p.output, lstart, lstop)
} else {
n = galMulSlicesAvxGFNIXor(p.m, p.input, p.output, lstart, lstop)
}
for _, p := range plan {
if p.first {
n = (*galMulGFNI)(p.m, p.input, p.output, lstart, lstop)
} else {
n = (*galMulGFNIXor)(p.m, p.input, p.output, lstart, lstop)
}
}
lstart += n
Expand Down

0 comments on commit 8b6d68a

Please sign in to comment.