Define the pshufb constant per platform and split the slice-XOR helpers into
their own files.

* _gen/gen.go drops the legacy "// +build generate" line and no longer emits
  "const pshufb" into the generated switch files; the constant is now declared
  by hand in galois_amd64.go, galois_arm64.go and galois_ppc64le.go (true) and
  in galois_noasm.go and galois_nopshufb_amd64.go (false).
* The arm64 and ppc64le files are gated on "!nopshufb" instead of the
  inconsistent "pshufb" / "!pshufb" tags.
* The NEON XOR routine moves from galois_arm64.{go,s} to xor_arm64.{go,s} and
  is renamed galXorNEON -> xorSliceNEON; the new files carry no nopshufb term
  in their build constraints.
* The pure Go sliceXor moves from galois_noasm.go to xor_noasm.go.
* reedsolomon.go requires pshufb for the AVX2 code-gen paths; the GFNI
  alternative in EncodeIdx is not gated on it.

NOTE(review): compared with the patch as first proposed, the xor_noasm.go
build constraint additionally contains "(!amd64 && !arm64 && !ppc64le)".
galois_noasm.go no longer defines sliceXor, so without that term a default
build on any other architecture appears to have no sliceXor at all. This
assumes galois_noasm.go is the file built on those architectures; its build
constraint is not part of this patch - confirm. The "index" line of
xor_noasm.go still names the blob id of the original proposal.

NOTE(review): whitespace inside context and removed lines was reconstructed
(tabs for indentation, single spaces elsewhere). If strict matching fails,
apply with "git apply --ignore-whitespace" or "patch -l".

diff --git a/_gen/gen.go b/_gen/gen.go
index b17eb0d0..30b71780 100644
--- a/_gen/gen.go
+++ b/_gen/gen.go
@@ -1,5 +1,4 @@
 //go:build generate
-// +build generate
 
 // Copyright 2022+, Klaus Post. See LICENSE for details.
 
@@ -110,7 +109,6 @@ import (
 )
 `)
 
-	w.WriteString(fmt.Sprintf("const pshufb = %v\n\n", pshufb))
 	w.WriteString(fmt.Sprintf(`const (
 avx2CodeGen = true
 maxAvx2Inputs = %d
diff --git a/galois_amd64.go b/galois_amd64.go
index 23adfa42..c7ab3663 100644
--- a/galois_amd64.go
+++ b/galois_amd64.go
@@ -4,6 +4,8 @@
 
 package reedsolomon
 
+const pshufb = true
+
 //go:noescape
 func galMulSSSE3(low, high, in, out []byte)
 
diff --git a/galois_arm64.go b/galois_arm64.go
index 0479c902..8ef402bf 100644
--- a/galois_arm64.go
+++ b/galois_arm64.go
@@ -1,19 +1,18 @@
-//go:build !noasm && !appengine && !gccgo && pshufb
+//go:build !noasm && !appengine && !gccgo && !nopshufb
 
 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2017, Minio, Inc.
 
 package reedsolomon
 
+const pshufb = true
+
 //go:noescape
 func galMulNEON(low, high, in, out []byte)
 
 //go:noescape
 func galMulXorNEON(low, high, in, out []byte)
 
-//go:noescape
-func galXorNEON(in, out []byte)
-
 func galMulSlice(c byte, in, out []byte, o *options) {
 	if c == 1 {
 		copy(out, in)
@@ -50,20 +49,6 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 	}
 }
 
-// simple slice xor
-func sliceXor(in, out []byte, o *options) {
-
-	galXorNEON(in, out)
-	done := (len(in) >> 5) << 5
-
-	remain := len(in) - done
-	if remain > 0 {
-		for i := done; i < len(in); i++ {
-			out[i] ^= in[i]
-		}
-	}
-}
-
 // 4-way butterfly
 func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
 	ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
@@ -89,7 +74,7 @@ func fftDIT2(x, y []byte, log_m ffe, o *options) {
 	// Reference version:
 	refMulAdd(x, y, log_m)
 	// 64 byte aligned, always full.
-	galXorNEON(x, y)
+	xorSliceNEON(x, y)
 }
 
 // 2-way butterfly forward
@@ -102,7 +87,7 @@ func fftDIT28(x, y []byte, log_m ffe8, o *options) {
 // 2-way butterfly
 func ifftDIT2(x, y []byte, log_m ffe, o *options) {
 	// 64 byte aligned, always full.
-	galXorNEON(x, y)
+	xorSliceNEON(x, y)
 	// Reference version:
 	refMulAdd(x, y, log_m)
 }
diff --git a/galois_arm64.s b/galois_arm64.s
index 5c042bea..772dfac9 100644
--- a/galois_arm64.s
+++ b/galois_arm64.s
@@ -1,7 +1,7 @@
 //+build !noasm
 //+build !appengine
 //+build !gccgo
-//+build !pshufb
+//+build !nopshufb
 
 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2017, Minio, Inc.
@@ -100,29 +100,3 @@ loopXor:
 
 completeXor:
 	RET
-
-// func galXorNEON(in, out []byte)
-TEXT ·galXorNEON(SB), 7, $0
-	MOVD in_base+0(FP), R1
-	MOVD in_len+8(FP), R2 // length of message
-	MOVD out_base+24(FP), R5
-	SUBS $32, R2
-	BMI completeXor
-
-loopXor:
-	// Main loop
-	VLD1.P 32(R1), [V0.B16, V1.B16]
-	VLD1 (R5), [V20.B16, V21.B16]
-
-	VEOR V20.B16, V0.B16, V4.B16
-	VEOR V21.B16, V1.B16, V5.B16
-
-	// Store result
-	VST1.P [V4.D2, V5.D2], 32(R5)
-
-	SUBS $32, R2
-	BPL loopXor
-
-completeXor:
-	RET
-
diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go
index d3c1fa66..28c50658 100644
--- a/galois_gen_switch_amd64.go
+++ b/galois_gen_switch_amd64.go
@@ -9,8 +9,6 @@ import (
 	"fmt"
 )
 
-const pshufb = true
-
 const (
 	avx2CodeGen = true
 	maxAvx2Inputs = 10
diff --git a/galois_gen_switch_nopshufb_amd64.go b/galois_gen_switch_nopshufb_amd64.go
index a900c2b9..888df307 100644
--- a/galois_gen_switch_nopshufb_amd64.go
+++ b/galois_gen_switch_nopshufb_amd64.go
@@ -9,8 +9,6 @@ import (
 	"fmt"
 )
 
-const pshufb = false
-
 const (
 	avx2CodeGen = true
 	maxAvx2Inputs = 10
diff --git a/galois_noasm.go b/galois_noasm.go
index 04d184a4..fb5a3b65 100644
--- a/galois_noasm.go
+++ b/galois_noasm.go
@@ -4,6 +4,8 @@
 
 package reedsolomon
 
+const pshufb = false
+
 func galMulSlice(c byte, in, out []byte, o *options) {
 	out = out[:len(in)]
 	if c == 1 {
@@ -28,11 +30,6 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 	}
 }
 
-// simple slice xor
-func sliceXor(in, out []byte, o *options) {
-	sliceXorGo(in, out, o)
-}
-
 func init() {
 	defaultOptions.useAVX512 = false
 }
diff --git a/galois_nopshufb_amd64.go b/galois_nopshufb_amd64.go
index 06b32d88..707eb96f 100644
--- a/galois_nopshufb_amd64.go
+++ b/galois_nopshufb_amd64.go
@@ -7,6 +7,8 @@ package reedsolomon
 // bigSwitchover is the size where 64 bytes are processed per loop.
 const bigSwitchover = 128
 
+const pshufb = false
+
 // simple slice xor
 func sliceXor(in, out []byte, o *options) {
 	if o.useSSE2 {
diff --git a/galois_ppc64le.go b/galois_ppc64le.go
index 01458b5b..31b5da9d 100644
--- a/galois_ppc64le.go
+++ b/galois_ppc64le.go
@@ -1,10 +1,12 @@
-//go:build !noasm && !appengine && !gccgo && !pshufb
+//go:build !noasm && !appengine && !gccgo && !nopshufb
 
 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2018, Minio, Inc.
 
 package reedsolomon
 
+const pshufb = true
+
 //go:noescape
 func galMulPpc(low, high, in, out []byte)
 
diff --git a/reedsolomon.go b/reedsolomon.go
index 3fad26f9..75ffc4ff 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -652,7 +652,7 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro
 		return ErrShardSize
 	}
 
-	if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && (r.o.useAVX2 || r.o.useGFNI) {
+	if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useGFNI) {
 		m := make([][]byte, r.parityShards)
 		for iRow := range m {
 			m[iRow] = r.parity[iRow][idx : idx+1]
@@ -803,7 +803,7 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) {
 }
 
 func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
-	return avx2CodeGen && r.o.useAVX2 &&
+	return avx2CodeGen && pshufb && r.o.useAVX2 &&
 		byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
 		inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
 }
diff --git a/xor_arm64.go b/xor_arm64.go
new file mode 100644
index 00000000..6f0522f8
--- /dev/null
+++ b/xor_arm64.go
@@ -0,0 +1,19 @@
+//go:build !noasm && !appengine && !gccgo
+
+package reedsolomon
+
+//go:noescape
+func xorSliceNEON(in, out []byte)
+
+// simple slice xor
+func sliceXor(in, out []byte, o *options) {
+	xorSliceNEON(in, out)
+	done := (len(in) >> 5) << 5
+
+	remain := len(in) - done
+	if remain > 0 {
+		for i := done; i < len(in); i++ {
+			out[i] ^= in[i]
+		}
+	}
+}
diff --git a/xor_arm64.s b/xor_arm64.s
new file mode 100644
index 00000000..56298731
--- /dev/null
+++ b/xor_arm64.s
@@ -0,0 +1,29 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+
+// func xorSliceNEON(in, out []byte)
+TEXT ·xorSliceNEON(SB), 7, $0
+	MOVD in_base+0(FP), R1
+	MOVD in_len+8(FP), R2 // length of message
+	MOVD out_base+24(FP), R5
+	SUBS $32, R2
+	BMI completeXor
+
+loopXor:
+	// Main loop
+	VLD1.P 32(R1), [V0.B16, V1.B16]
+	VLD1 (R5), [V20.B16, V21.B16]
+
+	VEOR V20.B16, V0.B16, V4.B16
+	VEOR V21.B16, V1.B16, V5.B16
+
+	// Store result
+	VST1.P [V4.D2, V5.D2], 32(R5)
+
+	SUBS $32, R2
+	BPL loopXor
+
+completeXor:
+	RET
+
diff --git a/xor_noasm.go b/xor_noasm.go
new file mode 100644
index 00000000..f1336678
--- /dev/null
+++ b/xor_noasm.go
@@ -0,0 +1,8 @@
+//go:build noasm || gccgo || appengine || (ppc64le && nopshufb) || (!amd64 && !arm64 && !ppc64le)
+
+package reedsolomon
+
+// sliceXor is the simple slice xor; this variant delegates to sliceXorGo.
+func sliceXor(in, out []byte, o *options) {
+	sliceXorGo(in, out, o)
+}