Skip to content

Commit

Permalink
Support for ARM SVE and better performance for NEON
Browse files Browse the repository at this point in the history
  • Loading branch information
fwessels committed May 31, 2024
1 parent d4574a5 commit 7caecde
Show file tree
Hide file tree
Showing 5 changed files with 27,356 additions and 3 deletions.
6 changes: 3 additions & 3 deletions galois.go
Original file line number Diff line number Diff line change
Expand Up @@ -911,9 +911,9 @@ func galExp(a byte, n int) byte {
}

func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte {
if !avx2CodeGen {
panic("codegen not enabled")
}
// if !avx2CodeGen {
// panic("codegen not enabled")
// }
total := inputs * outputs

// Duplicated in+out
Expand Down
161 changes: 161 additions & 0 deletions galois_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@

package reedsolomon

import (
"fmt"
)

const pshufb = true

//go:noescape
Expand Down Expand Up @@ -49,6 +53,163 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
}
}

// galMulSlicesSve runs the ARM SVE kernel matching the number of output
// slices over the range that begins at start and spans stop-start.
//
// The kernel is selected by len(out), which must be in the range 1..10; any
// other count panics. len(in) is only used in the panic message: every kernel
// is a "10x<outputs>" variant, so the caller presumably guarantees a
// compatible input count (NOTE(review): confirm against the call site).
//
// The result is stop-start masked with maxInt-63 for the "_64" kernels (1 to
// 3 outputs) and with maxInt-31 for the others. Assuming maxInt is the
// largest int value, that rounds the length down to a multiple of 64 or 32,
// presumably the amount the kernel handled, leaving the tail to the caller.
func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int {
	size := stop - start
	outputs := len(out)

	// Reject unsupported output counts before touching any kernel.
	if outputs < 1 || outputs > 10 {
		panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out)))
	}

	switch outputs {
	case 1:
		mulSve_10x1_64(matrix, in, out, start, size)
	case 2:
		mulSve_10x2_64(matrix, in, out, start, size)
	case 3:
		mulSve_10x3_64(matrix, in, out, start, size)
	case 4:
		mulSve_10x4(matrix, in, out, start, size)
	case 5:
		mulSve_10x5(matrix, in, out, start, size)
	case 6:
		mulSve_10x6(matrix, in, out, start, size)
	case 7:
		mulSve_10x7(matrix, in, out, start, size)
	case 8:
		mulSve_10x8(matrix, in, out, start, size)
	case 9:
		mulSve_10x9(matrix, in, out, start, size)
	case 10:
		mulSve_10x10(matrix, in, out, start, size)
	}

	// The "_64" kernels (up to three outputs) use the 64 mask; the rest use 32.
	if outputs <= 3 {
		return size & (maxInt - 63)
	}
	return size & (maxInt - 31)
}

// galMulSlicesSveXor is the "Xor" counterpart of galMulSlicesSve: it runs
// the mulSve_10x<outputs>Xor kernel matching len(out) over the range that
// begins at start and spans stop-start. Judging by the name, these kernels
// presumably combine their result with the existing contents of out instead
// of overwriting them (NOTE(review): confirm against the assembly).
//
// len(out) must be in the range 1..10; any other count panics. len(in) is
// only used in the panic message.
//
// The result is stop-start masked with maxInt-63 for the "_64" kernels (1 to
// 3 outputs) and with maxInt-31 for the others. Assuming maxInt is the
// largest int value, that rounds the length down to a multiple of 64 or 32.
func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int {
	size := stop - start
	outputs := len(out)

	// Reject unsupported output counts before touching any kernel.
	if outputs < 1 || outputs > 10 {
		panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out)))
	}

	switch outputs {
	case 1:
		mulSve_10x1_64Xor(matrix, in, out, start, size)
	case 2:
		mulSve_10x2_64Xor(matrix, in, out, start, size)
	case 3:
		mulSve_10x3_64Xor(matrix, in, out, start, size)
	case 4:
		mulSve_10x4Xor(matrix, in, out, start, size)
	case 5:
		mulSve_10x5Xor(matrix, in, out, start, size)
	case 6:
		mulSve_10x6Xor(matrix, in, out, start, size)
	case 7:
		mulSve_10x7Xor(matrix, in, out, start, size)
	case 8:
		mulSve_10x8Xor(matrix, in, out, start, size)
	case 9:
		mulSve_10x9Xor(matrix, in, out, start, size)
	case 10:
		mulSve_10x10Xor(matrix, in, out, start, size)
	}

	// The "_64" kernels (up to three outputs) use the 64 mask; the rest use 32.
	if outputs <= 3 {
		return size & (maxInt - 63)
	}
	return size & (maxInt - 31)
}

// galMulSlicesNeon runs the ARM NEON kernel matching the number of output
// slices over the range that begins at start and spans stop-start.
//
// The kernel is selected by len(out), which must be in the range 1..10; any
// other count panics. len(in) is only used in the panic message: every kernel
// is a "10x<outputs>" variant, so the caller presumably guarantees a
// compatible input count (NOTE(review): confirm against the call site).
//
// The result is stop-start masked with maxInt-63 for the "_64" kernels (1 to
// 3 outputs) and with maxInt-31 for the others. Assuming maxInt is the
// largest int value, that rounds the length down to a multiple of 64 or 32,
// presumably the amount the kernel handled, leaving the tail to the caller.
func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int {
	size := stop - start
	outputs := len(out)

	// Reject unsupported output counts before touching any kernel.
	if outputs < 1 || outputs > 10 {
		panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out)))
	}

	switch outputs {
	case 1:
		mulNeon_10x1_64(matrix, in, out, start, size)
	case 2:
		mulNeon_10x2_64(matrix, in, out, start, size)
	case 3:
		mulNeon_10x3_64(matrix, in, out, start, size)
	case 4:
		mulNeon_10x4(matrix, in, out, start, size)
	case 5:
		mulNeon_10x5(matrix, in, out, start, size)
	case 6:
		mulNeon_10x6(matrix, in, out, start, size)
	case 7:
		mulNeon_10x7(matrix, in, out, start, size)
	case 8:
		mulNeon_10x8(matrix, in, out, start, size)
	case 9:
		mulNeon_10x9(matrix, in, out, start, size)
	case 10:
		mulNeon_10x10(matrix, in, out, start, size)
	}

	// The "_64" kernels (up to three outputs) use the 64 mask; the rest use 32.
	if outputs <= 3 {
		return size & (maxInt - 63)
	}
	return size & (maxInt - 31)
}

// galMulSlicesNeonXor is the "Xor" counterpart of galMulSlicesNeon: it runs
// the mulNeon_10x<outputs>Xor kernel matching len(out) over the range that
// begins at start and spans stop-start. Judging by the name, these kernels
// presumably combine their result with the existing contents of out instead
// of overwriting them (NOTE(review): confirm against the assembly).
//
// len(out) must be in the range 1..10; any other count panics. len(in) is
// only used in the panic message.
//
// The result is stop-start masked with maxInt-63 for the "_64" kernels (1 to
// 3 outputs) and with maxInt-31 for the others. Assuming maxInt is the
// largest int value, that rounds the length down to a multiple of 64 or 32.
func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int {
	size := stop - start
	outputs := len(out)

	// Reject unsupported output counts before touching any kernel.
	if outputs < 1 || outputs > 10 {
		panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out)))
	}

	switch outputs {
	case 1:
		mulNeon_10x1_64Xor(matrix, in, out, start, size)
	case 2:
		mulNeon_10x2_64Xor(matrix, in, out, start, size)
	case 3:
		mulNeon_10x3_64Xor(matrix, in, out, start, size)
	case 4:
		mulNeon_10x4Xor(matrix, in, out, start, size)
	case 5:
		mulNeon_10x5Xor(matrix, in, out, start, size)
	case 6:
		mulNeon_10x6Xor(matrix, in, out, start, size)
	case 7:
		mulNeon_10x7Xor(matrix, in, out, start, size)
	case 8:
		mulNeon_10x8Xor(matrix, in, out, start, size)
	case 9:
		mulNeon_10x9Xor(matrix, in, out, start, size)
	case 10:
		mulNeon_10x10Xor(matrix, in, out, start, size)
	}

	// The "_64" kernels (up to three outputs) use the 64 mask; the rest use 32.
	if outputs <= 3 {
		return size & (maxInt - 63)
	}
	return size & (maxInt - 31)
}

// 4-way butterfly
func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
Expand Down
123 changes: 123 additions & 0 deletions galois_gen_arm64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
//go:build !noasm && !appengine && !gccgo && !nopshufb

package reedsolomon

// ARM SVE kernel declarations.
//
// Each function below is declared without a body, so its implementation
// lives outside Go (presumably in an accompanying arm64 assembly file).
// The //go:noescape directive tells the compiler that the pointers passed as
// arguments do not escape through the call.
//
// Naming: mulSve_10x<K>[_64][Xor], where <K> is the number of output slices
// the kernel is selected for by galMulSlicesSve / galMulSlicesSveXor. Those
// wrappers pass n = stop - start and then report n rounded down with a 64
// mask for the "_64" variants and a 32 mask for the others, which suggests
// the kernels work on whole 64- or 32-byte blocks (NOTE(review): confirm
// against the assembly). The "10x" prefix and the "Xor" suffix presumably
// mean "up to 10 inputs" and "XOR into the existing output" respectively;
// verify before relying on either.

//go:noescape
func mulSve_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulSve_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

// ARM NEON kernel declarations.
//
// Each function below is declared without a body, so its implementation
// lives outside Go (presumably in an accompanying arm64 assembly file).
// The //go:noescape directive tells the compiler that the pointers passed as
// arguments do not escape through the call.
//
// Naming: mulNeon_10x<K>[_64][Xor], where <K> is the number of output slices
// the kernel is selected for by galMulSlicesNeon / galMulSlicesNeonXor. Those
// wrappers pass n = stop - start and then report n rounded down with a 64
// mask for the "_64" variants and a 32 mask for the others, which suggests
// the kernels work on whole 64- or 32-byte blocks (NOTE(review): confirm
// against the assembly). The "10x" prefix and the "Xor" suffix presumably
// mean "up to 10 inputs" and "XOR into the existing output" respectively;
// verify before relying on either.

//go:noescape
func mulNeon_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)

//go:noescape
func mulNeon_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
Loading

0 comments on commit 7caecde

Please sign in to comment.