From fcb2dea4915455990a1f27c6e94c88823640c247 Mon Sep 17 00:00:00 2001
From: Frank Wessels <fwessels@xs4all.nl>
Date: Tue, 4 Jun 2024 14:52:29 -0700
Subject: [PATCH] Code refactoring to support code generation for both amd64
 and arm64

---
 galois.go                           |  10 +-
 galois_amd64_test.go                |  13 ++
 galois_arm64.go                     | 161 ----------------------
 galois_arm64_test.go                |  17 +++
 galois_gen_none.go                  |  22 ++-
 galois_gen_switch_amd64.go          |  16 ++-
 galois_gen_switch_arm64.go          | 202 ++++++++++++++++++++++++++++
 galois_gen_switch_nopshufb_amd64.go |  29 +++-
 galois_test.go                      |  72 +++++-----
 options.go                          |  11 +-
 reedsolomon.go                      | 143 ++++++++++----------
 11 files changed, 391 insertions(+), 305 deletions(-)
 create mode 100644 galois_amd64_test.go
 create mode 100644 galois_arm64_test.go
 create mode 100644 galois_gen_switch_arm64.go

diff --git a/galois.go b/galois.go
index 466ced2b..9b363950 100644
--- a/galois.go
+++ b/galois.go
@@ -910,10 +910,10 @@ func galExp(a byte, n int) byte {
 	return expTable[uint8(logResult)]
 }
 
-func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte {
-	// if !avx2CodeGen {
-	// 	panic("codegen not enabled")
-	// }
+func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte {
+	if !codeGen {
+		panic("codegen not enabled")
+	}
 	total := inputs * outputs
 
 	// Duplicated in+out
@@ -942,7 +942,7 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte)
 var gf2p811dMulMatrices = [256]uint64{0, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0, 0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0, 0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0, 0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0, 0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448, 0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468, 0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58, 0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78, 0x8810a8d83871e2c4, 0x8912acd02851a244, 0x8112a5cb061c284, 0x9132e54a0418204, 0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224, 0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14, 0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34, 0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c, 0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac, 0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c, 0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc, 0xc488d46c1c3871e2, 0xc58ad0640c183162, 0x448956e8942851a2, 0x458b52e084081122, 0x840895aed8b061c2, 0x850a91a6c8902142, 0x409172a50a04182, 0x50b132240800102, 0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932, 0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912, 0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa, 0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a, 0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba, 0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a, 0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 
0xcd9bfa38bc79f3e6, 0xc183d76e0c18306, 0xd1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6, 0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6, 0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6, 0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e, 0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e, 0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e, 0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e, 0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1, 0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891, 0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1, 0x82048b95a850a041, 0x83068f9db870e0c1, 0x205091120408001, 0x3070d193060c081, 0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39, 0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19, 0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429, 0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409, 0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75, 0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55, 0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265, 0xa14234d90214285, 0xb16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245, 0x7af4925ec78f1e3d, 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd, 0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd, 0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed, 0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd, 0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953, 0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 
0xe7cf7914ce9c3973, 0x60c1e3b70e0c183, 0x70e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143, 0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163, 0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb, 0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb, 0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb, 0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb, 0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97, 0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7, 0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0xe1d3467c0810307, 0xf1f306fd0a14387, 0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 0x4f9f71ad142953a7, 0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f, 0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f, 0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f, 0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f}
 
 func genGFNIMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []uint64) []uint64 {
-	if !avx2CodeGen {
+	if !codeGen {
 		panic("codegen not enabled")
 	}
 	total := inputs * outputs
diff --git a/galois_amd64_test.go b/galois_amd64_test.go
new file mode 100644
index 00000000..55094bef
--- /dev/null
+++ b/galois_amd64_test.go
@@ -0,0 +1,18 @@
+//go:build !appengine && !noasm && gc && !nogen && !nopshufb
+// +build !appengine,!noasm,gc,!nogen,!nopshufb
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package reedsolomon
+
+import (
+	"testing"
+)
+
+// TestGenGalois runs the shared generated-kernel test suite against the AVX2
+// kernels when defaultOptions.useAVX2 is set.
+func TestGenGalois(t *testing.T) {
+	if defaultOptions.useAVX2 {
+		testGenGaloisUpto10x10(t, galMulSlicesAvx2, galMulSlicesAvx2Xor)
+	}
+}
diff --git a/galois_arm64.go b/galois_arm64.go
index 1e636466..8ef402bf 100644
--- a/galois_arm64.go
+++ b/galois_arm64.go
@@ -5,10 +5,6 @@
 
 package reedsolomon
 
-import (
-	"fmt"
-)
-
 const pshufb = true
 
 //go:noescape
@@ -53,163 +49,6 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 	}
 }
 
-// galMulSlicesSve
-func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := stop - start
-
-	// fmt.Println(len(in), len(out))
-	switch len(out) {
-	case 1:
-		mulSve_10x1_64(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 2:
-		mulSve_10x2_64(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 3:
-		mulSve_10x3_64(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 4:
-		mulSve_10x4(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 5:
-		mulSve_10x5(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 6:
-		mulSve_10x6(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 7:
-		mulSve_10x7(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 8:
-		mulSve_10x8(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 9:
-		mulSve_10x9(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 10:
-		mulSve_10x10(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	}
-	panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out)))
-}
-
-// galMulSlicesSveXor
-func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := (stop - start)
-
-	switch len(out) {
-	case 1:
-		mulSve_10x1_64Xor(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 2:
-		mulSve_10x2_64Xor(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 3:
-		mulSve_10x3_64Xor(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 4:
-		mulSve_10x4Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 5:
-		mulSve_10x5Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 6:
-		mulSve_10x6Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 7:
-		mulSve_10x7Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 8:
-		mulSve_10x8Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 9:
-		mulSve_10x9Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 10:
-		mulSve_10x10Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	}
-	panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out)))
-}
-
-// galMulSlicesNeon
-func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := stop - start
-
-	switch len(out) {
-	case 1:
-		mulNeon_10x1_64(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 2:
-		mulNeon_10x2_64(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 3:
-		mulNeon_10x3_64(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 4:
-		mulNeon_10x4(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 5:
-		mulNeon_10x5(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 6:
-		mulNeon_10x6(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 7:
-		mulNeon_10x7(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 8:
-		mulNeon_10x8(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 9:
-		mulNeon_10x9(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 10:
-		mulNeon_10x10(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	}
-	panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out)))
-}
-
-// galMulSlicesNeonXor
-func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := (stop - start)
-
-	switch len(out) {
-	case 1:
-		mulNeon_10x1_64Xor(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 2:
-		mulNeon_10x2_64Xor(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 3:
-		mulNeon_10x3_64Xor(matrix, in, out, start, n)
-		return n & (maxInt - 63)
-	case 4:
-		mulNeon_10x4Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 5:
-		mulNeon_10x5Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 6:
-		mulNeon_10x6Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 7:
-		mulNeon_10x7Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 8:
-		mulNeon_10x8Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 9:
-		mulNeon_10x9Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	case 10:
-		mulNeon_10x10Xor(matrix, in, out, start, n)
-		return n & (maxInt - 31)
-	}
-	panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out)))
-}
-
 // 4-way butterfly
 func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
 	ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
diff --git a/galois_arm64_test.go b/galois_arm64_test.go
new file mode 100644
index 00000000..dd27bb60
--- /dev/null
+++ b/galois_arm64_test.go
@@ -0,0 +1,22 @@
+//go:build !appengine && !noasm && gc && !nogen && !nopshufb
+// +build !appengine,!noasm,gc,!nogen,!nopshufb
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2024, Minio, Inc.
+
+package reedsolomon
+
+import (
+	"testing"
+)
+
+// TestGenGalois runs the shared generated-kernel test suite against each
+// arm64 kernel family (SVE, NEON) whose flag is set in defaultOptions.
+func TestGenGalois(t *testing.T) {
+	if defaultOptions.useSVE {
+		testGenGaloisUpto10x10(t, galMulSlicesSve, galMulSlicesSveXor)
+	}
+	if defaultOptions.useNEON {
+		testGenGaloisUpto10x10(t, galMulSlicesNeon, galMulSlicesNeonXor)
+	}
+}
diff --git a/galois_gen_none.go b/galois_gen_none.go
index 1bb268a3..20323f08 100644
--- a/galois_gen_none.go
+++ b/galois_gen_none.go
@@ -1,19 +1,17 @@
-//go:build !amd64 || noasm || appengine || gccgo || nogen
+//go:build !(amd64 || arm64) || noasm || appengine || gccgo || nogen
 
 package reedsolomon
 
-const maxAvx2Inputs = 1
-const maxAvx2Outputs = 1
-const minAvx2Size = 1
-const avxSizeMask = 0
-const avx2CodeGen = false
+const (
+	codeGen              = false
+	codeGenMaxGoroutines = 8
+	codeGenMaxInputs     = 1
+	codeGenMaxOutputs    = 1
+	minCodeGenSize       = 1
+)
 
-func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
-	panic("codegen not available")
-}
-
-func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
-	panic("codegen not available")
+func (r *reedSolomon) hasCodeGen(_ int, _, _ int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) {
+	return nil, nil, false
 }
 
 func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go
index 429e2c20..2826a129 100644
--- a/galois_gen_switch_amd64.go
+++ b/galois_gen_switch_amd64.go
@@ -10,12 +10,20 @@ import (
 )
 
 const (
-	avx2CodeGen    = true
-	maxAvx2Inputs  = 10
-	maxAvx2Outputs = 10
-	minAvx2Size    = 64
+	codeGen              = true // gates genCodeGenMatrix/genGFNIMatrix and the codegen paths
+	codeGenMaxGoroutines = 8    // New() lowers maxGoroutines to this when hasCodeGen reports ok
+	codeGenMaxInputs     = 10   // hasCodeGen rejects calls with more inputs than this
+	codeGenMaxOutputs    = 10   // hasCodeGen rejects calls with more outputs than this
+	minCodeGenSize       = 64
 )
 
+func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) {
+	f, fXor := galMulSlicesAvx2, galMulSlicesAvx2Xor
+	return &f, &fXor, codeGen && pshufb && r.o.useAVX2 &&
+		byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards &&
+		inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs
+}
+
 func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
 	n := stop - start
 
diff --git a/galois_gen_switch_arm64.go b/galois_gen_switch_arm64.go
new file mode 100644
index 00000000..3e912a99
--- /dev/null
+++ b/galois_gen_switch_arm64.go
@@ -0,0 +1,202 @@
+//go:build !appengine && !noasm && gc && !nogen && !nopshufb
+// +build !appengine,!noasm,gc,!nogen,!nopshufb
+
+package reedsolomon
+
+import (
+	"fmt"
+)
+
+const (
+	codeGen              = true
+	codeGenMaxGoroutines = 16
+	codeGenMaxInputs     = 10
+	codeGenMaxOutputs    = 10
+	minCodeGenSize       = 64
+)
+
+func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) {
+	if r.o.useSVE {
+		f, fXor := galMulSlicesSve, galMulSlicesSveXor
+		return &f, &fXor, codeGen && pshufb &&
+			byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards &&
+			inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs
+	}
+	f, fXor := galMulSlicesNeon, galMulSlicesNeonXor
+	return &f, &fXor, codeGen && pshufb && r.o.useNEON &&
+		byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards &&
+		inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs
+}
+
+// galMulSlicesSve
+func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int {
+	n := stop - start
+
+	// fmt.Println(len(in), len(out))
+	switch len(out) {
+	case 1:
+		mulSve_10x1_64(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 2:
+		mulSve_10x2_64(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 3:
+		mulSve_10x3_64(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 4:
+		mulSve_10x4(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 5:
+		mulSve_10x5(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 6:
+		mulSve_10x6(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 7:
+		mulSve_10x7(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 8:
+		mulSve_10x8(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 9:
+		mulSve_10x9(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 10:
+		mulSve_10x10(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	}
+	panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out)))
+}
+
+// galMulSlicesSveXor
+func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int {
+	n := (stop - start)
+
+	switch len(out) {
+	case 1:
+		mulSve_10x1_64Xor(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 2:
+		mulSve_10x2_64Xor(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 3:
+		mulSve_10x3_64Xor(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 4:
+		mulSve_10x4Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 5:
+		mulSve_10x5Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 6:
+		mulSve_10x6Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 7:
+		mulSve_10x7Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 8:
+		mulSve_10x8Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 9:
+		mulSve_10x9Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 10:
+		mulSve_10x10Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	}
+	panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out)))
+}
+
+// galMulSlicesNeon
+func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int {
+	n := stop - start
+
+	switch len(out) {
+	case 1:
+		mulNeon_10x1_64(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 2:
+		mulNeon_10x2_64(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 3:
+		mulNeon_10x3_64(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 4:
+		mulNeon_10x4(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 5:
+		mulNeon_10x5(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 6:
+		mulNeon_10x6(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 7:
+		mulNeon_10x7(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 8:
+		mulNeon_10x8(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 9:
+		mulNeon_10x9(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 10:
+		mulNeon_10x10(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	}
+	panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out)))
+}
+
+// galMulSlicesNeonXor
+func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int {
+	n := (stop - start)
+
+	switch len(out) {
+	case 1:
+		mulNeon_10x1_64Xor(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 2:
+		mulNeon_10x2_64Xor(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 3:
+		mulNeon_10x3_64Xor(matrix, in, out, start, n)
+		return n & (maxInt - 63)
+	case 4:
+		mulNeon_10x4Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 5:
+		mulNeon_10x5Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 6:
+		mulNeon_10x6Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 7:
+		mulNeon_10x7Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 8:
+		mulNeon_10x8Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 9:
+		mulNeon_10x9Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	case 10:
+		mulNeon_10x10Xor(matrix, in, out, start, n)
+		return n & (maxInt - 31)
+	}
+	panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out)))
+}
+
+func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
+	panic("codegen not available")
+}
+
+func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
+	panic("codegen not available")
+}
+
+func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
+	panic("codegen not available")
+}
+
+func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
+	panic("codegen not available")
+}
diff --git a/galois_gen_switch_nopshufb_amd64.go b/galois_gen_switch_nopshufb_amd64.go
index 1ba08b5e..d23bca18 100644
--- a/galois_gen_switch_nopshufb_amd64.go
+++ b/galois_gen_switch_nopshufb_amd64.go
@@ -10,14 +10,16 @@ import (
 )
 
 const (
-	avx2CodeGen    = true
-	maxAvx2Inputs  = 10
-	maxAvx2Outputs = 10
-	minAvx2Size    = 64
+	codeGen              = true
+	codeGenMaxGoroutines = 8 // referenced by New() in reedsolomon.go; must exist in every build
+	codeGenMaxInputs     = 10
+	codeGenMaxOutputs    = 10
+	minCodeGenSize       = 64
 )
 
-func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int    { panic(`no pshufb`) }
-func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) }
+func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) {
+	return nil, nil, false // no code generation for generic case (only GFNI cases)
+}
 
 func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
 	n := (stop - start) & (maxInt - (64 - 1))
@@ -1370,3 +1372,19 @@ func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int)
 	}
 	panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
 }
+
+func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int {
+	panic("codegen not available")
+}
+
+func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int {
+	panic("codegen not available")
+}
+
+func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int {
+	panic("codegen not available")
+}
+
+func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int {
+	panic("codegen not available")
+}
diff --git a/galois_test.go b/galois_test.go
index d0aea694..580b216c 100644
--- a/galois_test.go
+++ b/galois_test.go
@@ -270,7 +270,7 @@ func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f f
 		}
 	}
 
-	m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), nil)
+	m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil)
 
 	end := start + f(m, inputs, outputs, start, stop)
 	if end != stop {
@@ -327,7 +327,7 @@ func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int,
 		}
 	}
 
-	m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), nil)
+	m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil)
 
 	end := start + f(m, inputs, outputs, start, stop)
 	if end != stop {
@@ -363,54 +363,42 @@ func testGenGaloisEarlyAbort(t *testing.T, matrixRows [][]byte, size int, f func
 	}
 }
 
-func TestGenGalois(t *testing.T) {
+func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [][]byte, start, stop int) int) {
 
-	testUpto10x10 := func(f func(matrix []byte, in, out [][]byte, start, stop int) int,
-		fXor func(matrix []byte, in, out [][]byte, start, stop int) int) {
-
-		for output := 1; output <= 10; output++ {
-			for input := 1; input <= 10; input++ {
-				matrixRows := make([][]byte, input)
-				for i := range matrixRows {
-					matrixRows[i] = make([]byte, output)
-					for j := range matrixRows[i] {
-						matrixRows[i][j] = byte(mathrand.Intn(16))
-					}
+	for output := 1; output <= codeGenMaxOutputs; output++ {
+		for input := 1; input <= codeGenMaxInputs; input++ {
+			matrixRows := make([][]byte, input)
+			for i := range matrixRows {
+				matrixRows[i] = make([]byte, output)
+				for j := range matrixRows[i] {
+					matrixRows[i][j] = byte(mathrand.Intn(16))
 				}
+			}
 
-				size, stepsize := 32, 32
-				if input <= 3 {
-					size, stepsize = 64, 64 // 3x? are all _64 versions
-				}
+			size, stepsize := 32, 32
+			if input <= 3 {
+				size, stepsize = 64, 64 // 3x? are all _64 versions
+			}
 
-				// test early abort
-				testGenGaloisEarlyAbort(t, matrixRows, size-1, f)
-				testGenGaloisEarlyAbort(t, matrixRows, size-1, fXor)
-				const limit = 1024
-				for ; size < limit; size += stepsize {
-					// test full range
-					testGenGalois(t, matrixRows, size, 0, size, f)
-					testGenGaloisXor(t, matrixRows, size, 0, size, fXor)
-
-					if size >= stepsize*2 && size < limit-stepsize*2 {
-						start := stepsize
-						stop := size - start
-						// test partial range
-						testGenGalois(t, matrixRows, size, start, stop, f)
-						testGenGaloisXor(t, matrixRows, size, start, stop, fXor)
-					}
+			// test early abort
+			testGenGaloisEarlyAbort(t, matrixRows, size-1, f)
+			testGenGaloisEarlyAbort(t, matrixRows, size-1, fXor)
+			const limit = 1024
+			for ; size < limit; size += stepsize {
+				// test full range
+				testGenGalois(t, matrixRows, size, 0, size, f)
+				testGenGaloisXor(t, matrixRows, size, 0, size, fXor)
+
+				if size >= stepsize*2 && size < limit-stepsize*2 {
+					start := stepsize
+					stop := size - start
+					// test partial range
+					testGenGalois(t, matrixRows, size, start, stop, f)
+					testGenGaloisXor(t, matrixRows, size, start, stop, fXor)
 				}
 			}
 		}
 	}
-
-	testSVE, testNEON := false, true
-	if testSVE {
-		testUpto10x10(galMulSlicesSve, galMulSlicesSveXor)
-	}
-	if testNEON {
-		testUpto10x10(galMulSlicesNeon, galMulSlicesNeonXor)
-	}
 }
 
 func benchmarkGalois(b *testing.B, size int) {
diff --git a/options.go b/options.go
index 73cc7d6d..377137ef 100644
--- a/options.go
+++ b/options.go
@@ -21,7 +21,9 @@ type options struct {
 	useAVX512,
 	useAVX2,
 	useSSSE3,
-	useSSE2 bool
+	useSSE2,
+	useNEON,
+	useSVE bool
 
 	useJerasureMatrix    bool
 	usePAR1Matrix        bool
@@ -51,6 +53,8 @@ var defaultOptions = options{
 	useAVX512:     cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL),
 	useAvx512GFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ),
 	useAvxGNFI:    cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI),
+	useNEON:       cpuid.CPU.Supports(cpuid.ASIMD),
+	useSVE:        cpuid.CPU.Supports(cpuid.SVE),
 }
 
 // leopardMode controls the use of leopard GF in encoding and decoding.
@@ -316,6 +320,11 @@ func (o *options) cpuOptions() string {
 	if o.useAvxGNFI {
 		res = append(res, "AVX+GFNI")
 	}
+	if o.useSVE {
+		res = append(res, "ARM+SVE")
+	} else if o.useNEON {
+		res = append(res, "ARM+NEON")
+	}
 	if len(res) == 0 {
 		return "pure Go"
 	}
diff --git a/reedsolomon.go b/reedsolomon.go
index bebba044..3a8d7d30 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -153,9 +153,8 @@ type Extensions interface {
 }
 
 const (
-	avx2CodeGenMinSize       = 64
-	avx2CodeGenMinShards     = 3
-	avx2CodeGenMaxGoroutines = 8
+	codeGenMinSize           = 64
+	codeGenMinShards         = 3
 	gfniCodeGenMaxGoroutines = 4
 
 	intSize = 32 << (^uint(0) >> 63) // 32 or 64
@@ -482,21 +481,23 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 		r.o.perRound = 128 << 10
 	}
 
+	_, _, useCodeGen := r.hasCodeGen(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs)
+
 	divide := parityShards + 1
-	if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) {
+	if codeGen && useCodeGen && (dataShards > codeGenMaxInputs || parityShards > codeGenMaxOutputs) {
 		// Base on L1 cache if we have many inputs.
 		r.o.perRound = cpuid.CPU.Cache.L1D
 		if r.o.perRound < 32<<10 {
 			r.o.perRound = 32 << 10
 		}
 		divide = 0
-		if dataShards > maxAvx2Inputs {
-			divide += maxAvx2Inputs
+		if dataShards > codeGenMaxInputs {
+			divide += codeGenMaxInputs
 		} else {
 			divide += dataShards
 		}
-		if parityShards > maxAvx2Inputs {
-			divide += maxAvx2Outputs
+		if parityShards > codeGenMaxOutputs {
+			divide += codeGenMaxOutputs
 		} else {
 			divide += parityShards
 		}
@@ -555,11 +556,11 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 
 	// Generated AVX2 does not need data to stay in L1 cache between runs.
 	// We will be purely limited by RAM speed.
-	if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines {
-		r.o.maxGoroutines = avx2CodeGenMaxGoroutines
+	if useCodeGen && r.o.maxGoroutines > codeGenMaxGoroutines {
+		r.o.maxGoroutines = codeGenMaxGoroutines
 	}
 
-	if r.canGFNI(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > gfniCodeGenMaxGoroutines {
+	if r.canGFNI(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs) && r.o.maxGoroutines > gfniCodeGenMaxGoroutines {
 		r.o.maxGoroutines = gfniCodeGenMaxGoroutines
 	}
 
@@ -577,7 +578,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 		r.parity[i] = r.m[dataShards+i]
 	}
 
-	if avx2CodeGen && r.o.useAVX2 {
+	if codeGen && useCodeGen {
 		sz := r.dataShards * r.parityShards * 2 * 32
 		r.mPool.New = func() interface{} {
 			return AllocAligned(1, sz)[0]
@@ -653,7 +654,7 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro
 		return ErrShardSize
 	}
 
-	if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useAvx512GFNI || r.o.useAvxGNFI) {
+	if codeGen && len(dataShard) >= r.o.perRound && len(parity) >= codeGenMinShards && (pshufb || r.o.useAvx512GFNI || r.o.useAvxGNFI) {
 		m := make([][]byte, r.parityShards)
 		for iRow := range m {
 			m[iRow] = r.parity[iRow][idx : idx+1]
@@ -661,7 +662,7 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro
 		if r.o.useAvx512GFNI || r.o.useAvxGNFI {
 			r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false)
 		} else {
-			r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false)
+			r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil)
 		}
 		return nil
 	}
@@ -803,16 +804,10 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) {
 	return r.checkSomeShards(r.parity, shards[:r.dataShards], toCheck[:r.parityShards], len(shards[0])), nil
 }
 
-func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
-	return avx2CodeGen && pshufb && r.o.useAVX2 &&
-		byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
-		inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
-}
-
 func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool {
-	return avx2CodeGen && (r.o.useAvx512GFNI || r.o.useAvxGNFI) &&
-		byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
-		inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
+	return codeGen && (r.o.useAvx512GFNI || r.o.useAvxGNFI) &&
+		byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards &&
+		inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs
 }
 
 // Multiplies a subset of rows from a coding matrix by a full set of
@@ -839,7 +834,7 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 		end = len(inputs[0])
 	}
 	if r.canGFNI(byteCount, len(inputs), len(outputs)) {
-		var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64
+		var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64
 		m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:])
 		if r.o.useAvx512GFNI {
 			start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount)
@@ -847,13 +842,13 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 			start += galMulSlicesAvxGFNI(m, inputs, outputs, 0, byteCount)
 		}
 		end = len(inputs[0])
-	} else if r.canAVX2C(byteCount, len(inputs), len(outputs)) {
-		m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
-		start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
+	} else if galMulGen, _, ok := r.hasCodeGen(byteCount, len(inputs), len(outputs)); ok {
+		m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
+		start += (*galMulGen)(m, inputs, outputs, 0, byteCount)
 		r.putTmpSlice(m)
 		end = len(inputs[0])
-	} else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) {
-		var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64
+	} else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount, codeGenMaxInputs, codeGenMaxOutputs); len(inputs)+len(outputs) > codeGenMinShards && ok {
+		var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64
 		end = len(inputs[0])
 		inIdx := 0
 		m := r.getTmpSlice()
@@ -861,15 +856,15 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 		ins := inputs
 		for len(ins) > 0 {
 			inPer := ins
-			if len(inPer) > maxAvx2Inputs {
-				inPer = inPer[:maxAvx2Inputs]
+			if len(inPer) > codeGenMaxInputs {
+				inPer = inPer[:codeGenMaxInputs]
 			}
 			outs := outputs
 			outIdx := 0
 			for len(outs) > 0 {
 				outPer := outs
-				if len(outPer) > maxAvx2Outputs {
-					outPer = outPer[:maxAvx2Outputs]
+				if len(outPer) > codeGenMaxOutputs {
+					outPer = outPer[:codeGenMaxOutputs]
 				}
 				if r.o.useAvx512GFNI {
 					m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
@@ -886,11 +881,11 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 						start = galMulSlicesAvxGFNIXor(m, inPer, outPer, 0, byteCount)
 					}
 				} else {
-					m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
+					m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
 					if inIdx == 0 {
-						start = galMulSlicesAvx2(m, inPer, outPer, 0, byteCount)
+						start = (*galMulGen)(m, inPer, outPer, 0, byteCount)
 					} else {
-						start = galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount)
+						start = (*galMulGenXor)(m, inPer, outPer, 0, byteCount)
 					}
 				}
 				outIdx += len(outPer)
@@ -928,27 +923,27 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 	var wg sync.WaitGroup
 	gor := r.o.maxGoroutines
 
-	var avx2Matrix []byte
+	var genMatrix []byte
 	var gfniMatrix []uint64
-	useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs))
+	galMulGen, _, useCodeGen := r.hasCodeGen(byteCount, len(inputs), len(outputs))
 	useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs))
 	if useGFNI {
-		var tmp [maxAvx2Inputs * maxAvx2Outputs]uint64
+		var tmp [codeGenMaxInputs * codeGenMaxOutputs]uint64
 		gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:])
-	} else if useAvx2 {
-		avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
-		defer r.putTmpSlice(avx2Matrix)
-	} else if (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
-		r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
+	} else if useCodeGen {
+		genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
+		defer r.putTmpSlice(genMatrix)
+	} else if (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards &&
+		r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs) {
 		// It appears there is a switchover point at around 10MB where
 		// Regular processing is faster...
 		r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true)
 		return
-	} else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
-		r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
+	} else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); ok &&
+		byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards {
 		// It appears there is a switchover point at around 10MB where
 		// Regular processing is faster...
-		r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true)
+		r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true, galMulGen, galMulGenXor)
 		return
 	}
 
@@ -965,8 +960,8 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 				} else {
 					start += galMulSlicesAvxGFNI(gfniMatrix, inputs, outputs, start, stop)
 				}
-			} else if useAvx2 {
-				start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
+			} else if useCodeGen {
+				start += (*galMulGen)(genMatrix, inputs, outputs, start, stop)
 			}
 		}
 
@@ -1017,7 +1012,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 // Perform the same as codeSomeShards, but split the workload into
 // several goroutines.
 // If clear is set, the first write will overwrite the output.
-func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
+func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool, galMulGen, galMulGenXor *func(matrix []byte, in [][]byte, out [][]byte, start int, stop int) int) {
 	var wg sync.WaitGroup
 	gor := r.o.maxGoroutines
 
@@ -1028,7 +1023,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 		first  bool
 	}
 	// Make a plan...
-	plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs))
+	plan := make([]state, 0, ((len(inputs)+codeGenMaxInputs-1)/codeGenMaxInputs)*((len(outputs)+codeGenMaxOutputs-1)/codeGenMaxOutputs))
 
 	tmp := r.getTmpSlice()
 	defer r.putTmpSlice(tmp)
@@ -1040,18 +1035,18 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 		ins := inputs
 		for len(ins) > 0 {
 			inPer := ins
-			if len(inPer) > maxAvx2Inputs {
-				inPer = inPer[:maxAvx2Inputs]
+			if len(inPer) > codeGenMaxInputs {
+				inPer = inPer[:codeGenMaxInputs]
 			}
 			outs := outputs
 			outIdx := 0
 			for len(outs) > 0 {
 				outPer := outs
-				if len(outPer) > maxAvx2Outputs {
-					outPer = outPer[:maxAvx2Outputs]
+				if len(outPer) > codeGenMaxOutputs {
+					outPer = outPer[:codeGenMaxOutputs]
 				}
 				// Generate local matrix
-				m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
+				m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
 				tmp = tmp[len(m):]
 				plan = append(plan, state{
 					input:  inPer,
@@ -1070,19 +1065,19 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 		outIdx := 0
 		for len(outs) > 0 {
 			outPer := outs
-			if len(outPer) > maxAvx2Outputs {
-				outPer = outPer[:maxAvx2Outputs]
+			if len(outPer) > codeGenMaxOutputs {
+				outPer = outPer[:codeGenMaxOutputs]
 			}
 
 			inIdx := 0
 			ins := inputs
 			for len(ins) > 0 {
 				inPer := ins
-				if len(inPer) > maxAvx2Inputs {
-					inPer = inPer[:maxAvx2Inputs]
+				if len(inPer) > codeGenMaxInputs {
+					inPer = inPer[:codeGenMaxInputs]
 				}
 				// Generate local matrix
-				m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
+				m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
 				tmp = tmp[len(m):]
 				//fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound)
 				plan = append(plan, state{
@@ -1111,14 +1106,14 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 			lstop = stop
 		}
 		for lstart < stop {
-			if lstop-lstart >= minAvx2Size {
+			if galMulGen != nil && galMulGenXor != nil && lstop-lstart >= minCodeGenSize {
 				// Execute plan...
 				var n int
 				for _, p := range plan {
 					if p.first {
-						n = galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop)
+						n = (*galMulGen)(p.m, p.input, p.output, lstart, lstop)
 					} else {
-						n = galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop)
+						n = (*galMulGenXor)(p.m, p.input, p.output, lstart, lstop)
 					}
 				}
 				lstart += n
@@ -1183,7 +1178,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 		first  bool
 	}
 	// Make a plan...
-	plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs))
+	plan := make([]state, 0, ((len(inputs)+codeGenMaxInputs-1)/codeGenMaxInputs)*((len(outputs)+codeGenMaxOutputs-1)/codeGenMaxOutputs))
 
 	// Flips between input first to output first.
 	// We put the smallest data load in the inner loop.
@@ -1192,15 +1187,15 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 		ins := inputs
 		for len(ins) > 0 {
 			inPer := ins
-			if len(inPer) > maxAvx2Inputs {
-				inPer = inPer[:maxAvx2Inputs]
+			if len(inPer) > codeGenMaxInputs {
+				inPer = inPer[:codeGenMaxInputs]
 			}
 			outs := outputs
 			outIdx := 0
 			for len(outs) > 0 {
 				outPer := outs
-				if len(outPer) > maxAvx2Outputs {
-					outPer = outPer[:maxAvx2Outputs]
+				if len(outPer) > codeGenMaxOutputs {
+					outPer = outPer[:codeGenMaxOutputs]
 				}
 				// Generate local matrix
 				m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer)))
@@ -1221,16 +1216,16 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 		outIdx := 0
 		for len(outs) > 0 {
 			outPer := outs
-			if len(outPer) > maxAvx2Outputs {
-				outPer = outPer[:maxAvx2Outputs]
+			if len(outPer) > codeGenMaxOutputs {
+				outPer = outPer[:codeGenMaxOutputs]
 			}
 
 			inIdx := 0
 			ins := inputs
 			for len(ins) > 0 {
 				inPer := ins
-				if len(inPer) > maxAvx2Inputs {
-					inPer = inPer[:maxAvx2Inputs]
+				if len(inPer) > codeGenMaxInputs {
+					inPer = inPer[:codeGenMaxInputs]
 				}
 				// Generate local matrix
 				m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer)))
@@ -1261,7 +1256,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 			lstop = stop
 		}
 		for lstart < stop {
-			if lstop-lstart >= minAvx2Size {
+			if lstop-lstart >= minCodeGenSize {
 				// Execute plan...
 				var n int
 				if r.o.useAvx512GFNI {