diff --git a/reedsolomon.go b/reedsolomon.go
index 9775b10a..20e39748 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -477,11 +477,17 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 
 	// Calculate what we want per round
 	r.o.perRound = cpuid.CPU.Cache.L2
+	if r.o.perRound < 128<<10 {
+		r.o.perRound = 128 << 10
+	}
 
 	divide := parityShards + 1
 	if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) {
 		// Base on L1 cache if we have many inputs.
 		r.o.perRound = cpuid.CPU.Cache.L1D
+		if r.o.perRound < 32<<10 {
+			r.o.perRound = 32 << 10
+		}
 		divide = 0
 		if dataShards > maxAvx2Inputs {
 			divide += maxAvx2Inputs
@@ -495,11 +501,6 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 		}
 	}
 
-	if r.o.perRound <= 0 {
-		// Set to 128K if undetectable.
-		r.o.perRound = 128 << 10
-	}
-
 	if cpuid.CPU.ThreadsPerCore > 1 && r.o.maxGoroutines > cpuid.CPU.PhysicalCores {
 		// If multiple threads per core, make sure they don't contend for cache.
 		r.o.perRound /= cpuid.CPU.ThreadsPerCore
@@ -510,6 +511,11 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 	// Align to 64 bytes.
 	r.o.perRound = ((r.o.perRound + 63) / 64) * 64
 
+	// Final sanity check...
+	if r.o.perRound < 1<<10 {
+		r.o.perRound = 1 << 10
+	}
+
 	if r.o.minSplitSize <= 0 {
 		// Set minsplit as high as we can, but still have parity in L1.
 		cacheSize := cpuid.CPU.Cache.L1D
@@ -646,6 +652,19 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro
 		return ErrShardSize
 	}
 
+	if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && (r.o.useAVX2 || r.o.useGFNI) {
+		m := make([][]byte, r.parityShards)
+		for iRow := range m {
+			m[iRow] = r.parity[iRow][idx : idx+1]
+		}
+		if r.o.useGFNI {
+			r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false)
+		} else {
+			r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false)
+		}
+		return nil
+	}
+
 	// Process using no goroutines for now.
 	start, end := 0, r.o.perRound
 	if end > len(dataShard) {
@@ -909,16 +928,16 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 		avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
 		defer r.putTmpSlice(avx2Matrix)
 	} else if r.o.useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
-		r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
+		r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
 		// It appears there is a switchover point at around 10MB where
 		// Regular processing is faster...
-		r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount)
+		r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true)
 		return
 	} else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
 		r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
 		// It appears there is a switchover point at around 10MB where
 		// Regular processing is faster...
-		r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount)
+		r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true)
 		return
 	}
 
@@ -982,7 +1001,8 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 
 // Perform the same as codeSomeShards, but split the workload into
 // several goroutines.
-func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int) {
+// If clear is set, the first write will overwrite the output.
+func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
 	var wg sync.WaitGroup
 	gor := r.o.maxGoroutines
 
@@ -1022,7 +1042,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 					input:  inPer,
 					output: outPer,
 					m:      m,
-					first:  inIdx == 0,
+					first:  inIdx == 0 && clear,
 				})
 				outIdx += len(outPer)
 				outs = outs[len(outPer):]
@@ -1054,7 +1074,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 					input:  inPer,
 					output: outPer,
 					m:      m,
-					first:  inIdx == 0,
+					first:  inIdx == 0 && clear,
 				})
 				inIdx += len(inPer)
 				ins = ins[len(inPer):]
@@ -1070,6 +1090,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 	}
 
 	exec := func(start, stop int) {
+		defer wg.Done()
 		lstart, lstop := start, start+r.o.perRound
 		if lstop > stop {
 			lstop = stop
@@ -1097,7 +1118,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 			for c := range inputs {
 				in := inputs[c][lstart:lstop]
 				for iRow := 0; iRow < len(outputs); iRow++ {
-					if c == 0 {
+					if c == 0 && clear {
 						galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
 					} else {
 						galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
@@ -1110,7 +1131,6 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 				lstop = stop
 			}
 		}
-		wg.Done()
 	}
 	if gor == 1 {
 		wg.Add(1)
@@ -1135,7 +1155,8 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 
 // Perform the same as codeSomeShards, but split the workload into
 // several goroutines.
-func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int) {
+// If clear is set, the first write will overwrite the output.
+func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
 	var wg sync.WaitGroup
 	gor := r.o.maxGoroutines
 
@@ -1171,7 +1192,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 					input:  inPer,
 					output: outPer,
 					m:      m,
-					first:  inIdx == 0,
+					first:  inIdx == 0 && clear,
 				})
 				outIdx += len(outPer)
 				outs = outs[len(outPer):]
@@ -1202,7 +1223,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 					input:  inPer,
 					output: outPer,
 					m:      m,
-					first:  inIdx == 0,
+					first:  inIdx == 0 && clear,
 				})
 				inIdx += len(inPer)
 				ins = ins[len(inPer):]
@@ -1218,6 +1239,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 	}
 
 	exec := func(start, stop int) {
+		defer wg.Done()
 		lstart, lstop := start, start+r.o.perRound
 		if lstop > stop {
 			lstop = stop
@@ -1245,7 +1267,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 			for c := range inputs {
 				in := inputs[c][lstart:lstop]
 				for iRow := 0; iRow < len(outputs); iRow++ {
-					if c == 0 {
+					if c == 0 && clear {
 						galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
 					} else {
 						galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
@@ -1258,8 +1280,8 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 				lstop = stop
 			}
 		}
-		wg.Done()
 	}
+
 	if gor == 1 {
 		wg.Add(1)
 		exec(0, byteCount)