diff --git a/report/report.md b/report/report.md
index 13e2e65..8099dc2 100644
--- a/report/report.md
+++ b/report/report.md
@@ -14,6 +14,7 @@ Sources used: wikimedia.i_love_you_california, wikimedia.winter_kiss, wikimedia.
 
   - Ours
     - default: 0.5443052357800366
+    - st: 0.5443052357800366
     - dmse: 0.5418910269181569
     - bsbs: 0.5435087256676912
     - mae: 0.5374008633412148
@@ -21,14 +22,15 @@ Sources used: wikimedia.i_love_you_california, wikimedia.winter_kiss, wikimedia.
 
 ### Average compression speed (inverse RTF)
   - Reference
-    - opt8lax: 259.5363983239151
-    - opt8: 255.04180944185632
-    - opt5: 480.08948488664015
+    - opt8lax: 258.92532958937727
+    - opt8: 262.24392534874113
+    - opt5: 502.5549809373441
 
   - Ours
-    - default: 147.73177741587608
-    - dmse: 110.59936109083178
-    - bsbs: 6.6560855133273495
-    - mae: 32.21428513358941
+    - default: 149.36375631265756
+    - st: 57.107646528588624
+    - dmse: 110.75909016976695
+    - bsbs: 7.152928413350077
+    - mae: 31.28330510072386
 
 
diff --git a/src/lib.rs b/src/lib.rs
index 7b94b40..13ebeb5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -71,18 +71,18 @@ mod test {
 
     const FIXED_BLOCK_CONFIGS: [&str; 4] = [
         "",
-        r#"
+        r"
 block_sizes = [512]
-        "#,
-        r#"
+        ",
+        r"
 block_sizes = [123]
-        "#,
-        r#"
+        ",
+        r"
 block_sizes = [1024]
 [subframe_coding.qlpc]
 use_direct_mse = true
 mae_optimization_steps = 2
-        "#,
+        ",
     ];
 
     #[rstest]
diff --git a/src/rice.rs b/src/rice.rs
index 3ed59cb..445af0b 100644
--- a/src/rice.rs
+++ b/src/rice.rs
@@ -15,7 +15,6 @@
 //! Functions for partitioned rice coding (PRC).
 
 use std::cell::RefCell;
-use std::simd::SimdOrd;
 use std::simd::SimdPartialEq;
 use std::simd::SimdPartialOrd;
 use std::simd::SimdUint;
@@ -41,6 +40,8 @@ static MAXES: std::simd::u32x16 = std::simd::u32x16::from_array([u32::MAX; 16]);
 
 // max value of p_to_bits is chosen so that the estimates doesn't overflow
 // after added 2^4 = 16 times at maximum.
+// The current version exploits the fact that `MAX_P_TO_BITS` is a bit mask, i.e. it can be written
+// as 2^N - 1, for faster processing (`&` instead of `min`). Do not use an arbitrary value here.
 static MAX_P_TO_BITS: u32 = (1 << 28) - 1;
 static MAX_P_TO_BITS_VEC: std::simd::u32x16 = std::simd::u32x16::from_array([MAX_P_TO_BITS; 16]);
 
@@ -68,26 +69,30 @@ impl PrcBitTable {
         for v in errors {
             // Below is faster than doing:
             //   vs = splat(*v) >> INDEX;
+            // or
+            //   vs = std::simd::u32x16::from_array(std::array::from_fn(
+            //       |i| v >> i));
             // Perhaps due to smaller memory footprint by avoiding `splat`?
+            let v = *v;
             let vs = std::simd::u32x16::from_array([
-                *v,
-                *v >> 1,
-                *v >> 2,
-                *v >> 3,
-                *v >> 4,
-                *v >> 5,
-                *v >> 6,
-                *v >> 7,
-                *v >> 8,
-                *v >> 9,
-                *v >> 10,
-                *v >> 11,
-                *v >> 12,
-                *v >> 13,
-                *v >> 14,
-                *v >> 15,
+                v,
+                v >> 1,
+                v >> 2,
+                v >> 3,
+                v >> 4,
+                v >> 5,
+                v >> 6,
+                v >> 7,
+                v >> 8,
+                v >> 9,
+                v >> 10,
+                v >> 11,
+                v >> 12,
+                v >> 13,
+                v >> 14,
+                v >> 15,
             ]);
-            p_to_bits = (vs + p_to_bits).simd_min(MAX_P_TO_BITS_VEC);
+            p_to_bits = (vs + p_to_bits) & MAX_P_TO_BITS_VEC;
         }
         self.p_to_bits = p_to_bits;
     }