Attempt using portable_simd

It's actually no faster than the autovectorized version?
Beinsezii · Jun 3, 2024 · 138a072
1 parent 37882b9
commit 138a072
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 85 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -13,6 +13,10 @@ publish = true
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+[features]
+nightly = []
+default = []
+
 [dependencies]
 
 [dev-dependencies]

diff --git a/benches/conversions.rs b/benches/conversions.rs
@@ -1,3 +1,4 @@
+#![feature(portable_simd)]
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use colcon::{Space, convert_space};
 
@@ -73,12 +74,11 @@ pub fn conversions(c: &mut Criterion) {
     } ));
 
     c.bench_function("srgb_eotf", |b| b.iter(|| {
-        const N: usize = 16;
-        black_box(pixels.clone().chunks_exact_mut(N).for_each(|simd| {
-            let simd: &mut [f32; N] = simd.try_into().unwrap();
-            *simd = colcon::srgb_eotf(*simd);
-        }));
-        //black_box(pixels.clone().iter_mut().for_each(|n| *n = colcon::srgb_eotf(*n)));
+        black_box(pixels.clone().iter_mut().for_each(|n| *n = colcon::srgb_eotf(*n)));
+    } ));
+
+    c.bench_function("srgb_eotf_simd", |b| b.iter(|| {
+        black_box(pixels.clone().as_simd_mut::<32>().1.iter_mut().for_each(|n| *n = colcon::srgb_eotf(*n)));
     } ));
 
     c.bench_function("srgb_eotf_inverse", |b| b.iter(|| {

diff --git a/src/lib.rs b/src/lib.rs
@@ -1,3 +1,4 @@
+#![cfg_attr(feature = "nightly", feature(portable_simd))]
 #![warn(missing_docs)]
 
 //! Simple colorspace conversions in pure Rust.
@@ -9,8 +10,13 @@
 //! This crate references CIE Standard Illuminant D65 for functions to/from CIE XYZ
 
 use core::ffi::{c_char, CStr};
-//use core::cmp::PartialOrd;
-//use core::ops::{Add, Div, Mul, Rem, Sub};
+use core::ops::{Add, Div, Mul, Rem, Sub};
+
+#[cfg(feature = "nightly")]
+use std::simd::prelude::*;
+
+#[cfg(feature = "nightly")]
+use std::simd::{LaneCount, StdFloat, SupportedLaneCount};
 
 fn spowf(n: f32, power: f32) -> f32 {
     n.abs().powf(power).copysign(n)
@@ -19,16 +25,21 @@ fn spowf(n: f32, power: f32) -> f32 {
 enum Cmp {
     Gt,
     Lt,
-    GtEq,
-    LtEq,
+    Ge,
+    Le,
 }
 
-trait DType: Sized + Copy {
+trait DType:
+    Sized
+    + Copy
+    + Add<Output = Self>
+    + Div<Output = Self>
+    + Mul<Output = Self>
+    + Sub<Output = Self>
+    + Rem<Output = Self>
+{
     fn f32(b: f32) -> Self;
-    fn add(self, b: Self) -> Self;
-    fn sub(self, b: Self) -> Self;
-    fn div(self, b: Self) -> Self;
-    fn mul(self, b: Self) -> Self;
+    fn fma(self, mul: Self, add: Self) -> Self;
     fn powf(self, b: Self) -> Self;
     fn branch<F: FnOnce() -> Self, G: FnOnce() -> Self>(
         self,
@@ -44,20 +55,41 @@ impl DType for f32 {
         b
     }
 
-    fn add(self, b: Self) -> Self {
-        self + b
+    fn fma(self, mul: Self, add: Self) -> Self {
+        self.mul_add(mul, add)
+    }
+
+    fn powf(self, b: Self) -> Self {
+        self.powf(b)
     }
 
-    fn sub(self, b: Self) -> Self {
-        self - b
+    fn branch<F: FnOnce() -> Self, G: FnOnce() -> Self>(
+        self,
+        b: Self,
+        cmp: Cmp,
+        x: F,
+        y: G,
+    ) -> Self {
+        if match cmp {
+            Cmp::Gt => self > b,
+            Cmp::Lt => self < b,
+            Cmp::Ge => self >= b,
+            Cmp::Le => self <= b,
+        } {
+            x()
+        } else {
+            y()
+        }
     }
+}
 
-    fn div(self, b: Self) -> Self {
-        self / b
+impl DType for f64 {
+    fn f32(b: f32) -> Self {
+        b.into()
     }
 
-    fn mul(self, b: Self) -> Self {
-        self * b
+    fn fma(self, mul: Self, add: Self) -> Self {
+        self.mul_add(mul, add)
     }
 
     fn powf(self, b: Self) -> Self {
@@ -74,8 +106,8 @@ impl DType for f32 {
         if match cmp {
             Cmp::Gt => self > b,
             Cmp::Lt => self < b,
-            Cmp::GtEq => self >= b,
-            Cmp::LtEq => self <= b,
+            Cmp::Ge => self >= b,
+            Cmp::Le => self <= b,
         } {
             x()
         } else {
@@ -84,69 +116,41 @@ impl DType for f32 {
     }
 }
 
-impl<const N: usize> DType for [f32; N] {
+#[cfg(feature = "nightly")]
+impl<const N: usize> DType for Simd<f32, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
     fn f32(object: f32) -> Self {
-        [object; N]
-    }
-
-    fn add(mut self, b: Self) -> Self {
-        self.iter_mut()
-            .zip(b.into_iter())
-            .for_each(|(a, b)| *a = *a + b);
-        self
+        Self::splat(object)
     }
 
-    fn sub(mut self, b: Self) -> Self {
-        self.iter_mut()
-            .zip(b.into_iter())
-            .for_each(|(a, b)| *a = *a - b);
-        self
-    }
-
-    fn div(mut self, b: Self) -> Self {
-        self.iter_mut()
-            .zip(b.into_iter())
-            .for_each(|(a, b)| *a = *a / b);
-        self
-    }
-
-    fn mul(mut self, b: Self) -> Self {
-        self.iter_mut()
-            .zip(b.into_iter())
-            .for_each(|(a, b)| *a = *a * b);
-        self
+    fn fma(self, mul: Self, add: Self) -> Self {
+        self.mul_add(mul, add)
     }
 
     fn powf(mut self, b: Self) -> Self {
-        self.iter_mut()
-            .zip(b.into_iter())
-            .for_each(|(a, b)| *a = a.powf(b));
+        self.as_mut_array()
+            .iter_mut()
+            .zip(b.as_array().iter())
+            .for_each(|(a, b)| *a = a.powf(*b));
         self
     }
 
     fn branch<F: FnOnce() -> Self, G: FnOnce() -> Self>(
-        mut self,
+        self,
         b: Self,
         cmp: Cmp,
         x: F,
         y: G,
     ) -> Self {
-        self.iter_mut()
-            .zip(b.into_iter())
-            .zip(x().into_iter().zip(y().into_iter()))
-            .for_each(|((a, b), (x, y))| {
-                if match cmp {
-                    Cmp::Gt => *a > b,
-                    Cmp::Lt => *a < b,
-                    Cmp::GtEq => *a >= b,
-                    Cmp::LtEq => *a <= b,
-                } {
-                    *a = x
-                } else {
-                    *a = y
-                }
-            });
-        self
+        match cmp {
+            Cmp::Gt => self.simd_gt(b),
+            Cmp::Lt => self.simd_lt(b),
+            Cmp::Ge => self.simd_ge(b),
+            Cmp::Le => self.simd_le(b),
+        }
+        .select(x(), y())
     }
 }
 
@@ -300,29 +304,23 @@ fn matmul3(matrix: [[f32; 3]; 3], pixel: [f32; 3]) -> [f32; 3] {
 /// <https://en.wikipedia.org/wiki/SRGB#Computing_the_transfer_function>
 //#[no_mangle]
 //pub fn srgb_eotf<T: DType>(n: T) -> T {
-//    if n <= SRGBEOTF_CHI.into() {
-//        n / SRGBEOTF_PHI.into()
+//    if n <= SRGBEOTF_CHI {
+//        n / SRGBEOTF_PHI
 //    } else {
-//        ((n + SRGBEOTF_ALPHA.into()) / (SRGBEOTF_ALPHA + 1.0).into()).powf(SRGBEOTF_GAMMA.into())
+//        ((n + SRGBEOTF_ALPHA) / (SRGBEOTF_ALPHA + 1.0)).powf(SRGBEOTF_GAMMA)
 //    }
 //}
 
 pub fn srgb_eotf<T: DType>(n: T) -> T {
     n.branch(
         DType::f32(SRGBEOTF_CHI),
-        Cmp::LtEq,
-        || n.div(DType::f32(SRGBEOTF_PHI)),
+        Cmp::Le,
+        || n / DType::f32(SRGBEOTF_PHI),
         || {
-            n.add(DType::f32(SRGBEOTF_ALPHA))
-                .div(DType::f32(SRGBEOTF_ALPHA + 1.0))
+            ((n + DType::f32(SRGBEOTF_ALPHA)) / DType::f32(SRGBEOTF_ALPHA + 1.0))
                 .powf(DType::f32(SRGBEOTF_GAMMA))
         },
     )
-    //if n <= SRGBEOTF_CHI.into() {
-    //    n / SRGBEOTF_PHI.into()
-    //} else {
-    //    ((n + SRGBEOTF_ALPHA.into()) / (SRGBEOTF_ALPHA + 1.0).into()).powf(SRGBEOTF_GAMMA.into())
-    //}
 }
 
 /// Inverse sRGB Electro-Optical Transfer Function