huggingface · mert-kurttutan · Jan 28, 2025 · Jan 28, 2025 · Jan 28, 2025 · Feb 1, 2025
diff --git a/Cargo.toml b/Cargo.toml
@@ -54,6 +54,7 @@ imageproc = { version = "0.24.0", default-features = false }
 intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
 libc = { version = "0.2.147" }
 log = "0.4"
+mathfun = "0.2.1"
 memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] }
 num_cpus = "1.15.0"
 num-traits = "0.2.15"

diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
@@ -21,6 +21,7 @@ half = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
 libc = { workspace = true, optional = true }
 memmap2 = { workspace = true }
+mathfun = { workspace = true }
 num-traits = { workspace = true }
 num_cpus = { workspace = true }
 rand = { workspace = true }

diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs
@@ -70,6 +70,8 @@ pub mod layout;
 pub mod metal_backend;
 #[cfg(feature = "mkl")]
 mod mkl;
+#[cfg(not(all(feature = "mkl", feature = "accelerate")))]
+mod mathfun;
 pub mod npy;
 pub mod op;
 pub mod pickle;

diff --git a/candle-core/src/mathfun.rs b/candle-core/src/mathfun.rs
@@ -0,0 +1,187 @@
+
+
+
+#[inline]
+pub fn vs_tanh_inplace(y: &mut [f32]) {
+    unsafe { mathfun::vs_tanh(y.len(), y.as_ptr(), y.as_mut_ptr()) }
+}
+
+#[inline]
+pub fn vs_gelu(vs: &[f32], ys: &mut [f32]) {
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = (2.0f32 / std::f32::consts::PI).sqrt() * v * (1.0 + 0.044715 * v * v)
+    }
+    vs_tanh_inplace(ys);
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = 0.5 * v * (1.0 + *y)
+    }
+}
+
+
+#[inline]
+pub fn vs_exp_inplace(y: &mut [f32]) {
+    unsafe { mathfun::vs_exp(y.len(), y.as_ptr(), y.as_mut_ptr()) }
+}
+
+#[inline]
+pub fn vs_silu(vs: &[f32], ys: &mut [f32]) {
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = -v
+    }
+    vs_exp_inplace(ys);
+    for (&v, y) in vs.iter().zip(ys.iter_mut()) {
+        *y = v / (1.0 + *y)
+    }
+}
+
+#[inline]
+pub fn vs_tanh(a: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let y_len = y.len();
+    if a_len != y_len {
+        panic!("a and y have different lengths {a_len} <> {y_len}")
+    }
+    unsafe { mathfun::vs_tanh(a_len, a.as_ptr(), y.as_mut_ptr()) }
+}
+
+
+#[inline]
+pub fn vs_sin(a: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let y_len = y.len();
+    if a_len != y_len {
+        panic!("a and y have different lengths {a_len} <> {y_len}")
+    }
+    unsafe { mathfun::vs_sin(a_len, a.as_ptr(), y.as_mut_ptr()) }
+}
+
+#[inline]
+pub fn vs_exp(a: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let y_len = y.len();
+    if a_len != y_len {
+        panic!("a and y have different lengths {a_len} <> {y_len}")
+    }
+    unsafe { mathfun::vs_exp(a_len, a.as_ptr(), y.as_mut_ptr()) }
+}
+
+#[inline]
+pub fn vs_cos(a: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let y_len = y.len();
+    if a_len != y_len {
+        panic!("a and y have different lengths {a_len} <> {y_len}")
+    }
+    unsafe { mathfun::vs_cos(a_len, a.as_ptr(), y.as_mut_ptr()) }
+}
+
+
+#[inline]
+pub fn vs_ln(a: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let y_len = y.len();
+    if a_len != y_len {
+        panic!("a and y have different lengths {a_len} <> {y_len}")
+    }
+    unsafe { mathfun::vs_ln(a_len, a.as_ptr(), y.as_mut_ptr()) }
+}
+
+#[inline]
+pub fn vs_sqrt(a: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let y_len = y.len();
+    if a_len != y_len {
+        panic!("a and y have different lengths {a_len} <> {y_len}")
+    }
+    unsafe { mathfun::vs_sqrt(a_len, a.as_ptr(), y.as_mut_ptr()) }
+}
+
+#[inline]
+pub fn vs_add(a: &[f32], b: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let b_len = b.len();
+    let y_len = y.len();
+    if a_len != b_len || a_len != y_len {
+        panic!("a, b, and y have different lengths {a_len} <> {b_len} <> {y_len}")
+    }
+    for i in 0..a_len {
+        y[i] = a[i] + b[i]
+    }
+}
+
+#[inline]
+pub fn vs_div(a: &[f32], b: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let b_len = b.len();
+    let y_len = y.len();
+    if a_len != b_len || a_len != y_len {
+        panic!("a, b, and y have different lengths {a_len} <> {b_len} <> {y_len}")
+    }
+    for i in 0..a_len {
+        y[i] = a[i] / b[i]
+    }
+}
+
+#[inline]
+pub fn vs_sub(a: &[f32], b: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let b_len = b.len();
+    let y_len = y.len();
+    if a_len != b_len || a_len != y_len {
+        panic!("a, b, and y have different lengths {a_len} <> {b_len} <> {y_len}")
+    }
+    for i in 0..a_len {
+        y[i] = a[i] - b[i]
+    }
+}
+
+#[inline]
+pub fn vs_mul(a: &[f32], b: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let b_len = b.len();
+    let y_len = y.len();
+    if a_len != b_len || a_len != y_len {
+        panic!("a, b, and y have different lengths {a_len} <> {b_len} <> {y_len}")
+    }
+    for i in 0..a_len {
+        y[i] = a[i] * b[i]
+    }
+}
+
+#[inline]
+pub fn vs_min(a: &[f32], b: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let b_len = b.len();
+    let y_len = y.len();
+    if a_len != b_len || a_len != y_len {
+        panic!("a, b, and y have different lengths {a_len} <> {b_len} <> {y_len}")
+    }
+    for i in 0..a_len {
+        y[i] = a[i].min(b[i])
+    }
+}
+
+#[inline]
+pub fn vs_max(a: &[f32], b: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let b_len = b.len();
+    let y_len = y.len();
+    if a_len != b_len || a_len != y_len {
+        panic!("a, b, and y have different lengths {a_len} <> {b_len} <> {y_len}")
+    }
+    for i in 0..a_len {
+        y[i] = a[i].max(b[i])
+    }
+}
+
+#[inline]
+pub fn vs_sqr(a: &[f32], y: &mut [f32]) {
+    let a_len = a.len();
+    let y_len = y.len();
+    if a_len != y_len {
+        panic!("a and y have different lengths {a_len} <> {y_len}")
+    }
+    for i in 0..a_len {
+        y[i] = a[i] * a[i]
+    }
+}
diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs
@@ -294,7 +294,6 @@ macro_rules! bin_op {
                 $e(v1, v2)
             }
 
-            #[cfg(feature = "mkl")]
             const F32_VEC: bool = true;
             #[cfg(feature = "mkl")]
             const F64_VEC: bool = true;
@@ -309,8 +308,6 @@ macro_rules! bin_op {
                 crate::mkl::$f64_vec(xs1, xs2, ys)
             }
 
-            #[cfg(feature = "accelerate")]
-            const F32_VEC: bool = true;
             #[cfg(feature = "accelerate")]
             const F64_VEC: bool = true;
             #[cfg(feature = "accelerate")]
@@ -323,6 +320,12 @@ macro_rules! bin_op {
             fn f64_vec(xs1: &[f64], xs2: &[f64], ys: &mut [f64]) {
                 crate::accelerate::$f64_vec(xs1, xs2, ys)
             }
+
+            #[cfg(all(not(feature = "mkl"), not(feature = "accelerate")))]
+            #[inline(always)]
+            fn f32_vec(xs1: &[f32], xs2: &[f32], ys: &mut [f32]) {
+                crate::mathfun::$f32_vec(xs1, xs2, ys)
+            }
         }
     };
 }
@@ -418,7 +421,6 @@ macro_rules! unary_op {
                 todo!("no unary function for i64")
             }
 
-            #[cfg(feature = "mkl")]
             const F32_VEC: bool = true;
             #[cfg(feature = "mkl")]
             const F64_VEC: bool = true;
@@ -433,8 +435,6 @@ macro_rules! unary_op {
                 crate::mkl::$f64_vec(xs, ys)
             }
 
-            #[cfg(feature = "accelerate")]
-            const F32_VEC: bool = true;
             #[cfg(feature = "accelerate")]
             const F64_VEC: bool = true;
             #[cfg(feature = "accelerate")]
@@ -447,6 +447,12 @@ macro_rules! unary_op {
             fn f64_vec(xs: &[f64], ys: &mut [f64]) {
                 crate::accelerate::$f64_vec(xs, ys)
             }
+
+            #[cfg(all(not(feature = "mkl"), not(feature = "accelerate")))]
+            #[inline(always)]
+            fn f32_vec(xs: &[f32], ys: &mut [f32]) {
+                crate::mathfun::$f32_vec(xs, ys)
+            }
         }
     };
 }
@@ -518,7 +524,6 @@ impl UnaryOpT for Gelu {
     }
     const KERNEL: &'static str = "ugelu";
 
-    #[cfg(feature = "mkl")]
     const F32_VEC: bool = true;
 
     #[cfg(feature = "mkl")]
@@ -536,9 +541,6 @@ impl UnaryOpT for Gelu {
         crate::mkl::vd_gelu(xs, ys)
     }
 
-    #[cfg(feature = "accelerate")]
-    const F32_VEC: bool = true;
-
     #[cfg(feature = "accelerate")]
     #[inline(always)]
     fn f32_vec(xs: &[f32], ys: &mut [f32]) {
@@ -553,6 +555,12 @@ impl UnaryOpT for Gelu {
     fn f64_vec(xs: &[f64], ys: &mut [f64]) {
         crate::accelerate::vd_gelu(xs, ys)
     }
+
+    #[cfg(all(not(feature = "mkl"), not(feature = "accelerate")))]
+    #[inline(always)]
+    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
+        crate::mathfun::vs_gelu(xs, ys)
+    }
 }
 
 /// `erf` operation
@@ -625,7 +633,6 @@ impl UnaryOpT for Silu {
     }
     const KERNEL: &'static str = "usilu";
 
-    #[cfg(feature = "mkl")]
     const F32_VEC: bool = true;
 
     #[cfg(feature = "mkl")]
@@ -643,9 +650,6 @@ impl UnaryOpT for Silu {
         crate::mkl::vd_silu(xs, ys)
     }
 
-    #[cfg(feature = "accelerate")]
-    const F32_VEC: bool = true;
-
     #[cfg(feature = "accelerate")]
     #[inline(always)]
     fn f32_vec(xs: &[f32], ys: &mut [f32]) {
@@ -660,6 +664,12 @@ impl UnaryOpT for Silu {
     fn f64_vec(xs: &[f64], ys: &mut [f64]) {
         crate::accelerate::vd_silu(xs, ys)
     }
+
+    #[cfg(all(not(feature = "mkl"), not(feature = "accelerate")))]
+    #[inline(always)]
+    fn f32_vec(xs: &[f32], ys: &mut [f32]) {
+        crate::mathfun::vs_silu(xs, ys)
+    }
 }
 
 impl UnaryOpT for Abs {