From c580ef061c066dc31b34163312b23528a828f2a5 Mon Sep 17 00:00:00 2001
From: Tobias Decking <Tobias.Decking@gmail.com>
Date: Fri, 2 May 2025 09:20:13 +0200
Subject: [PATCH 1/3] Use rounding instructions on aarch64

---
 libm/src/math/arch/aarch64.rs | 198 ++++++++++++++++++++++++++++++++--
 libm/src/math/arch/mod.rs     |  15 +++
 libm/src/math/ceil.rs         |  16 ++-
 libm/src/math/floor.rs        |  16 ++-
 libm/src/math/round.rs        |  18 ++++
 libm/src/math/roundeven.rs    |  18 ++++
 libm/src/math/trunc.rs        |  16 ++-
 7 files changed, 280 insertions(+), 17 deletions(-)

diff --git a/libm/src/math/arch/aarch64.rs b/libm/src/math/arch/aarch64.rs
index 020bb731c..d099cc6ad 100644
--- a/libm/src/math/arch/aarch64.rs
+++ b/libm/src/math/arch/aarch64.rs
@@ -30,11 +30,156 @@ pub fn fmaf(mut x: f32, y: f32, z: f32) -> f32 {
     x
 }
 
+pub fn ceil(mut x: f64) -> f64 {
+    // SAFETY: `frintp` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "frintp {x:d}, {x:d}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn ceilf(mut x: f32) -> f32 {
+    // SAFETY: `frintp` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "frintp {x:s}, {x:s}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+#[cfg(all(f16_enabled, target_feature = "fp16"))]
+pub fn ceilf16(mut x: f16) -> f16 {
+    // SAFETY: `frintp` is available for `f16` with `fp16` (implies `neon`) and has no side effects.
+    unsafe {
+        asm!(
+            "frintp {x:h}, {x:h}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn floor(mut x: f64) -> f64 {
+    // SAFETY: `frintm` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "frintm {x:d}, {x:d}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn floorf(mut x: f32) -> f32 {
+    // SAFETY: `frintm` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "frintm {x:s}, {x:s}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+#[cfg(all(f16_enabled, target_feature = "fp16"))]
+pub fn floorf16(mut x: f16) -> f16 {
+    // SAFETY: `frintm` is available for `f16` with `fp16` (implies `neon`) and has no side effects.
+    unsafe {
+        asm!(
+            "frintm {x:h}, {x:h}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
 pub fn rint(mut x: f64) -> f64 {
+    // SAFETY: `frintx` is available with neon and only affects `x` and FPSR flags (inexact).
+    unsafe {
+        asm!(
+            "frintx {x:d}, {x:d}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn rintf(mut x: f32) -> f32 {
+    // SAFETY: `frintx` is available with neon and only affects `x` and FPSR flags (inexact).
+    unsafe {
+        asm!(
+            "frintx {x:s}, {x:s}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+#[cfg(all(f16_enabled, target_feature = "fp16"))]
+pub fn rintf16(mut x: f16) -> f16 {
+    // SAFETY: `frintx` for `f16` requires `fp16` (implies `neon`); only affects `x` and FPSR flags.
+    unsafe {
+        asm!(
+            "frintx {x:h}, {x:h}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn round(mut x: f64) -> f64 {
+    // SAFETY: `frinta` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "frinta {x:d}, {x:d}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn roundf(mut x: f32) -> f32 {
+    // SAFETY: `frinta` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "frinta {x:s}, {x:s}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+#[cfg(all(f16_enabled, target_feature = "fp16"))]
+pub fn roundf16(mut x: f16) -> f16 {
+    // SAFETY: `frinta` is available for `f16` with `fp16` (implies `neon`) and has no side effects.
+    unsafe {
+        asm!(
+            "frinta {x:h}, {x:h}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn roundeven(mut x: f64) -> f64 {
     // SAFETY: `frintn` is available with neon and has no side effects.
-    //
-    // `frintn` is always round-to-nearest which does not match the C specification, but Rust does
-    // not support rounding modes.
     unsafe {
         asm!(
             "frintn {x:d}, {x:d}",
@@ -45,11 +190,8 @@ pub fn rint(mut x: f64) -> f64 {
     x
 }
 
-pub fn rintf(mut x: f32) -> f32 {
+pub fn roundevenf(mut x: f32) -> f32 {
     // SAFETY: `frintn` is available with neon and has no side effects.
-    //
-    // `frintn` is always round-to-nearest which does not match the C specification, but Rust does
-    // not support rounding modes.
     unsafe {
         asm!(
             "frintn {x:s}, {x:s}",
@@ -61,11 +203,8 @@ pub fn rintf(mut x: f32) -> f32 {
 }
 
 #[cfg(all(f16_enabled, target_feature = "fp16"))]
-pub fn rintf16(mut x: f16) -> f16 {
+pub fn roundevenf16(mut x: f16) -> f16 {
     // SAFETY: `frintn` is available for `f16` with `fp16` (implies `neon`) and has no side effects.
-    //
-    // `frintn` is always round-to-nearest which does not match the C specification, but Rust does
-    // not support rounding modes.
     unsafe {
         asm!(
             "frintn {x:h}, {x:h}",
@@ -76,6 +215,43 @@ pub fn rintf16(mut x: f16) -> f16 {
     x
 }
 
+pub fn trunc(mut x: f64) -> f64 {
+    // SAFETY: `frintz` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "frintz {x:d}, {x:d}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn truncf(mut x: f32) -> f32 {
+    // SAFETY: `frintz` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "frintz {x:s}, {x:s}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+#[cfg(all(f16_enabled, target_feature = "fp16"))]
+pub fn truncf16(mut x: f16) -> f16 {
+    // SAFETY: `frintz` is available for `f16` with `fp16` (implies `neon`) and has no side effects.
+    unsafe {
+        asm!(
+            "frintz {x:h}, {x:h}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
 pub fn sqrt(mut x: f64) -> f64 {
     // SAFETY: `fsqrt` is available with neon and has no side effects.
     unsafe {
diff --git a/libm/src/math/arch/mod.rs b/libm/src/math/arch/mod.rs
index d9f2aad66..ad8a950cb 100644
--- a/libm/src/math/arch/mod.rs
+++ b/libm/src/math/arch/mod.rs
@@ -26,15 +26,30 @@ cfg_if! {
         pub use aarch64::{
             fma,
             fmaf,
+            ceil,
+            ceilf,
+            floor,
+            floorf,
+            round,
+            roundf,
             rint,
             rintf,
+            roundeven,
+            roundevenf,
+            trun,
+            truncf
             sqrt,
             sqrtf,
         };
 
         #[cfg(all(f16_enabled, target_feature = "fp16"))]
         pub use aarch64::{
+            ceilf16,
+            floorf16,
+            roundf16,
             rintf16,
+            roundevenf16,
+            truncf16
             sqrtf16,
         };
     }
diff --git a/libm/src/math/ceil.rs b/libm/src/math/ceil.rs
index 4e1035457..47052f88a 100644
--- a/libm/src/math/ceil.rs
+++ b/libm/src/math/ceil.rs
@@ -4,6 +4,12 @@
 #[cfg(f16_enabled)]
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn ceilf16(x: f16) -> f16 {
+    select_implementation! {
+        name: ceilf16,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
     super::generic::ceil(x)
 }
 
@@ -14,7 +20,10 @@ pub fn ceilf16(x: f16) -> f16 {
 pub fn ceilf(x: f32) -> f32 {
     select_implementation! {
         name: ceilf,
-        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+        ),
         args: x,
     }
 
@@ -28,7 +37,10 @@ pub fn ceilf(x: f32) -> f32 {
 pub fn ceil(x: f64) -> f64 {
     select_implementation! {
         name: ceil,
-        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+        ),
         use_arch_required: all(target_arch = "x86", not(target_feature = "sse2")),
         args: x,
     }
diff --git a/libm/src/math/floor.rs b/libm/src/math/floor.rs
index 3c5eab101..52efb0a3c 100644
--- a/libm/src/math/floor.rs
+++ b/libm/src/math/floor.rs
@@ -4,6 +4,12 @@
 #[cfg(f16_enabled)]
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn floorf16(x: f16) -> f16 {
+    select_implementation! {
+        name: floorf16,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
     return super::generic::floor(x);
 }
 
@@ -14,7 +20,10 @@ pub fn floorf16(x: f16) -> f16 {
 pub fn floor(x: f64) -> f64 {
     select_implementation! {
         name: floor,
-        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+        ),
         use_arch_required: all(target_arch = "x86", not(target_feature = "sse2")),
         args: x,
     }
@@ -29,7 +38,10 @@ pub fn floor(x: f64) -> f64 {
 pub fn floorf(x: f32) -> f32 {
     select_implementation! {
         name: floorf,
-        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+        ),
         args: x,
     }
 
diff --git a/libm/src/math/round.rs b/libm/src/math/round.rs
index 6cd091cd7..df10c5563 100644
--- a/libm/src/math/round.rs
+++ b/libm/src/math/round.rs
@@ -2,18 +2,36 @@
 #[cfg(f16_enabled)]
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn roundf16(x: f16) -> f16 {
+    select_implementation! {
+        name: roundf16,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
     super::generic::round(x)
 }
 
 /// Round `x` to the nearest integer, breaking ties away from zero.
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn roundf(x: f32) -> f32 {
+    select_implementation! {
+        name: roundf,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
     super::generic::round(x)
 }
 
 /// Round `x` to the nearest integer, breaking ties away from zero.
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn round(x: f64) -> f64 {
+    select_implementation! {
+        name: round,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
     super::generic::round(x)
 }
 
diff --git a/libm/src/math/roundeven.rs b/libm/src/math/roundeven.rs
index 6e621d762..f5916ec4f 100644
--- a/libm/src/math/roundeven.rs
+++ b/libm/src/math/roundeven.rs
@@ -5,6 +5,12 @@ use super::support::{Float, Round};
 #[cfg(f16_enabled)]
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn roundevenf16(x: f16) -> f16 {
+    select_implementation! {
+        name: roundevenf16,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
     roundeven_impl(x)
 }
 
@@ -12,6 +18,12 @@ pub fn roundevenf16(x: f16) -> f16 {
 /// `roundToIntegralTiesToEven`.
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn roundevenf(x: f32) -> f32 {
+    select_implementation! {
+        name: roundevenf,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
     roundeven_impl(x)
 }
 
@@ -19,6 +31,12 @@ pub fn roundevenf(x: f32) -> f32 {
 /// `roundToIntegralTiesToEven`.
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn roundeven(x: f64) -> f64 {
+    select_implementation! {
+        name: roundeven,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
     roundeven_impl(x)
 }
 
diff --git a/libm/src/math/trunc.rs b/libm/src/math/trunc.rs
index fa50d55e1..ab87b2ae0 100644
--- a/libm/src/math/trunc.rs
+++ b/libm/src/math/trunc.rs
@@ -4,6 +4,12 @@
 #[cfg(f16_enabled)]
 #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
 pub fn truncf16(x: f16) -> f16 {
+    select_implementation! {
+        name: truncf16,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
     super::generic::trunc(x)
 }
 
@@ -14,7 +20,10 @@ pub fn truncf16(x: f16) -> f16 {
 pub fn truncf(x: f32) -> f32 {
     select_implementation! {
         name: truncf,
-        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+        ),
         args: x,
     }
 
@@ -28,7 +37,10 @@ pub fn truncf(x: f32) -> f32 {
 pub fn trunc(x: f64) -> f64 {
     select_implementation! {
         name: trunc,
-        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+        ),
         args: x,
     }
 

From 40b4180db48de910531cbaa5b237279150badbac Mon Sep 17 00:00:00 2001
From: Tobias Decking <Tobias.Decking@gmail.com>
Date: Fri, 2 May 2025 09:30:29 +0200
Subject: [PATCH 2/3] Fix `trunc` typo and missing commas in aarch64 re-exports

---
 libm/src/math/arch/mod.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libm/src/math/arch/mod.rs b/libm/src/math/arch/mod.rs
index ad8a950cb..248ab9dbf 100644
--- a/libm/src/math/arch/mod.rs
+++ b/libm/src/math/arch/mod.rs
@@ -36,8 +36,8 @@ cfg_if! {
             rintf,
             roundeven,
             roundevenf,
-            trun,
-            truncf
+            trunc,
+            truncf,
             sqrt,
             sqrtf,
         };
@@ -49,7 +49,7 @@ cfg_if! {
             roundf16,
             rintf16,
             roundevenf16,
-            truncf16
+            truncf16,
             sqrtf16,
         };
     }

From ca8629e2c802c004f45ed61e3ea34344ab1beb18 Mon Sep 17 00:00:00 2001
From: Tobias Decking <Tobias.Decking@gmail.com>
Date: Fri, 2 May 2025 09:58:35 +0200
Subject: [PATCH 3/3] Gate f32/f64 round/roundeven on neon; list aarch64 sources

---
 etc/function-definitions.json | 15 +++++++++++++++
 libm/src/math/round.rs        |  4 ++--
 libm/src/math/roundeven.rs    |  4 ++--
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/etc/function-definitions.json b/etc/function-definitions.json
index 9e5774eaf..518fdf35c 100644
--- a/etc/function-definitions.json
+++ b/etc/function-definitions.json
@@ -98,6 +98,7 @@
     },
     "ceil": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/arch/i586.rs",
             "libm/src/math/arch/wasm32.rs",
             "libm/src/math/ceil.rs",
@@ -107,6 +108,7 @@
     },
     "ceilf": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/arch/wasm32.rs",
             "libm/src/math/ceil.rs",
             "libm/src/math/generic/ceil.rs"
@@ -122,6 +124,7 @@
     },
     "ceilf16": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/ceil.rs",
             "libm/src/math/generic/ceil.rs"
         ],
@@ -311,6 +314,7 @@
     },
     "floor": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/arch/i586.rs",
             "libm/src/math/arch/wasm32.rs",
             "libm/src/math/floor.rs",
@@ -320,6 +324,7 @@
     },
     "floorf": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/arch/wasm32.rs",
             "libm/src/math/floor.rs",
             "libm/src/math/generic/floor.rs"
@@ -335,6 +340,7 @@
     },
     "floorf16": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/floor.rs",
             "libm/src/math/generic/floor.rs"
         ],
@@ -815,6 +821,7 @@
     },
     "round": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/generic/round.rs",
             "libm/src/math/round.rs"
         ],
@@ -822,12 +829,14 @@
     },
     "roundeven": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/roundeven.rs"
         ],
         "type": "f64"
     },
     "roundevenf": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/roundeven.rs"
         ],
         "type": "f32"
@@ -840,12 +849,14 @@
     },
     "roundevenf16": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/roundeven.rs"
         ],
         "type": "f16"
     },
     "roundf": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/generic/round.rs",
             "libm/src/math/round.rs"
         ],
@@ -860,6 +871,7 @@
     },
     "roundf16": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/generic/round.rs",
             "libm/src/math/round.rs"
         ],
@@ -1002,6 +1014,7 @@
     },
     "trunc": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/arch/wasm32.rs",
             "libm/src/math/generic/trunc.rs",
             "libm/src/math/trunc.rs"
@@ -1010,6 +1023,7 @@
     },
     "truncf": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/arch/wasm32.rs",
             "libm/src/math/generic/trunc.rs",
             "libm/src/math/trunc.rs"
@@ -1025,6 +1039,7 @@
     },
     "truncf16": {
         "sources": [
+            "libm/src/math/arch/aarch64.rs",
             "libm/src/math/generic/trunc.rs",
             "libm/src/math/trunc.rs"
         ],
diff --git a/libm/src/math/round.rs b/libm/src/math/round.rs
index df10c5563..335857cb9 100644
--- a/libm/src/math/round.rs
+++ b/libm/src/math/round.rs
@@ -16,7 +16,7 @@ pub fn roundf16(x: f16) -> f16 {
 pub fn roundf(x: f32) -> f32 {
     select_implementation! {
         name: roundf,
-        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        use_arch: all(target_arch = "aarch64", target_feature = "neon"),
         args: x,
     }
 
@@ -28,7 +28,7 @@ pub fn roundf(x: f32) -> f32 {
 pub fn round(x: f64) -> f64 {
     select_implementation! {
         name: round,
-        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        use_arch: all(target_arch = "aarch64", target_feature = "neon"),
         args: x,
     }
 
diff --git a/libm/src/math/roundeven.rs b/libm/src/math/roundeven.rs
index f5916ec4f..1a2e8ef99 100644
--- a/libm/src/math/roundeven.rs
+++ b/libm/src/math/roundeven.rs
@@ -20,7 +20,7 @@ pub fn roundevenf16(x: f16) -> f16 {
 pub fn roundevenf(x: f32) -> f32 {
     select_implementation! {
         name: roundevenf,
-        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        use_arch: all(target_arch = "aarch64", target_feature = "neon"),
         args: x,
     }
 
@@ -33,7 +33,7 @@ pub fn roundevenf(x: f32) -> f32 {
 pub fn roundeven(x: f64) -> f64 {
     select_implementation! {
         name: roundeven,
-        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        use_arch: all(target_arch = "aarch64", target_feature = "neon"),
         args: x,
     }