|  | 
| 6 | 6 |     rustc_attrs, | 
| 7 | 7 |     intrinsics, | 
| 8 | 8 |     core_intrinsics, | 
| 9 |  | -    repr_simd | 
|  | 9 | +    repr_simd, | 
|  | 10 | +    f16, | 
|  | 11 | +    f128 | 
| 10 | 12 | )] | 
| 11 |  | -#![allow(incomplete_features, internal_features)] | 
|  | 13 | +#![allow(incomplete_features, internal_features, non_camel_case_types)] | 
|  | 14 | +use std::fmt::{self, Debug, Formatter}; | 
| 12 | 15 | use std::intrinsics::simd as intrinsics; | 
| 13 | 16 | use std::ptr; | 
| 14 | 17 | use std::simd::StdFloat; | 
| 15 | 18 | use std::simd::prelude::*; | 
| 16 | 19 | 
 | 
|  | 20 | +#[repr(simd, packed)] | 
|  | 21 | +#[derive(Copy)] | 
|  | 22 | +struct PackedSimd<T, const N: usize>([T; N]); | 
|  | 23 | + | 
|  | 24 | +impl<T: Copy, const N: usize> Clone for PackedSimd<T, N> { | 
|  | 25 | +    fn clone(&self) -> Self { | 
|  | 26 | +        *self | 
|  | 27 | +    } | 
|  | 28 | +} | 
|  | 29 | + | 
|  | 30 | +impl<T: PartialEq + Copy, const N: usize> PartialEq for PackedSimd<T, N> { | 
|  | 31 | +    fn eq(&self, other: &Self) -> bool { | 
|  | 32 | +        self.into_array() == other.into_array() | 
|  | 33 | +    } | 
|  | 34 | +} | 
|  | 35 | + | 
|  | 36 | +impl<T: Debug + Copy, const N: usize> Debug for PackedSimd<T, N> { | 
|  | 37 | +    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { | 
|  | 38 | +        Debug::fmt(&self.into_array(), f) | 
|  | 39 | +    } | 
|  | 40 | +} | 
|  | 41 | + | 
// Shorthand vector aliases mirroring the `std::simd` naming scheme
// (`f32x4`, `i32x4`, ...) for the unstable `f16` / `f128` element types,
// which `std::simd` does not yet provide.
type f16x2 = PackedSimd<f16, 2>;
type f16x4 = PackedSimd<f16, 4>;

type f128x2 = PackedSimd<f128, 2>;
type f128x4 = PackedSimd<f128, 4>;
|  | 47 | + | 
|  | 48 | +impl<T: Copy, const N: usize> PackedSimd<T, N> { | 
|  | 49 | +    fn splat(x: T) -> Self { | 
|  | 50 | +        Self([x; N]) | 
|  | 51 | +    } | 
|  | 52 | +    fn from_array(a: [T; N]) -> Self { | 
|  | 53 | +        Self(a) | 
|  | 54 | +    } | 
|  | 55 | +    fn into_array(self) -> [T; N] { | 
|  | 56 | +        // as we have `repr(packed)`, there shouldn't be any padding bytes | 
|  | 57 | +        unsafe { std::mem::transmute_copy(&self) } | 
|  | 58 | +    } | 
|  | 59 | +} | 
|  | 60 | + | 
// Re-declaration of the const-generic lane-shuffle intrinsic.
// NOTE(review): per the rustc intrinsic convention, `IDX` appears to select
// each output lane from the concatenation of `x` and `y` — confirm against
// the `simd_shuffle` intrinsic documentation.
#[rustc_intrinsic]
#[rustc_nounwind]
pub unsafe fn simd_shuffle_const_generic<T, U, const IDX: &'static [u32]>(x: T, y: T) -> U;
| 20 | 64 | 
 | 
|  | 65 | +pub fn simd_ops_f16() { | 
|  | 66 | +    use intrinsics::*; | 
|  | 67 | + | 
|  | 68 | +    // small hack to make type inference better | 
|  | 69 | +    macro_rules! assert_eq { | 
|  | 70 | +        ($a:expr, $b:expr $(,$t:tt)*) => {{ | 
|  | 71 | +            let a = $a; | 
|  | 72 | +            let b = $b; | 
|  | 73 | +            if false { let _inference = b == a; } | 
|  | 74 | +            ::std::assert_eq!(a, b, $(,$t)*) | 
|  | 75 | +        }} | 
|  | 76 | +    } | 
|  | 77 | + | 
|  | 78 | +    let a = f16x4::splat(10.0); | 
|  | 79 | +    let b = f16x4::from_array([1.0, 2.0, 3.0, -4.0]); | 
|  | 80 | + | 
|  | 81 | +    unsafe { | 
|  | 82 | +        assert_eq!(simd_neg(b), f16x4::from_array([-1.0, -2.0, -3.0, 4.0])); | 
|  | 83 | +        assert_eq!(simd_add(a, b), f16x4::from_array([11.0, 12.0, 13.0, 6.0])); | 
|  | 84 | +        assert_eq!(simd_sub(a, b), f16x4::from_array([9.0, 8.0, 7.0, 14.0])); | 
|  | 85 | +        assert_eq!(simd_mul(a, b), f16x4::from_array([10.0, 20.0, 30.0, -40.0])); | 
|  | 86 | +        assert_eq!(simd_div(b, a), f16x4::from_array([0.1, 0.2, 0.3, -0.4])); | 
|  | 87 | +        assert_eq!(simd_div(a, f16x4::splat(2.0)), f16x4::splat(5.0)); | 
|  | 88 | +        assert_eq!(simd_rem(a, b), f16x4::from_array([0.0, 0.0, 1.0, 2.0])); | 
|  | 89 | +        assert_eq!(simd_fabs(b), f16x4::from_array([1.0, 2.0, 3.0, 4.0])); | 
|  | 90 | +        assert_eq!( | 
|  | 91 | +            simd_fmax(a, simd_mul(b, f16x4::splat(4.0))), | 
|  | 92 | +            f16x4::from_array([10.0, 10.0, 12.0, 10.0]) | 
|  | 93 | +        ); | 
|  | 94 | +        assert_eq!( | 
|  | 95 | +            simd_fmin(a, simd_mul(b, f16x4::splat(4.0))), | 
|  | 96 | +            f16x4::from_array([4.0, 8.0, 10.0, -16.0]) | 
|  | 97 | +        ); | 
|  | 98 | + | 
|  | 99 | +        assert_eq!(simd_fma(a, b, a), simd_add(simd_mul(a, b), a)); | 
|  | 100 | +        assert_eq!(simd_fma(b, b, a), simd_add(simd_mul(b, b), a)); | 
|  | 101 | +        assert_eq!(simd_fma(a, b, b), simd_add(simd_mul(a, b), b)); | 
|  | 102 | +        assert_eq!( | 
|  | 103 | +            simd_fma(f16x4::splat(-3.2), b, f16x4::splat(f16::NEG_INFINITY)), | 
|  | 104 | +            f16x4::splat(f16::NEG_INFINITY) | 
|  | 105 | +        ); | 
|  | 106 | + | 
|  | 107 | +        assert_eq!(simd_relaxed_fma(a, b, a), simd_add(simd_mul(a, b), a)); | 
|  | 108 | +        assert_eq!(simd_relaxed_fma(b, b, a), simd_add(simd_mul(b, b), a)); | 
|  | 109 | +        assert_eq!(simd_relaxed_fma(a, b, b), simd_add(simd_mul(a, b), b)); | 
|  | 110 | +        assert_eq!( | 
|  | 111 | +            simd_relaxed_fma(f16x4::splat(-3.2), b, f16x4::splat(f16::NEG_INFINITY)), | 
|  | 112 | +            f16x4::splat(f16::NEG_INFINITY) | 
|  | 113 | +        ); | 
|  | 114 | + | 
|  | 115 | +        assert_eq!(simd_eq(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([0, !0, 0, 0])); | 
|  | 116 | +        assert_eq!(simd_ne(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([!0, 0, !0, !0])); | 
|  | 117 | +        assert_eq!(simd_le(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([0, !0, !0, 0])); | 
|  | 118 | +        assert_eq!(simd_lt(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([0, 0, !0, 0])); | 
|  | 119 | +        assert_eq!(simd_ge(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([!0, !0, 0, !0])); | 
|  | 120 | +        assert_eq!(simd_gt(a, simd_mul(f16x4::splat(5.0), b)), i32x4::from_array([!0, 0, 0, !0])); | 
|  | 121 | + | 
|  | 122 | +        assert_eq!(simd_reduce_add_ordered(a, 0.0), 40.0f16); | 
|  | 123 | +        assert_eq!(simd_reduce_add_ordered(b, 0.0), 2.0f16); | 
|  | 124 | +        assert_eq!(simd_reduce_mul_ordered(a, 1.0), 10000.0f16); | 
|  | 125 | +        assert_eq!(simd_reduce_mul_ordered(b, 1.0), -24.0f16); | 
|  | 126 | +        assert_eq!(simd_reduce_max(a), 10.0f16); | 
|  | 127 | +        assert_eq!(simd_reduce_max(b), 3.0f16); | 
|  | 128 | +        assert_eq!(simd_reduce_min(a), 10.0f16); | 
|  | 129 | +        assert_eq!(simd_reduce_min(b), -4.0f16); | 
|  | 130 | + | 
|  | 131 | +        assert_eq!( | 
|  | 132 | +            simd_fmax(f16x2::from_array([0.0, f16::NAN]), f16x2::from_array([f16::NAN, 0.0])), | 
|  | 133 | +            f16x2::from_array([0.0, 0.0]) | 
|  | 134 | +        ); | 
|  | 135 | +        assert_eq!(simd_reduce_max(f16x2::from_array([0.0, f16::NAN])), 0.0f16); | 
|  | 136 | +        assert_eq!(simd_reduce_max(f16x2::from_array([f16::NAN, 0.0])), 0.0f16); | 
|  | 137 | +        assert_eq!( | 
|  | 138 | +            simd_fmin(f16x2::from_array([0.0, f16::NAN]), f16x2::from_array([f16::NAN, 0.0])), | 
|  | 139 | +            f16x2::from_array([0.0, 0.0]) | 
|  | 140 | +        ); | 
|  | 141 | +        assert_eq!(simd_reduce_min(f16x2::from_array([0.0, f16::NAN])), 0.0f16); | 
|  | 142 | +        assert_eq!(simd_reduce_min(f16x2::from_array([f16::NAN, 0.0])), 0.0f16); | 
|  | 143 | +    } | 
|  | 144 | +} | 
|  | 145 | + | 
| 21 | 146 | fn simd_ops_f32() { | 
| 22 | 147 |     let a = f32x4::splat(10.0); | 
| 23 | 148 |     let b = f32x4::from_array([1.0, 2.0, 3.0, -4.0]); | 
| @@ -148,6 +273,87 @@ fn simd_ops_f64() { | 
| 148 | 273 |     assert_eq!(f64x2::from_array([f64::NAN, 0.0]).reduce_min(), 0.0); | 
| 149 | 274 | } | 
| 150 | 275 | 
 | 
|  | 276 | +pub fn simd_ops_f128() { | 
|  | 277 | +    use intrinsics::*; | 
|  | 278 | + | 
|  | 279 | +    // small hack to make type inference better | 
|  | 280 | +    macro_rules! assert_eq { | 
|  | 281 | +        ($a:expr, $b:expr $(,$t:tt)*) => {{ | 
|  | 282 | +            let a = $a; | 
|  | 283 | +            let b = $b; | 
|  | 284 | +            if false { let _inference = b == a; } | 
|  | 285 | +            ::std::assert_eq!(a, b, $(,$t)*) | 
|  | 286 | +        }} | 
|  | 287 | +    } | 
|  | 288 | + | 
|  | 289 | +    let a = f128x4::splat(10.0); | 
|  | 290 | +    let b = f128x4::from_array([1.0, 2.0, 3.0, -4.0]); | 
|  | 291 | + | 
|  | 292 | +    unsafe { | 
|  | 293 | +        assert_eq!(simd_neg(b), f128x4::from_array([-1.0, -2.0, -3.0, 4.0])); | 
|  | 294 | +        assert_eq!(simd_add(a, b), f128x4::from_array([11.0, 12.0, 13.0, 6.0])); | 
|  | 295 | +        assert_eq!(simd_sub(a, b), f128x4::from_array([9.0, 8.0, 7.0, 14.0])); | 
|  | 296 | +        assert_eq!(simd_mul(a, b), f128x4::from_array([10.0, 20.0, 30.0, -40.0])); | 
|  | 297 | +        assert_eq!(simd_div(b, a), f128x4::from_array([0.1, 0.2, 0.3, -0.4])); | 
|  | 298 | +        assert_eq!(simd_div(a, f128x4::splat(2.0)), f128x4::splat(5.0)); | 
|  | 299 | +        assert_eq!(simd_rem(a, b), f128x4::from_array([0.0, 0.0, 1.0, 2.0])); | 
|  | 300 | +        assert_eq!(simd_fabs(b), f128x4::from_array([1.0, 2.0, 3.0, 4.0])); | 
|  | 301 | +        assert_eq!( | 
|  | 302 | +            simd_fmax(a, simd_mul(b, f128x4::splat(4.0))), | 
|  | 303 | +            f128x4::from_array([10.0, 10.0, 12.0, 10.0]) | 
|  | 304 | +        ); | 
|  | 305 | +        assert_eq!( | 
|  | 306 | +            simd_fmin(a, simd_mul(b, f128x4::splat(4.0))), | 
|  | 307 | +            f128x4::from_array([4.0, 8.0, 10.0, -16.0]) | 
|  | 308 | +        ); | 
|  | 309 | + | 
|  | 310 | +        assert_eq!(simd_fma(a, b, a), simd_add(simd_mul(a, b), a)); | 
|  | 311 | +        assert_eq!(simd_fma(b, b, a), simd_add(simd_mul(b, b), a)); | 
|  | 312 | +        assert_eq!(simd_fma(a, b, b), simd_add(simd_mul(a, b), b)); | 
|  | 313 | +        assert_eq!( | 
|  | 314 | +            simd_fma(f128x4::splat(-3.2), b, f128x4::splat(f128::NEG_INFINITY)), | 
|  | 315 | +            f128x4::splat(f128::NEG_INFINITY) | 
|  | 316 | +        ); | 
|  | 317 | + | 
|  | 318 | +        assert_eq!(simd_relaxed_fma(a, b, a), simd_add(simd_mul(a, b), a)); | 
|  | 319 | +        assert_eq!(simd_relaxed_fma(b, b, a), simd_add(simd_mul(b, b), a)); | 
|  | 320 | +        assert_eq!(simd_relaxed_fma(a, b, b), simd_add(simd_mul(a, b), b)); | 
|  | 321 | +        assert_eq!( | 
|  | 322 | +            simd_relaxed_fma(f128x4::splat(-3.2), b, f128x4::splat(f128::NEG_INFINITY)), | 
|  | 323 | +            f128x4::splat(f128::NEG_INFINITY) | 
|  | 324 | +        ); | 
|  | 325 | + | 
|  | 326 | +        assert_eq!(simd_eq(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([0, !0, 0, 0])); | 
|  | 327 | +        assert_eq!(simd_ne(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([!0, 0, !0, !0])); | 
|  | 328 | +        assert_eq!(simd_le(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([0, !0, !0, 0])); | 
|  | 329 | +        assert_eq!(simd_lt(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([0, 0, !0, 0])); | 
|  | 330 | +        assert_eq!(simd_ge(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([!0, !0, 0, !0])); | 
|  | 331 | +        assert_eq!(simd_gt(a, simd_mul(f128x4::splat(5.0), b)), i32x4::from_array([!0, 0, 0, !0])); | 
|  | 332 | + | 
|  | 333 | +        assert_eq!(simd_reduce_add_ordered(a, 0.0), 40.0f128); | 
|  | 334 | +        assert_eq!(simd_reduce_add_ordered(b, 0.0), 2.0f128); | 
|  | 335 | +        assert_eq!(simd_reduce_mul_ordered(a, 1.0), 10000.0f128); | 
|  | 336 | +        assert_eq!(simd_reduce_mul_ordered(b, 1.0), -24.0f128); | 
|  | 337 | +        assert_eq!(simd_reduce_max(a), 10.0f128); | 
|  | 338 | +        assert_eq!(simd_reduce_max(b), 3.0f128); | 
|  | 339 | +        assert_eq!(simd_reduce_min(a), 10.0f128); | 
|  | 340 | +        assert_eq!(simd_reduce_min(b), -4.0f128); | 
|  | 341 | + | 
|  | 342 | +        assert_eq!( | 
|  | 343 | +            simd_fmax(f128x2::from_array([0.0, f128::NAN]), f128x2::from_array([f128::NAN, 0.0])), | 
|  | 344 | +            f128x2::from_array([0.0, 0.0]) | 
|  | 345 | +        ); | 
|  | 346 | +        assert_eq!(simd_reduce_max(f128x2::from_array([0.0, f128::NAN])), 0.0f128); | 
|  | 347 | +        assert_eq!(simd_reduce_max(f128x2::from_array([f128::NAN, 0.0])), 0.0f128); | 
|  | 348 | +        assert_eq!( | 
|  | 349 | +            simd_fmin(f128x2::from_array([0.0, f128::NAN]), f128x2::from_array([f128::NAN, 0.0])), | 
|  | 350 | +            f128x2::from_array([0.0, 0.0]) | 
|  | 351 | +        ); | 
|  | 352 | +        assert_eq!(simd_reduce_min(f128x2::from_array([0.0, f128::NAN])), 0.0f128); | 
|  | 353 | +        assert_eq!(simd_reduce_min(f128x2::from_array([f128::NAN, 0.0])), 0.0f128); | 
|  | 354 | +    } | 
|  | 355 | +} | 
|  | 356 | + | 
| 151 | 357 | fn simd_ops_i32() { | 
| 152 | 358 |     let a = i32x4::splat(10); | 
| 153 | 359 |     let b = i32x4::from_array([1, 2, 3, -4]); | 
| @@ -563,6 +769,31 @@ fn simd_gather_scatter() { | 
| 563 | 769 | } | 
| 564 | 770 | 
 | 
| 565 | 771 | fn simd_round() { | 
|  | 772 | +    unsafe { | 
|  | 773 | +        use intrinsics::*; | 
|  | 774 | + | 
|  | 775 | +        assert_eq!( | 
|  | 776 | +            simd_ceil(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 777 | +            f16x4::from_array([1.0, 2.0, 2.0, -4.0]) | 
|  | 778 | +        ); | 
|  | 779 | +        assert_eq!( | 
|  | 780 | +            simd_floor(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 781 | +            f16x4::from_array([0.0, 1.0, 2.0, -5.0]) | 
|  | 782 | +        ); | 
|  | 783 | +        assert_eq!( | 
|  | 784 | +            simd_round(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 785 | +            f16x4::from_array([1.0, 1.0, 2.0, -5.0]) | 
|  | 786 | +        ); | 
|  | 787 | +        assert_eq!( | 
|  | 788 | +            simd_round_ties_even(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 789 | +            f16x4::from_array([1.0, 1.0, 2.0, -4.0]) | 
|  | 790 | +        ); | 
|  | 791 | +        assert_eq!( | 
|  | 792 | +            simd_trunc(f16x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 793 | +            f16x4::from_array([0.0, 1.0, 2.0, -4.0]) | 
|  | 794 | +        ); | 
|  | 795 | +    } | 
|  | 796 | + | 
| 566 | 797 |     assert_eq!( | 
| 567 | 798 |         f32x4::from_array([0.9, 1.001, 2.0, -4.5]).ceil(), | 
| 568 | 799 |         f32x4::from_array([1.0, 2.0, 2.0, -4.0]) | 
| @@ -604,6 +835,31 @@ fn simd_round() { | 
| 604 | 835 |         f64x4::from_array([0.9, 1.001, 2.0, -4.5]).trunc(), | 
| 605 | 836 |         f64x4::from_array([0.0, 1.0, 2.0, -4.0]) | 
| 606 | 837 |     ); | 
|  | 838 | + | 
|  | 839 | +    unsafe { | 
|  | 840 | +        use intrinsics::*; | 
|  | 841 | + | 
|  | 842 | +        assert_eq!( | 
|  | 843 | +            simd_ceil(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 844 | +            f128x4::from_array([1.0, 2.0, 2.0, -4.0]) | 
|  | 845 | +        ); | 
|  | 846 | +        assert_eq!( | 
|  | 847 | +            simd_floor(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 848 | +            f128x4::from_array([0.0, 1.0, 2.0, -5.0]) | 
|  | 849 | +        ); | 
|  | 850 | +        assert_eq!( | 
|  | 851 | +            simd_round(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 852 | +            f128x4::from_array([1.0, 1.0, 2.0, -5.0]) | 
|  | 853 | +        ); | 
|  | 854 | +        assert_eq!( | 
|  | 855 | +            simd_round_ties_even(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 856 | +            f128x4::from_array([1.0, 1.0, 2.0, -4.0]) | 
|  | 857 | +        ); | 
|  | 858 | +        assert_eq!( | 
|  | 859 | +            simd_trunc(f128x4::from_array([0.9, 1.001, 2.0, -4.5])), | 
|  | 860 | +            f128x4::from_array([0.0, 1.0, 2.0, -4.0]) | 
|  | 861 | +        ); | 
|  | 862 | +    } | 
| 607 | 863 | } | 
| 608 | 864 | 
 | 
| 609 | 865 | fn simd_intrinsics() { | 
| @@ -724,8 +980,10 @@ fn simd_ops_non_pow2() { | 
| 724 | 980 | 
 | 
| 725 | 981 | fn main() { | 
| 726 | 982 |     simd_mask(); | 
|  | 983 | +    simd_ops_f16(); | 
| 727 | 984 |     simd_ops_f32(); | 
| 728 | 985 |     simd_ops_f64(); | 
|  | 986 | +    simd_ops_f128(); | 
| 729 | 987 |     simd_ops_i32(); | 
| 730 | 988 |     simd_ops_non_pow2(); | 
| 731 | 989 |     simd_cast(); | 
|  | 
0 commit comments