From 5fd8ca6e8eac1bbd6c1f93fe82e655dbf760e09f Mon Sep 17 00:00:00 2001
From: Eduard M <4310497+edisile@users.noreply.github.com>
Date: Sun, 22 Jan 2023 23:03:27 +0100
Subject: [PATCH] Add bindings for more NEON instructions

The SIMD implementation of Adler32 in github.com/guzba/crunchy currently
does not compile on ARM due to missing bindings for the following
instructions:

- vld1q_lane_u32
- vpadalq_u16
- vaddw_u8
- vpadalq_u8
- vshlq_n_u32
- vmlal_u16
- vpadd_u32

I added the listed bindings and also took the opportunity to add bindings
for all operand sizes within the respective instruction families.
---
 src/nimsimd/neon.nim | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/nimsimd/neon.nim b/src/nimsimd/neon.nim
index 15574fc..21e6a95 100644
--- a/src/nimsimd/neon.nim
+++ b/src/nimsimd/neon.nim
@@ -105,16 +105,37 @@ func vadd_u16*(a, b: uint16x4): uint16x4
 func vadd_u32*(a, b: uint32x2): uint32x2
 func vadd_u64*(a, b: uint64x1): uint64x1
 
+func vpadd_u8*(a, b: uint8x8): uint8x8
+func vpadd_u16*(a, b: uint16x4): uint16x4
+func vpadd_u32*(a, b: uint32x2): uint32x2
+
 func vaddq_u8*(a, b: uint8x16): uint8x16
 func vaddq_u16*(a, b: uint16x8): uint16x8
 func vaddq_u32*(a, b: uint32x4): uint32x4
 func vaddq_u64*(a, b: uint64x2): uint64x2
 
+func vpadalq_u8*(a: uint16x8, b: uint8x16): uint16x8
+func vpadalq_u16*(a: uint32x4, b: uint16x8): uint32x4
+func vpadalq_u32*(a: uint64x2, b: uint32x4): uint64x2
+
+func vaddw_u8*(a: uint16x8, b: uint8x8): uint16x8
+func vaddw_u16*(a: uint32x4, b: uint16x4): uint32x4
+func vaddw_u32*(a: uint64x2, b: uint32x2): uint64x2
+
+func vmlal_u8*(a: uint16x8, b, c: uint8x8): uint16x8
+func vmlal_u16*(a: uint32x4, b, c: uint16x4): uint32x4
+func vmlal_u32*(a: uint64x2, b, c: uint32x2): uint64x2
+
 func vst1q_lane_u8*(p: pointer, v: uint8x16, lane: int)
 func vst1q_lane_u16*(p: pointer, v: uint16x8, lane: int)
 func vst1q_lane_u32*(p: pointer, v: uint32x4, lane: int)
 func vst1q_lane_u64*(p: pointer, v: uint64x2, lane: int)
 
+func vld1q_lane_u8*(p: pointer, v: uint8x16, lane: int): uint8x16
+func vld1q_lane_u16*(p: pointer, v: uint16x8, lane: int): uint16x8
+func vld1q_lane_u32*(p: pointer, v: uint32x4, lane: int): uint32x4
+func vld1q_lane_u64*(p: pointer, v: uint64x2, lane: int): uint64x2
+
 func vst1q_u8*(p: pointer, v: uint8x16)
 func vst1q_u16*(p: pointer, v: uint16x8)
 func vst1q_u32*(p: pointer, v: uint32x4)
@@ -218,6 +239,11 @@ func vshrq_n_u16*(a: uint16x8, n: int): uint16x8
 func vshrq_n_u32*(a: uint32x4, n: int): uint32x4
 func vshrq_n_u64*(a: uint64x2, n: int): uint64x2
 
+func vshlq_n_u8*(a: uint8x16, n: int): uint8x16
+func vshlq_n_u16*(a: uint16x8, n: int): uint16x8
+func vshlq_n_u32*(a: uint32x4, n: int): uint32x4
+func vshlq_n_u64*(a: uint64x2, n: int): uint64x2
+
 func vrshrq_n_u8*(a: uint8x16, n: int): uint8x16
 func vrshrq_n_u16*(a: uint16x8, n: int): uint16x8
 func vrshrq_n_u32*(a: uint32x4, n: int): uint32x4
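
For reference, a minimal sketch of the per-chunk update an Adler32-style
kernel performs with these bindings, using only procs added by this patch or
already visible in the hunks above. It is not part of the patch: the 16-byte
chunk is assumed to have been loaded elsewhere (for example with a vld1q_u8
binding, not shown here), and the weighted column sums that vaddw_u8,
vmlal_u16, vpadd_u32 and vld1q_lane_u32 would handle in a full kernel are
omitted.

import nimsimd/neon

proc adler32Chunk(chunk: uint8x16, vs1, vs2: var uint32x4) =
  # s2 gains 16 copies of the running byte sums: shift the vs1 lanes left by 4
  # (multiply by the 16-byte chunk width) and add them into vs2.
  vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1, 4))
  # Widen and accumulate the new bytes into vs1 without overflowing:
  # 16 x u8 pairs fold into 8 x u16, then 8 x u16 pairs fold into 4 x u32.
  var widened: uint16x8  # zero-initialised by Nim
  widened = vpadalq_u8(widened, chunk)
  vs1 = vpadalq_u16(vs1, widened)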