Skip to content

Commit

Permalink
Merge pull request #30 from guzba/ryan
Browse files Browse the repository at this point in the history
more neon
  • Loading branch information
guzba authored Nov 23, 2024
2 parents be70f82 + 267bc92 commit 756463a
Showing 1 changed file with 170 additions and 16 deletions.
186 changes: 170 additions & 16 deletions src/nimsimd/neon.nim
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
{.push header: "arm_neon.h".}

type
int32x2* {.importc: "int32x2_t".} = object
int32x4* {.importc: "int32x4_t".} = object

uint8x16* {.importc: "uint8x16_t".} = object
uint16x8* {.importc: "uint16x8_t".} = object
uint32x4* {.importc: "uint32x4_t".} = object
Expand All @@ -12,6 +15,14 @@ type
uint32x2* {.importc: "uint32x2_t".} = object
uint64x1* {.importc: "uint64x1_t".} = object

float32x4* {.importc: "float32x4_t".} = object

float32x2* {.importc: "float32x2_t".} = object
float64x2* {.importc: "float64x2_t".} = object

int32x4x2* {.importc: "int32x4x2_t".} = object
val*: array[2, int32x4]

uint8x16x2* {.importc: "uint8x16x2_t".} = object
val*: array[2, uint8x16]
uint16x8x2* {.importc: "uint16x8x2_t".} = object
Expand Down Expand Up @@ -66,26 +77,38 @@ type
uint64x1x4* {.importc: "uint64x1x4_t".} = object
val*: array[4, uint64x1]

func vmov_n_f32*(a: float32): float32x2

func vmovq_n_u8*(a: uint8): uint8x16
func vmovq_n_u16*(a: uint16): uint16x8
func vmovq_n_u32*(a: uint32): uint32x4
func vmovq_n_u64*(a: uint64): uint64x2
func vmovq_n_f32*(a: float32): float32x4
func vmovq_n_f64*(a: float64): float64x2

func vmovq_n_s32*(a: int32): int32x4

func vmov_n_u8*(a: uint8): uint8x8
func vmov_n_u16*(a: uint16): uint16x4
func vmov_n_u32*(a: uint32): uint32x2
func vmov_n_u64*(a: uint64): uint64x1

func vld1q_s32*(p: pointer): int32x4

func vld1q_u8*(p: pointer): uint8x16
func vld1q_u16*(p: pointer): uint16x8
func vld1q_u32*(p: pointer): uint32x4
func vld1q_u64*(p: pointer): uint64x2

func vld1q_f32*(p: pointer): float32x4

func vld1_u8*(p: pointer): uint8x8
func vld1_u16*(p: pointer): uint16x4
func vld1_u32*(p: pointer): uint32x2
func vld1_u64*(p: pointer): uint64x1

func vld1_f32*(p: pointer): float32x2

func vceq_u8*(a, b: uint8x8): uint8x8
func vceq_u16*(a, b: uint16x4): uint16x4
func vceq_u32*(a, b: uint32x2): uint32x2
Expand All @@ -95,16 +118,31 @@ func vceqq_u8*(a, b: uint8x16): uint8x16
func vceqq_u16*(a, b: uint16x8): uint16x8
func vceqq_u32*(a, b: uint32x4): uint32x4
func vceqq_u64*(a, b: uint64x2): uint64x2
func vceqq_f32*(a, b: float32x4): uint32x4

func vcltq_u8*(a, b: uint8x16): uint8x16
func vcltq_u16*(a, b: uint16x8): uint16x8
func vcltq_u32*(a, b: uint32x4): uint32x4
func vcltq_u64*(a, b: uint64x2): uint64x2
func vcltq_f32*(a, b: float32x4): uint32x4

func vcgtq_f32*(a, b: float32x4): uint32x4
func vcgtq_u32*(a, b: uint32x4): uint32x4

func vcleq_f32*(a, b: float32x4): uint32x4
func vcleq_s32*(a, b: int32x4): uint32x4

func vcgeq_f32*(a, b: float32x4): uint32x4
func vcgeq_s32*(a, b: int32x4): uint32x4

func vdivq_f32*(a, b: float32x4): float32x4

func vpaddlq_u8*(a: uint8x16): uint16x8
func vpaddlq_u16*(a: uint16x8): uint32x4
func vpaddlq_u32*(a: uint32x4): uint64x2

func vpaddd_u64*(a: uint64x2): uint64

func vadd_u8*(a, b: uint8x8): uint8x8
func vadd_u16*(a, b: uint16x4): uint16x4
func vadd_u32*(a, b: uint32x2): uint32x2
Expand All @@ -118,6 +156,14 @@ func vaddq_u8*(a, b: uint8x16): uint8x16
func vaddq_u16*(a, b: uint16x8): uint16x8
func vaddq_u32*(a, b: uint32x4): uint32x4
func vaddq_u64*(a, b: uint64x2): uint64x2
func vaddq_f32*(a, b: float32x4): float32x4

func vadd_f32*(a, b: float32x2): float32x2

func vsub_f32*(a, b: float32x2): float32x2

func vmul_n_f32*(a: float32x2, b: float32): float32x2
func vmulq_n_f32*(a: float32x4, b: float32): float32x4

func vpadalq_u8*(a: uint16x8, b: uint8x16): uint16x8
func vpadalq_u16*(a: uint32x4, b: uint16x8): uint32x4
Expand All @@ -131,26 +177,33 @@ func vmlal_u8*(a: uint16x8, b, c: uint8x8): uint16x8
func vmlal_u16*(a: uint32x4, b, c: uint16x4): uint32x4
func vmlal_u32*(a: uint64x2, b, c: uint32x2): uint64x2

func vst1q_lane_u8*(p: pointer, v: uint8x16, lane: int)
func vst1q_lane_u16*(p: pointer, v: uint16x8, lane: int)
func vst1q_lane_u32*(p: pointer, v: uint32x4, lane: int)
func vst1q_lane_u64*(p: pointer, v: uint64x2, lane: int)
func vst1q_lane_u8*(p: pointer, v: uint8x16, lane: int32)
func vst1q_lane_u16*(p: pointer, v: uint16x8, lane: int32)
func vst1q_lane_u32*(p: pointer, v: uint32x4, lane: int32)
func vst1q_lane_u64*(p: pointer, v: uint64x2, lane: int32)

func vld1q_lane_u8*(p: pointer, v: uint8x16, lane: int): uint8x16
func vld1q_lane_u16*(p: pointer, v: uint16x8, lane: int): uint16x8
func vld1q_lane_u32*(p: pointer, v: uint32x4, lane: int): uint32x4
func vld1q_lane_u64*(p: pointer, v: uint64x2, lane: int): uint64x2
func vld1q_lane_u8*(p: pointer, v: uint8x16, lane: int32): uint8x16
func vld1q_lane_u16*(p: pointer, v: uint16x8, lane: int32): uint16x8
func vld1q_lane_u32*(p: pointer, v: uint32x4, lane: int32): uint32x4
func vld1q_lane_u64*(p: pointer, v: uint64x2, lane: int32): uint64x2

func vld1q_dup_f64*(p: pointer): float64x2

func vst1q_s32*(p: pointer, v: int32x4)

func vst1q_u8*(p: pointer, v: uint8x16)
func vst1q_u16*(p: pointer, v: uint16x8)
func vst1q_u32*(p: pointer, v: uint32x4)
func vst1q_u64*(p: pointer, v: uint64x2)
func vst1q_f32*(p: pointer, v: float32x4)

func vst1_u8*(p: pointer, v: uint8x8)
func vst1_u16*(p: pointer, v: uint16x4)
func vst1_u32*(p: pointer, v: uint32x2)
func vst1_u64*(p: pointer, v: uint64x1)

func vst1_f32*(p: pointer, v: float32x2)

func vst2_u8*(p: pointer, v: uint8x8x2)
func vst2_u16*(p: pointer, v: uint16x4x2)
func vst2_u32*(p: pointer, v: uint32x2x2)
Expand Down Expand Up @@ -190,11 +243,13 @@ func vget_low_u8*(a: uint8x16): uint8x8
func vget_low_u16*(a: uint16x8): uint16x4
func vget_low_u32*(a: uint32x4): uint32x2
func vget_low_u64*(a: uint64x2): uint64x1
func vget_low_f32*(a: float32x4): float32x2

func vget_high_u8*(a: uint8x16): uint8x8
func vget_high_u16*(a: uint16x8): uint16x4
func vget_high_u32*(a: uint32x4): uint32x2
func vget_high_u64*(a: uint64x2): uint64x1
func vget_high_f32*(a: float32x4): float32x2

func vld4_u8*(p: pointer): uint8x8x4
func vld4_u16*(p: pointer): uint16x4x4
Expand All @@ -206,6 +261,8 @@ func vld4q_u16*(p: pointer): uint16x8x4
func vld4q_u32*(p: pointer): uint32x4x4
func vld4q_u64*(p: pointer): uint64x2x4

func vst2q_s32*(p: pointer, a: int32x4x2)

func vst4q_u8*(p: pointer, a: uint8x16x4)
func vst4q_u16*(p: pointer, a: uint16x8x4)
func vst4q_u32*(p: pointer, a: uint32x4x4)
Expand All @@ -231,15 +288,20 @@ func vzip1q_u16*(a, b: uint16x8): uint16x8
func vzip1q_u32*(a, b: uint32x4): uint32x4
func vzip1q_u64*(a, b: uint64x2): uint64x2

func vget_lane_u8*(a: uint8x8, lane: int): uint8
func vget_lane_u16*(a: uint16x4, lane: int): uint16
func vget_lane_u32*(a: uint32x2, lane: int): uint32
func vget_lane_u64*(a: uint64x1, lane: int): uint64
func vget_lane_u8*(a: uint8x8, lane: int32): uint8
func vget_lane_u16*(a: uint16x4, lane: int32): uint16
func vget_lane_u32*(a: uint32x2, lane: int32): uint32
func vget_lane_u64*(a: uint64x1, lane: int32): uint64

func vget_lane_s32*(a: int32x2, lane: int32): int32

func vgetq_lane_s32*(a: int32x4, lane: int32): int32

func vgetq_lane_u8*(a: uint8x16, lane: int): uint8
func vgetq_lane_u16*(a: uint16x8, lane: int): uint16
func vgetq_lane_u32*(a: uint32x4, lane: int): uint32
func vgetq_lane_u64*(a: uint64x2, lane: int): uint64
func vgetq_lane_u8*(a: uint8x16, lane: int32): uint8
func vgetq_lane_u16*(a: uint16x8, lane: int32): uint16
func vgetq_lane_u32*(a: uint32x4, lane: int32): uint32
func vgetq_lane_u64*(a: uint64x2, lane: int32): uint64
func vgetq_lane_f32*(a: float32x4, lane: int32): float32

func vaddl_u8*(a, b: uint8x8): uint16x8
func vaddl_u16*(a, b: uint16x4): uint32x4
Expand Down Expand Up @@ -299,6 +361,7 @@ func vsubq_u8*(a, b: uint8x16): uint8x16
func vsubq_u16*(a, b: uint16x8): uint16x8
func vsubq_u32*(a, b: uint32x4): uint32x4
func vsubq_u64*(a, b: uint64x2): uint64x2
func vsubq_f32*(a, b: float32x4): float32x4

func vzip_u8*(a, b: uint8x8): uint8x8x2
func vzip_u16*(a, b: uint16x4): uint16x4x2
Expand All @@ -310,4 +373,95 @@ func vmovl_u32*(a: uint32x2): uint64x2

func vtbl1_u8*(a, idx: uint8x8): uint8x8

func vbslq_s32*(a: uint32x4, b, c: int32x4): int32x4
func vbslq_u8*(a, b, c: uint8x16): uint8x16
func vbslq_u32*(a, b, c: uint32x4): uint32x4
func vbslq_f32*(a: uint32x4, b, c: float32x4): float32x4

func vminq_u32*(a, b: uint32x4): uint32x4
func vminq_f32*(a, b: float32x4): float32x4
func vmaxq_f32*(a, b: float32x4): float32x4

func vmul_f32*(a, b: float32x2): float32x2
func vmulq_f32*(a, b: float32x4): float32x4

# func vcvtq_f32_u32*(a: uint32x4): float32x4
# func vcvtq_u32_f32*(a: float32x4): uint32x4
func vcvtq_s32_f32*(a: float32x4): int32x4
func vcvtq_f32_s32*(a: int32x4): float32x4

func vextq_u32*(a, b: uint32x4, n: int): uint32x4
func vextq_u64*(a, b: uint64x2, n: int): uint64x2
func vextq_f64*(a, b: float64x2, n: int): float64x2

func vminvq_u32*(a: uint32x4): uint32
func vminvq_f32*(a: float32x4): float32

func vmaxq_u8*(a, b: uint8x16): uint8x16

func vmaxvq_u8*(a: uint8x16): uint8
func vmaxvq_u32*(a: uint32x4): uint32
func vmaxvq_f32*(a: float32x4): float32

func vmvnq_u8*(a: uint8x16): uint8x16

func vmvnq_u32*(a: uint32x4): uint32x4

func vornq_u8*(a, b: uint8x16): uint8x16

func vqtbl1q_u8*(t: uint8x16, idx: uint8x16): uint8x16

func vcntq_u8*(a: uint8x16): uint8x16

func vpaddq_u32*(a, b: uint32x4): uint32x4

func vsetq_lane_u64*(a: uint64, b: uint64x2, lane: int32): uint64x2
func vsetq_lane_u8*(a: uint8, b: uint8x16, lane: int32): uint8x16

func vshlq_u32*(a, b: uint32x4): uint32x4

func vaddlvq_u32*(a: uint32x4): uint64

func vdupq_laneq_u32*(a: uint32x4, lane: int32): uint32x4

func vshrq_n_s32*(a: int32x4, n: int32): int32x4

func vtstq_u32*(a, b: uint32x4): uint32x4

func vcombine_f32*(a, b: float32x2): float32x4

func vfmaq_f32*(a, b, c: float32x4): float32x4
func vmlaq_f32*(a, b, c: float32x4): float32x4

func vmla_f32*(a, b, c: float32x2): float32x2
func vmla_n_f32*(a, b: float32x2, c: float32): float32x2
func vmlaq_n_f32*(a, b: float32x4, c: float32): float32x4

func vpadds_f32*(a: float32x2): float32

func vreinterpretq_u8_f32*(a: float32x4): uint8x16
func vreinterpretq_u32_f32*(a: float32x4): uint32x4
func vreinterpretq_f32_f64*(a: float64x2): float32x4
func vreinterpretq_u32_u64*(a: uint64x2): uint32x4
func vreinterpretq_u64_u32*(a: uint32x4): uint64x2
func vreinterpretq_u8_u32*(a: uint32x4): uint8x16
func vreinterpretq_u32_u8*(a: uint8x16): uint32x4
func vreinterpretq_u8_s32*(a: int32x4): uint8x16
func vreinterpretq_s32_u8*(a: uint8x16): int32x4
func vreinterpretq_s32_u32*(a: uint32x4): int32x4
func vreinterpretq_u16_u8*(a: uint8x16): uint16x8

func vreinterpret_u64_u8*(a: uint8x8): uint64x1
func vreinterpret_s32_u32*(a: uint32x2): int32x2
func vreinterpret_f32_u64*(a: uint64x1): float32x2

{.pop.}

func uint64x2_immediate*(hi, lo: static uint64): uint64x2 {.inline.} =
{.emit: [result, " = (uint64x2_t){", lo, ",", hi, "};"].}

func uint32x4_immediate*(v3, v2, v1, v0: static uint32): uint32x4 {.inline.} =
{.emit: [result, " = (uint32x4_t){", v0, ",", v1, ",", v2, ",", v3, "};"].}

func uint8x16_immediate*(v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0: static uint8): uint8x16 {.inline.} =
{.emit: [result, " = (uint8x16_t){", v0, ",", v1, ",", v2, ",", v3, ",", v4, ",", v5, ",", v6, ",", v7, ",", v8, ",", v9, ",", v10, ",", v11, ",", v12, ",", v13, ",", v14, ",", v15, "};"].}

0 comments on commit 756463a

Please sign in to comment.