nimsimd 1.3.0 -> 1.3.1

What this patch does (as visible in the hunks below):
  * nimsimd.nimble: bumps the package version.
  * sse2.nim / avx.nim: adds the `bycopy` pragma to the imported vector
    types M128, M128i, M128d, M256, M256i and M256d.
  * sse2.nim / avx.nim / avx2.nim: for a set of load/store intrinsics, the
    `importc` func now takes a typed pointer (ptr float32, ptr int32,
    ptr M128i or ptr M256i) and is no longer exported; an exported template
    with the previous `pointer` signature casts and forwards to it, so
    existing call sites keep compiling.
  * runtimecheck.nim: renames the enum members BM1/BM2 to BMI1/BMI2 and
    drops the trailing comment on CMPXCHG16B.

Reconstruction notes:
  * The line structure was restored from a whitespace-collapsed copy. Blank
    context lines were re-inserted so that every hunk matches its
    `@@ -a,b +c,d @@` line counts, and indentation was re-derived from Nim
    syntax. Runs of spaces inside a line (e.g. column alignment) could not
    be recovered -- verify against the repository before applying.
  * NOTE(review): two unchanged context lines look inconsistent with their
    siblings and are outside the scope of this patch: `mm_loadl_pd*` is
    declared to return M128i, and `mm256_storeu2_m128*` has no value
    parameter. Confirm against the Intel Intrinsics Guide.

diff --git a/nimsimd.nimble b/nimsimd.nimble
index 0bfdbd4..04093b0 100644
--- a/nimsimd.nimble
+++ b/nimsimd.nimble
@@ -1,4 +1,4 @@
-version = "1.3.0"
+version = "1.3.1"
 author = "Ryan Oldenburg"
 description = "Pleasant Nim bindings for SIMD instruction sets."
 license = "MIT"
diff --git a/src/nimsimd/avx.nim b/src/nimsimd/avx.nim
index cd8c226..eb53ec3 100644
--- a/src/nimsimd/avx.nim
+++ b/src/nimsimd/avx.nim
@@ -3,9 +3,9 @@ import sse42
 export sse42
 
 type
-  M256* {.importc: "__m256", header: "immintrin.h".} = object
-  M256i* {.importc: "__m256i", header: "immintrin.h".} = object
-  M256d* {.importc: "__m256d", header: "immintrin.h".} = object
+  M256* {.importc: "__m256", header: "immintrin.h", bycopy.} = object
+  M256i* {.importc: "__m256i", header: "immintrin.h", bycopy.} = object
+  M256d* {.importc: "__m256d", header: "immintrin.h", bycopy.} = object
 
 {.push header: "immintrin.h".}
 
@@ -157,9 +157,15 @@ func mm256_load_si256*(p: pointer): M256i {.importc: "_mm256_load_si256".}
 
 func mm256_loadu_pd*(p: pointer): M256d {.importc: "_mm256_loadu_pd".}
 
-func mm256_loadu_ps*(p: pointer): M256 {.importc: "_mm256_loadu_ps".}
+func mm256_loadu_ps(p: ptr float32): M256 {.importc: "_mm256_loadu_ps".}
 
-func mm256_loadu_si256*(p: pointer): M256i {.importc: "_mm256_loadu_si256".}
+template mm256_loadu_ps*(p: pointer): M256 =
+  mm256_loadu_ps(cast[ptr float32](p))
+
+func mm256_loadu_si256(p: ptr M256i): M256i {.importc: "_mm256_loadu_si256".}
+
+template mm256_loadu_si256*(p: pointer): M256i =
+  mm256_loadu_si256(cast[ptr M256i](p))
 
 func mm256_loadu2_m128*(hi, lo: pointer): M256 {.importc: "_mm256_loadu2_m128".}
 
@@ -309,9 +315,15 @@ func mm256_store_si256*(p: pointer, a: M256i) {.importc: "_mm256_store_si256".}
 
 func mm256_storeu_pd*(p: pointer, a: M256d) {.importc: "_mm256_storeu_pd".}
 
-func mm256_storeu_ps*(p: pointer, a: M256) {.importc: "_mm256_storeu_ps".}
+func mm256_storeu_ps(p: ptr float32, a: M256) {.importc: "_mm256_storeu_ps".}
+
+template mm256_storeu_ps*(p: pointer, a: M256) =
+  mm256_storeu_ps(cast[ptr float32](p), a)
+
+func mm256_storeu_si256(p: ptr M256i, a: M256i) {.importc: "_mm256_storeu_si256".}
 
-func mm256_storeu_si256*(p: pointer, a: M256i) {.importc: "_mm256_storeu_si256".}
+template mm256_storeu_si256*(p: pointer, a: M256i) =
+  mm256_storeu_si256(cast[ptr M256i](p), a)
 
 func mm256_storeu2_m128*(hi, lo: pointer) {.importc: "_mm256_storeu2_m128".}
 
diff --git a/src/nimsimd/avx2.nim b/src/nimsimd/avx2.nim
index 885d49f..1aff039 100644
--- a/src/nimsimd/avx2.nim
+++ b/src/nimsimd/avx2.nim
@@ -164,11 +164,17 @@ func mm256_madd_epi16*(a, b: M256i): M256i {.importc: "_mm256_madd_epi16".}
 
 func mm256_maddubs_epi16*(a, b: M256i): M256i {.importc: "_mm256_maddubs_epi16".}
 
-func mm256_maskload_epi32*(p: pointer, mask: M256i): M256i {.importc: "_mm256_maskload_epi32".}
+func mm256_maskload_epi32(p: ptr int32, mask: M256i): M256i {.importc: "_mm256_maskload_epi32".}
+
+template mm256_maskload_epi32*(p: pointer, mask: M256i): M256i =
+  mm256_maskload_epi32(cast[ptr int32](p), mask)
 
 func mm256_maskload_epi64*(p: pointer, mask: M256i): M256i {.importc: "_mm256_maskload_epi64".}
 
-func mm256_maskstore_epi32*(p: pointer, mask, a: M256i) {.importc: "_mm256_maskstore_epi32".}
+func mm256_maskstore_epi32(p: ptr int32, mask, a: M256i) {.importc: "_mm256_maskstore_epi32".}
+
+template mm256_maskstore_epi32*(p: pointer, mask, a: M256i) =
+  mm256_maskstore_epi32(cast[ptr int32](p), mask, a)
 
 func mm256_maskstore_epi64*(p: pointer, mask, a: M256i) {.importc: "_mm256_maskstore_epi64".}
 
diff --git a/src/nimsimd/runtimecheck.nim b/src/nimsimd/runtimecheck.nim
index 69cf484..8fe32df 100644
--- a/src/nimsimd/runtimecheck.nim
+++ b/src/nimsimd/runtimecheck.nim
@@ -12,10 +12,10 @@ when defined(amd64):
       PCLMULQDQ
       SHA
       AES
-      CMPXCHG16B # Atomic CompareExchange 16-byte, avail. since Haswell
+      CMPXCHG16B
       F16C
-      BM1
-      BM2
+      BMI1
+      BMI2
 
     InstructionSetCheckInfo = object
       leaf, register, bit: int
diff --git a/src/nimsimd/sse2.nim b/src/nimsimd/sse2.nim
index 1fa43be..1f00539 100644
--- a/src/nimsimd/sse2.nim
+++ b/src/nimsimd/sse2.nim
@@ -1,9 +1,9 @@
 ## SSE and SSE2 intrinsics
 
 type
-  M128* {.importc: "__m128", header: "xmmintrin.h".} = object
-  M128i* {.importc: "__m128i", header: "emmintrin.h".} = object
-  M128d* {.importc: "__m128d", header: "emmintrin.h".} = object
+  M128* {.importc: "__m128", header: "xmmintrin.h", bycopy.} = object
+  M128i* {.importc: "__m128i", header: "emmintrin.h", bycopy.} = object
+  M128d* {.importc: "__m128d", header: "emmintrin.h", bycopy.} = object
 
 template MM_SHUFFLE*(z, y, x, w: int | uint): int32 =
   ((z shl 6) or (y shl 4) or (x shl 2) or w).int32
@@ -191,7 +191,10 @@ func mm_store1_ps*(p: pointer, a: M128) {.importc: "_mm_store1_ps".}
 
 func mm_storer_ps*(p: pointer, a: M128) {.importc: "_mm_storer_ps".}
 
-func mm_storeu_ps*(p: pointer, a: M128) {.importc: "_mm_storeu_ps".}
+func mm_storeu_ps(p: ptr float32, a: M128) {.importc: "_mm_storeu_ps".}
+
+template mm_storeu_ps*(p: pointer, a: M128) =
+  mm_storeu_ps(cast[ptr float32](p), a)
 
 func mm_storeu_si16*(p: pointer, a: M128) {.importc: "_mm_storeu_si16".}
 
@@ -419,13 +422,19 @@ func mm_load_pd1*(p: pointer): M128d {.importc: "_mm_load_pd1".}
 
 func mm_load_sd*(p: pointer): M128d {.importc: "_mm_load_sd".}
 
-func mm_load_si128*(p: pointer): M128i {.importc: "_mm_load_si128".}
+func mm_load_si128(p: ptr M128i): M128i {.importc: "_mm_load_si128".}
+
+template mm_load_si128*(p: pointer): M128i =
+  mm_load_si128(cast[ptr M128i](p))
 
 func mm_load1_pd*(p: pointer): M128d {.importc: "_mm_load1_pd".}
 
 func mm_loadh_pd*(a: M128d, p: pointer): M128d {.importc: "_mm_loadh_pd".}
 
-func mm_loadl_epi64*(p: pointer): M128i {.importc: "_mm_loadl_epi64".}
+func mm_loadl_epi64(p: ptr M128i): M128i {.importc: "_mm_loadl_epi64".}
+
+template mm_loadl_epi64*(p: pointer): M128i =
+  mm_loadl_epi64(cast[ptr M128i](p))
 
 func mm_loadl_pd*(a: M128d, p: pointer): M128i {.importc: "_mm_loadl_pd".}
 
@@ -435,7 +444,10 @@ func mm_loadr_pd*(p: pointer): M128d {.importc: "_mm_loadr_pd".}
 
 func mm_loadu_pd*(p: pointer): M128d {.importc: "_mm_loadu_pd".}
 
-func mm_loadu_si128*(p: pointer): M128i {.importc: "_mm_loadu_si128".}
+func mm_loadu_si128(p: ptr M128i): M128i {.importc: "_mm_loadu_si128".}
+
+template mm_loadu_si128*(p: pointer): M128i =
+  mm_loadu_si128(cast[ptr M128i](p))
 
 func mm_loadu_si32*(p: pointer): M128i {.importc: "_mm_loadu_si32".}
 
@@ -595,7 +607,10 @@ func mm_storer_pd*(p: pointer, a: M128d) {.importc: "_mm_storer_pd".}
 
 func mm_storeu_pd*(p: pointer, a: M128d) {.importc: "_mm_storeu_pd".}
 
-func mm_storeu_si128*(p: pointer, a: M128i) {.importc: "_mm_storeu_si128".}
+func mm_storeu_si128(p: ptr M128i, a: M128i) {.importc: "_mm_storeu_si128".}
+
+template mm_storeu_si128*(p: pointer, a: M128i) =
+  mm_storeu_si128(cast[ptr M128i](p), a)
 
 func mm_storeu_si32*(p: pointer, a: M128i) {.importc: "_mm_storeu_si32".}
 