Skip to content

Commit

Permalink
add aarch64 to CI matrix
Browse files (browse the repository at this point in the history)
  • Loading branch information
llogiq committed Sep 23, 2023
1 parent 18ce5c1 commit e1cb1ae
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 35 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ jobs:
arch:
- i686
- x86_64
- aarch64
features:
- default
- runtime-dispatch-simd
Expand Down
55 changes: 20 additions & 35 deletions src/simd/x86_sse2.rs
Original file line number Diff line number Diff line change
@@ -1,31 +1,13 @@
#[cfg(target_arch = "x86")]
use std::arch::x86::{
__m128i,
_mm_and_si128,
_mm_cmpeq_epi8,
_mm_cvtsi128_si32,
_mm_loadu_si128,
_mm_sad_epu8,
_mm_set1_epi8,
_mm_setzero_si128,
_mm_shuffle_epi32,
_mm_sub_epi8,
_mm_xor_si128,
__m128i, _mm_and_si128, _mm_cmpeq_epi8, _mm_cvtsi128_si32, _mm_loadu_si128, _mm_sad_epu8,
_mm_set1_epi8, _mm_setzero_si128, _mm_shuffle_epi32, _mm_sub_epi8, _mm_xor_si128,
};

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{
__m128i,
_mm_and_si128,
_mm_cmpeq_epi8,
_mm_cvtsi128_si32,
_mm_loadu_si128,
_mm_sad_epu8,
_mm_set1_epi8,
_mm_setzero_si128,
_mm_shuffle_epi32,
_mm_sub_epi8,
_mm_xor_si128,
__m128i, _mm_and_si128, _mm_cmpeq_epi8, _mm_cvtsi128_si32, _mm_loadu_si128, _mm_sad_epu8,
_mm_set1_epi8, _mm_setzero_si128, _mm_shuffle_epi32, _mm_sub_epi8, _mm_xor_si128,
};

#[target_feature(enable = "sse2")]
Expand All @@ -39,8 +21,8 @@ pub unsafe fn mm_cmpneq_epi8(a: __m128i, b: __m128i) -> __m128i {
}

const MASK: [u8; 32] = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255,
];

#[target_feature(enable = "sse2")]
Expand Down Expand Up @@ -69,7 +51,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
for _ in 0..255 {
counts = _mm_sub_epi8(
counts,
_mm_cmpeq_epi8(mm_from_offset(haystack, offset), needles)
_mm_cmpeq_epi8(mm_from_offset(haystack, offset), needles),
);
offset += 16;
}
Expand All @@ -82,7 +64,7 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
for _ in 0..128 {
counts = _mm_sub_epi8(
counts,
_mm_cmpeq_epi8(mm_from_offset(haystack, offset), needles)
_mm_cmpeq_epi8(mm_from_offset(haystack, offset), needles),
);
offset += 16;
}
Expand All @@ -94,16 +76,16 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
for i in 0..(haystack.len() - offset) / 16 {
counts = _mm_sub_epi8(
counts,
_mm_cmpeq_epi8(mm_from_offset(haystack, offset + i * 16), needles)
_mm_cmpeq_epi8(mm_from_offset(haystack, offset + i * 16), needles),
);
}
if haystack.len() % 16 != 0 {
counts = _mm_sub_epi8(
counts,
_mm_and_si128(
_mm_cmpeq_epi8(mm_from_offset(haystack, haystack.len() - 16), needles),
mm_from_offset(&MASK, haystack.len() % 16)
)
mm_from_offset(&MASK, haystack.len() % 16),
),
);
}
count += sum(&counts);
Expand All @@ -113,7 +95,10 @@ pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {

#[target_feature(enable = "sse2")]
unsafe fn is_leading_utf8_byte(u8s: __m128i) -> __m128i {
mm_cmpneq_epi8(_mm_and_si128(u8s, _mm_set1_epu8(0b1100_0000)), _mm_set1_epu8(0b1000_0000))
mm_cmpneq_epi8(
_mm_and_si128(u8s, _mm_set1_epu8(0b1100_0000)),
_mm_set1_epu8(0b1000_0000),
)
}

#[target_feature(enable = "sse2")]
Expand All @@ -130,7 +115,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
for _ in 0..255 {
counts = _mm_sub_epi8(
counts,
is_leading_utf8_byte(mm_from_offset(utf8_chars, offset))
is_leading_utf8_byte(mm_from_offset(utf8_chars, offset)),
);
offset += 16;
}
Expand All @@ -143,7 +128,7 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
for _ in 0..128 {
counts = _mm_sub_epi8(
counts,
is_leading_utf8_byte(mm_from_offset(utf8_chars, offset))
is_leading_utf8_byte(mm_from_offset(utf8_chars, offset)),
);
offset += 16;
}
Expand All @@ -155,16 +140,16 @@ pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
for i in 0..(utf8_chars.len() - offset) / 16 {
counts = _mm_sub_epi8(
counts,
is_leading_utf8_byte(mm_from_offset(utf8_chars, offset + i * 16))
is_leading_utf8_byte(mm_from_offset(utf8_chars, offset + i * 16)),
);
}
if utf8_chars.len() % 16 != 0 {
counts = _mm_sub_epi8(
counts,
_mm_and_si128(
is_leading_utf8_byte(mm_from_offset(utf8_chars, utf8_chars.len() - 16)),
mm_from_offset(&MASK, utf8_chars.len() % 16)
)
mm_from_offset(&MASK, utf8_chars.len() % 16),
),
);
}
count += sum(&counts);
Expand Down

0 comments on commit e1cb1ae

Please sign in to comment.