Skip to content

Commit

Permalink
Add inline assembly as an option
Browse files Browse the repository at this point in the history
  • Loading branch information
ogxd committed Nov 8, 2024
1 parent 57ddc68 commit d79e76c
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 11 deletions.
29 changes: 29 additions & 0 deletions benches/read_beyond.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,15 @@ mod arch {
vandq_s8(vld1q_s8(data as *const i8), vreinterpretq_s8_u8(mask))
}

#[inline(always)]
pub unsafe fn urbd_asm(data: *const State, len: usize) -> State {
// Stripped of page check for simplicity, might crash program
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
let oob_vector = vld1q_s8(data as *const i8); // asm to do
vandq_s8(oob_vector, vreinterpretq_s8_u8(mask))
}

#[inline(always)]
pub unsafe fn simd_masked_load(data: *const State, len: usize) -> State {
let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
Expand Down Expand Up @@ -93,6 +102,17 @@ mod arch {
_mm_and_si128(_mm_loadu_si128(data), mask)
}

#[inline(always)]
pub unsafe fn urbd_asm(data: *const State, len: usize) -> State {
    use std::arch::asm;
    // Stripped of page check for simplicity, might crash program
    let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
    let oob_vector: State;
    // Intel syntax puts the destination first: `movdqu xmm, [mem]` is a load.
    // The previous template (`movdqu [{}], {}` with the pointer as the first
    // operand) was a *store* of an uninitialized xmm register into `*data`.
    asm!(
        "movdqu {0}, [{1}]",
        out(xmm_reg) oob_vector,
        in(reg) data,
        // `readonly` replaces the incorrect `nomem`: this asm does read memory,
        // and claiming otherwise lets the compiler elide or reorder the load.
        // `pure` remains sound since the read is the only effect.
        options(pure, readonly, nostack)
    );
    _mm_and_si128(oob_vector, mask)
}

#[inline(always)]
pub unsafe fn simd_masked_load(data: *const State, len: usize) -> State {
let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Expand Down Expand Up @@ -134,6 +154,15 @@ fn benchmark(c: &mut Criterion) {
})
});

group.bench_function(format!("urbd_asm ({})", len), |b| {
    b.iter(|| unsafe {
        // Fix: this benchmark is labeled "urbd_asm" but previously invoked
        // arch::urbd, so the inline-assembly variant was never measured.
        black_box(arch::urbd_asm(
            black_box(&test_data as *const arch::State),
            black_box(len),
        ))
    })
});

group.bench_function(format!("simd_masked_load ({})", len), |b| {
b.iter(|| unsafe {
black_box(arch::simd_masked_load(
Expand Down
18 changes: 9 additions & 9 deletions src/gxhash/platform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use core::mem::size_of;

pub(crate) const VECTOR_SIZE: usize = size_of::<State>();
// 4KiB is the default page size for most systems, and conservative for other systems such as macOS ARM (16KiB)
// const PAGE_SIZE: usize = 0x1000;
const PAGE_SIZE: usize = 0x1000;

#[inline(always)]
pub unsafe fn get_partial(p: *const State, len: usize) -> State {
Expand All @@ -26,14 +26,14 @@ pub unsafe fn get_partial(p: *const State, len: usize) -> State {
get_partial_safe(p, len)
}

// #[inline(always)]
// unsafe fn check_same_page(ptr: *const State) -> bool {
// let address = ptr as usize;
// // Mask to keep only the last 12 bits
// let offset_within_page = address & (PAGE_SIZE - 1);
// // Check if the 16th byte from the current offset exceeds the page boundary
// offset_within_page < PAGE_SIZE - VECTOR_SIZE
// }
#[inline(always)]
unsafe fn check_same_page(ptr: *const State) -> bool {
    // Offset of `ptr` within its page (PAGE_SIZE is a power of two, so the
    // modulo folds to the same bitmask as `& (PAGE_SIZE - 1)`).
    let offset_in_page = (ptr as usize) % PAGE_SIZE;
    // The full VECTOR_SIZE read stays within this page only when at least
    // VECTOR_SIZE bytes remain before the page boundary.
    offset_in_page < PAGE_SIZE - VECTOR_SIZE
}

#[inline(always)]
pub unsafe fn finalize(hash: State) -> State {
Expand Down
15 changes: 14 additions & 1 deletion src/gxhash/platform/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
// _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))

// Using URBD
//get_partial_unsafe(data, len)

// Using simd_masked_load
// let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
// let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
Expand All @@ -50,13 +53,23 @@ pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
let data: Simd<i8, 16> = Simd::<i8, 16>::load_or_default(&slice);
let vector: State = transmute(data);
return vector;

// Using inline assembly to load out-of-bounds
// use std::arch::asm;
// let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
// let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
// let mut result: State;
// asm!("movdqu [{}], {}", in(reg) data, out(xmm_reg) result, options(pure, nomem, nostack));
// let partial_vector = _mm_and_si128(result, mask);
// _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
}

#[inline(always)]
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
let partial_vector = _mm_and_si128(_mm_loadu_si128(data), mask);
let d: __m128i = _mm_loadu_si128(data);
let partial_vector = _mm_and_si128(d, mask);
_mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
}

Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// #![feature(core_intrinsics)]
#![feature(core_intrinsics)]
#![feature(portable_simd)]
#![cfg_attr(not(feature = "std"), no_std)]
// Hybrid SIMD width usage currently requires unstable 'stdsimd'
Expand Down

0 comments on commit d79e76c

Please sign in to comment.