Use REP MOVSB/STOSB when the ERMSB feature is present (#392)
* Reorganize mem functions

This reduces the amount of platform-specific code

Signed-off-by: Joe Richey <[email protected]>

* Use ERMSB implementations if the feature is set

Signed-off-by: Joe Richey <[email protected]>

* Add non-aligned benchmarks

Signed-off-by: Joe Richey <[email protected]>
josephlr authored Nov 3, 2020
1 parent 0f2271e commit 63c0091
Showing 5 changed files with 148 additions and 88 deletions.
29 changes: 29 additions & 0 deletions src/mem/impls.rs
@@ -0,0 +1,29 @@
use super::c_int;

#[inline(always)]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
    let mut i = 0;
    while i < n {
        *dest.offset(i as isize) = *src.offset(i as isize);
        i += 1;
    }
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) {
    // copy from end
    let mut i = n;
    while i != 0 {
        i -= 1;
        *dest.offset(i as isize) = *src.offset(i as isize);
    }
}

#[inline(always)]
pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
    let mut i = 0;
    while i < n {
        *s.offset(i as isize) = c;
        i += 1;
    }
}
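The end-to-start order in copy_backward is what makes overlapping, memmove-style copies safe when dest lies inside the source range. A minimal safe-Rust sketch of the same traversal (a hypothetical illustration, not part of this commit):

// Overlapping copy within one buffer: shift buf[0..n] up to buf[2..n+2].
// Copying from the end avoids reading bytes that have already been
// overwritten, the same order copy_backward uses.
fn shift_right_by_two(buf: &mut [u8], n: usize) {
    let mut i = n;
    while i != 0 {
        i -= 1;
        buf[i + 2] = buf[i];
    }
}

fn main() {
    let mut buf = [1, 2, 3, 4, 5, 0, 0];
    shift_right_by_two(&mut buf, 5);
    assert_eq!(buf, [1, 2, 1, 2, 3, 4, 5]);
}

A forward loop over the same buffer would copy buf[0] into buf[2] before buf[2] was read, corrupting the result.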
41 changes: 0 additions & 41 deletions src/mem/memcpy.rs

This file was deleted.

28 changes: 26 additions & 2 deletions src/mem/mod.rs
@@ -11,8 +11,32 @@ use core::ops::{BitOr, Shl};

// memcpy/memmove/memset have optimized implementations on some architectures
#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
-mod memcpy;
-pub use self::memcpy::*;
mod impls;

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    impls::copy_forward(dest, src, n);
    dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    let delta = (dest as usize).wrapping_sub(src as usize);
    if delta >= n {
        // We can copy forwards because either dest is far enough ahead of src,
        // or src is ahead of dest (and delta overflowed).
        impls::copy_forward(dest, src, n);
    } else {
        impls::copy_backward(dest, src, n);
    }
    dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
    impls::set_bytes(s, c as u8, n);
    s
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
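The overlap check above is subtle: delta is computed with wrapping_sub, so the forward path is taken both when dest is at least n bytes past src (the regions are disjoint) and when src sits ahead of dest (the subtraction wraps around to a huge value). A small sketch of that decision on plain addresses (can_copy_forward is a hypothetical helper, not part of the commit):

// Mirrors memmove's direction check: copying forward is safe exactly
// when (dest - src), computed with wrapping arithmetic, is >= n.
fn can_copy_forward(dest: usize, src: usize, n: usize) -> bool {
    dest.wrapping_sub(src) >= n
}

fn main() {
    // src ahead of dest: the subtraction wraps to a huge value -> forward.
    assert!(can_copy_forward(100, 200, 50));
    // dest starts at or after src + n: the regions are disjoint -> forward.
    assert!(can_copy_forward(300, 200, 100));
    // dest overlaps the tail of src: a forward copy would clobber source
    // bytes before reading them -> must copy backward.
    assert!(!can_copy_forward(250, 200, 100));
}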
58 changes: 37 additions & 21 deletions src/mem/x86_64.rs
@@ -1,5 +1,3 @@
-use super::c_int;
-
// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
@@ -13,11 +11,26 @@ use super::c_int;
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
-// However, to avoid run-time feature detection, we don't use these byte-based
-// instructions for most of the copying, preferring the qword variants.
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    asm!(
        "rep movsb [rdi], [rsi]",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(nostack, preserves_flags)
    );
}

-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    let qword_count = count >> 3;
    let byte_count = count & 0b111;
    asm!(
@@ -30,18 +43,10 @@ pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) ->
inout("rsi") src => _,
options(nostack, preserves_flags)
);
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
let delta = (dest as usize).wrapping_sub(src as usize);
if delta >= count {
// We can copy forwards because either dest is far enough ahead of src,
// or src is ahead of dest (and delta overflowed).
return self::memcpy(dest, src, count);
}
// copy backwards
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
@@ -58,11 +63,23 @@ pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) ->
inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
options(nostack)
);
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
asm!(
"rep stosb [rdi], al",
inout("rcx") count => _,
inout("rdi") dest => _,
inout("al") c => _,
options(nostack, preserves_flags)
)
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
@@ -72,8 +89,7 @@ pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u
        byte_count = in(reg) byte_count,
        inout("rcx") qword_count => _,
        inout("rdi") dest => _,
-        in("rax") (c as u8 as u64) * 0x0101010101010101,
        in("rax") (c as u64) * 0x0101010101010101,
        options(nostack, preserves_flags)
    );
-    dest
}
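The (c as u64) * 0x0101010101010101 operand in the qword path broadcasts the fill byte into every byte of rax, so each qword store writes eight copies of c at once. A quick sanity check of the trick (hypothetical snippet, not from the commit):

fn broadcast(c: u8) -> u64 {
    // 0x0101010101010101 has a 1 in each byte, so multiplying by a byte
    // value replicates that byte across all eight bytes of the result.
    (c as u64) * 0x0101010101010101
}

fn main() {
    assert_eq!(broadcast(0xAB), 0xABAB_ABAB_ABAB_ABAB);
    assert_eq!(broadcast(0xFF).to_le_bytes(), [0xFF; 8]);
}

Note also that the target_feature = "ermsb" gates above are resolved at compile time, so the byte-based paths are only emitted when the build enables the feature, presumably with something like RUSTFLAGS="-C target-feature=+ermsb" or a target-cpu that implies it (the exact invocation is an assumption; the commit does not spell it out).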
80 changes: 56 additions & 24 deletions testcrate/benches/mem.rs
@@ -6,45 +6,45 @@ use test::{black_box, Bencher};
extern crate compiler_builtins;
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};

-fn memcpy_builtin(b: &mut Bencher, n: usize) {
-    let v1 = vec![1u8; n];
-    let mut v2 = vec![0u8; n];
fn memcpy_builtin(b: &mut Bencher, n: usize, offset: usize) {
    let v1 = vec![1u8; n + offset];
    let mut v2 = vec![0u8; n + offset];
    b.bytes = n as u64;
    b.iter(|| {
-        let src: &[u8] = black_box(&v1);
-        let dst: &mut [u8] = black_box(&mut v2);
        let src: &[u8] = black_box(&v1[offset..]);
        let dst: &mut [u8] = black_box(&mut v2[offset..]);
        dst.copy_from_slice(src);
    })
}

-fn memcpy_rust(b: &mut Bencher, n: usize) {
-    let v1 = vec![1u8; n];
-    let mut v2 = vec![0u8; n];
fn memcpy_rust(b: &mut Bencher, n: usize, offset: usize) {
    let v1 = vec![1u8; n + offset];
    let mut v2 = vec![0u8; n + offset];
    b.bytes = n as u64;
    b.iter(|| {
-        let src: &[u8] = black_box(&v1);
-        let dst: &mut [u8] = black_box(&mut v2);
        let src: &[u8] = black_box(&v1[offset..]);
        let dst: &mut [u8] = black_box(&mut v2[offset..]);
        unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
    })
}

-fn memset_builtin(b: &mut Bencher, n: usize) {
-    let mut v1 = vec![0u8; n];
fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
    let mut v1 = vec![0u8; n + offset];
    b.bytes = n as u64;
    b.iter(|| {
-        let dst: &mut [u8] = black_box(&mut v1);
        let dst: &mut [u8] = black_box(&mut v1[offset..]);
        let val: u8 = black_box(27);
        for b in dst {
            *b = val;
        }
    })
}

-fn memset_rust(b: &mut Bencher, n: usize) {
-    let mut v1 = vec![0u8; n];
fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
    let mut v1 = vec![0u8; n + offset];
    b.bytes = n as u64;
    b.iter(|| {
-        let dst: &mut [u8] = black_box(&mut v1);
        let dst: &mut [u8] = black_box(&mut v1[offset..]);
        let val = black_box(27);
        unsafe { memset(dst.as_mut_ptr(), val, n) }
    })
@@ -95,36 +95,68 @@ fn memmove_rust(b: &mut Bencher, n: usize) {

#[bench]
fn memcpy_builtin_4096(b: &mut Bencher) {
-    memcpy_builtin(b, 4096)
    memcpy_builtin(b, 4096, 0)
}
#[bench]
fn memcpy_rust_4096(b: &mut Bencher) {
-    memcpy_rust(b, 4096)
    memcpy_rust(b, 4096, 0)
}
#[bench]
fn memcpy_builtin_1048576(b: &mut Bencher) {
-    memcpy_builtin(b, 1048576)
    memcpy_builtin(b, 1048576, 0)
}
#[bench]
fn memcpy_rust_1048576(b: &mut Bencher) {
-    memcpy_rust(b, 1048576)
    memcpy_rust(b, 1048576, 0)
}
#[bench]
fn memcpy_builtin_4096_offset(b: &mut Bencher) {
    memcpy_builtin(b, 4096, 65)
}
#[bench]
fn memcpy_rust_4096_offset(b: &mut Bencher) {
    memcpy_rust(b, 4096, 65)
}
#[bench]
fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
    memcpy_builtin(b, 1048576, 65)
}
#[bench]
fn memcpy_rust_1048576_offset(b: &mut Bencher) {
    memcpy_rust(b, 1048576, 65)
}

#[bench]
fn memset_builtin_4096(b: &mut Bencher) {
-    memset_builtin(b, 4096)
    memset_builtin(b, 4096, 0)
}
#[bench]
fn memset_rust_4096(b: &mut Bencher) {
-    memset_rust(b, 4096)
    memset_rust(b, 4096, 0)
}
#[bench]
fn memset_builtin_1048576(b: &mut Bencher) {
-    memset_builtin(b, 1048576)
    memset_builtin(b, 1048576, 0)
}
#[bench]
fn memset_rust_1048576(b: &mut Bencher) {
-    memset_rust(b, 1048576)
    memset_rust(b, 1048576, 0)
}
#[bench]
fn memset_builtin_4096_offset(b: &mut Bencher) {
    memset_builtin(b, 4096, 65)
}
#[bench]
fn memset_rust_4096_offset(b: &mut Bencher) {
    memset_rust(b, 4096, 65)
}
#[bench]
fn memset_builtin_1048576_offset(b: &mut Bencher) {
    memset_builtin(b, 1048576, 65)
}
#[bench]
fn memset_rust_1048576_offset(b: &mut Bencher) {
    memset_rust(b, 1048576, 65)
}

#[bench]
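The new _offset benchmarks pass offset = 65 so that, for a typically aligned allocation, the slices handed to the mem functions start one byte past a 64-byte boundary, exercising the unaligned head and tail handling. A sketch of why that offset misaligns the pointers (this assumes the allocator returns word-aligned storage for these buffers, which is usual but not guaranteed):

fn main() {
    let v = vec![0u8; 4096 + 65];
    let base = v.as_ptr() as usize;
    let offset = v[65..].as_ptr() as usize;
    // With a word-aligned base, offset % 8 == 1, so the qword loops can
    // never start on a qword boundary and must handle a ragged head/tail.
    println!("base % 8 = {}, (base + 65) % 8 = {}", base % 8, offset % 8);
}

Since these benchmarks use the unstable test crate, they would normally run on a nightly toolchain (e.g. cargo +nightly bench, an assumed invocation).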
