Skip to content

Commit

Permalink
Add Hasher write_x primitives (#42)
Browse files Browse the repository at this point in the history
* Add specialized write_u32

* Fix hashset benchmark accuracy

* Implement all types

* Fix build

* Fix avx2

* Add mandatory 0 length check to avoid SIGSEGV and add test

* Version 2.3.0

---------

Co-authored-by: Olivier Giniaux <[email protected]>
  • Loading branch information
ogxd and ogxd authored Dec 10, 2023
1 parent e67b860 commit 8e109d9
Show file tree
Hide file tree
Showing 8 changed files with 228 additions and 42 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "gxhash"
authors = ["Olivier Giniaux"]
version = "2.2.5"
version = "2.3.0"
edition = "2021"
description = "GxHash non-cryptographic algorithm"
license = "MIT"
Expand Down
60 changes: 29 additions & 31 deletions benches/hashset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,68 +3,66 @@ use criterion::{criterion_group, criterion_main, Criterion};
use fnv::FnvHashSet;
use gxhash::*;
use twox_hash::xxh3;
use std::hash::Hash;
use std::collections::HashSet;
use std::hash::{BuildHasherDefault, BuildHasher};

fn hashmap_insertion(c: &mut Criterion) {

// Short keys
benchmark_for_string(c, "gxhash");

// Medium keys
benchmark_for_string(c, "https://github.com/ogxd/gxhash");

// Long keys
benchmark_for_string(c, "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.");

// Very long keys
benchmark_for_string(c, "Lorem ipsum dolor sit amet. Aut maxime voluptas ab quae explicabo et odio repellendus sed excepturi laboriosam? Ut molestiae obcaecati aut labore voluptates sed voluptatem voluptas non omnis harum et harum impedit ea eligendi autem id magni modi. Quo quam velit et error voluptas ut beatae repellendus et aspernatur incidunt hic veritatis accusamus sed autem modi cum error rerum. Sit perspiciatis consequuntur est perferendis veritatis et velit illum? At illo dolorum et voluptas nihil in voluptatum quas non quidem eveniet vel modi odit et sint nesciunt. Eos dicta consequuntur et sunt animi qui porro accusantium sed nisi voluptatum sed consectetur quibusdam ut ipsum mollitia. Et cupiditate iure aut omnis quia aut necessitatibus illum qui voluptas eius ut nihil laboriosam sit voluptatibus voluptas et galisum libero. Ut explicabo odit et adipisci accusantium ut officiis obcaecati. Eum pariatur sunt et autem neque ut eligendi autem. Qui voluptas Quis ut ratione officiis et placeat repudiandae sed tempora vitae At maxime quidem vel iure distinctio. Et doloremque esse ex eius voluptas id voluptatem recusandae qui illum quia ut consectetur quibusdam ea nisi accusamus!");
/// Registers the `HashSet` lookup benchmarks with Criterion.
///
/// Each call hands `benchmark` a group label and a sample key: three integer
/// widths (`u32`, `u64`, `u128`) followed by owned strings of increasing length.
fn hashset_contains(c: &mut Criterion) {
// Integer keys
benchmark(c, "u32", 42u32);
benchmark(c, "u64", 42u64);
benchmark(c, "u128", 42u128);
// String keys, from a short identifier up to a multi-sentence paragraph
benchmark(c, "small string", "gxhash".to_owned());
benchmark(c, "medium string","https://github.com/ogxd/gxhash".to_owned());
benchmark(c, "large string","Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.".to_owned());
benchmark(c, "huge string", "Lorem ipsum dolor sit amet. Aut maxime voluptas ab quae explicabo et odio repellendus sed excepturi laboriosam? Ut molestiae obcaecati aut labore voluptates sed voluptatem voluptas non omnis harum et harum impedit ea eligendi autem id magni modi. Quo quam velit et error voluptas ut beatae repellendus et aspernatur incidunt hic veritatis accusamus sed autem modi cum error rerum. Sit perspiciatis consequuntur est perferendis veritatis et velit illum? At illo dolorum et voluptas nihil in voluptatum quas non quidem eveniet vel modi odit et sint nesciunt. Eos dicta consequuntur et sunt animi qui porro accusantium sed nisi voluptatum sed consectetur quibusdam ut ipsum mollitia. Et cupiditate iure aut omnis quia aut necessitatibus illum qui voluptas eius ut nihil laboriosam sit voluptatibus voluptas et galisum libero. Ut explicabo odit et adipisci accusantium ut officiis obcaecati. Eum pariatur sunt et autem neque ut eligendi autem. Qui voluptas Quis ut ratione officiis et placeat repudiandae sed tempora vitae At maxime quidem vel iure distinctio. Et doloremque esse ex eius voluptas id voluptatem recusandae qui illum quia ut consectetur quibusdam ea nisi accusamus!".to_owned());
}

fn benchmark_for_string(c: &mut Criterion, string: &str) {
let mut group = c.benchmark_group(format!("HashSet<&str[{}]>", string.len()));
fn benchmark<T>(c: &mut Criterion, name: &str, value: T)
where T: Eq+PartialEq+Hash+Default
{
let mut group = c.benchmark_group(format!("HashSet/{}", name));

let mut set = HashSet::<String>::new();
let mut set = HashSet::<T>::new();
group.bench_function("Default Hasher", |b| {
iterate(b, string, &mut set);
iterate(b, &value, &mut set);
});

let mut set: HashSet::<String, GxBuildHasher> = GxHashSet::<String>::default();
let mut set: HashSet::<T, GxBuildHasher> = GxHashSet::<T>::default();
group.bench_function("GxHash", |b| {
iterate(b, string, &mut set);
iterate(b, &value, &mut set);
});

let mut set = AHashSet::<String>::default();
let mut set = AHashSet::<T>::default();
group.bench_function("AHash", |b| {
iterate(b, string, &mut set);
iterate(b, &value, &mut set);
});

let mut set = HashSet::<String, BuildHasherDefault<xxh3::Hash64>>::default();
let mut set = HashSet::<T, BuildHasherDefault<xxh3::Hash64>>::default();
group.bench_function("XxHash", |b| {
iterate(b, string, &mut set);
iterate(b, &value, &mut set);
});

let mut set = FnvHashSet::<String>::default();
let mut set = FnvHashSet::<T>::default();
group.bench_function("FNV-1a", |b| {
iterate(b, string, &mut set);
iterate(b, &value, &mut set);
});

group.finish();
}

#[inline(never)]
fn iterate<T>(b: &mut criterion::Bencher<'_>, string: &str, set: &mut HashSet<String, T>)
where T: BuildHasher
fn iterate<T, B>(b: &mut criterion::Bencher<'_>, value: &T, set: &mut HashSet<T, B>)
where B: BuildHasher, T: Eq+PartialEq+Hash+Default
{
// If hashmap is empty, it may skip hashing the key and simply return false
// So we add a single value to prevent this optimization
set.insert("some text".to_string());
set.insert(T::default());
b.iter(|| {
// We intentionally check on a string that is not present, otherwise there will be an
// We intentionally check on a value that is not present, otherwise there will be an
// additional equality check performed, diluting the hashing time and biasing the benchmark.
set.contains(string)
set.contains(criterion::black_box(value))
});
}

criterion_group!(benches, hashmap_insertion);
criterion_group!(benches, hashset_contains);
criterion_main!(benches);
10 changes: 5 additions & 5 deletions benches/throughput_criterion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ fn benchmark<F>(c: &mut BenchmarkGroup<WallTime>, data: &[u8], name: &str, deleg
let slice = &data[0..len]; // Aligned
// let slice = &data[1..len]; // Unaligned
c.bench_with_input(BenchmarkId::new(name, len), slice, |bencher, input| {
bencher.iter(|| delegate(input, 0))
bencher.iter(|| delegate(criterion::black_box(input), criterion::black_box(42)))
});
}
}
Expand All @@ -45,13 +45,13 @@ fn benchmark_all(c: &mut Criterion) {

// GxHash
let algo_name = if cfg!(feature = "avx2") { "gxhash-avx2" } else { "gxhash" };
benchmark(&mut group, slice, algo_name, |data: &[u8], _: i32| -> u64 {
gxhash64(data, 0)
benchmark(&mut group, slice, algo_name, |data: &[u8], seed: i32| -> u64 {
gxhash64(data, seed as i64)
});

// AHash
let ahash_hasher = ahash::RandomState::with_seeds(0, 0, 0, 0);
benchmark(&mut group, slice, "ahash", |data: &[u8], _: i32| -> u64 {
benchmark(&mut group, slice, "ahash", |data: &[u8], seed: i32| -> u64 {
let ahash_hasher = ahash::RandomState::with_seeds(seed as u64, 0, 0, 0);
ahash_hasher.hash_one(data)
});

Expand Down
9 changes: 5 additions & 4 deletions src/gxhash/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State {
pub(crate) unsafe fn compress_all(input: &[u8]) -> State {

let len = input.len();

if len == 0 {
return create_empty();
}

let mut ptr = input.as_ptr() as *const State;

if len <= VECTOR_SIZE {
Expand Down Expand Up @@ -108,10 +113,6 @@ pub(crate) unsafe fn compress_all(input: &[u8]) -> State {
// Fast path when input length > 32 and <= 48
load_unaligned!(ptr, v0, v1);
compress(hash_vector, compress(v0, v1))
} else if len <= VECTOR_SIZE * 4 {
// Fast path when input length > 48 and <= 64
load_unaligned!(ptr, v0, v1, v2);
compress(hash_vector, compress(compress(v0, v1), v2))
} else {
// Input message is large and we can use the high ILP loop
compress_many(ptr, hash_vector, remaining_bytes)
Expand Down
52 changes: 52 additions & 0 deletions src/gxhash/platform/arm_128.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,55 @@ pub unsafe fn finalize(hash: State) -> State {

ReinterpretUnion { uint8: hash }.int8
}

// Integer -> State loaders (NEON).
//
// The signed loaders hold the actual intrinsic calls. Each unsigned loader
// forwards to its signed sibling: an `as` cast between same-width integers
// keeps the bit pattern, so the resulting vector is bit-for-bit the same.

/// Duplicates `x` into every 8-bit lane of the state vector.
#[inline(always)]
pub unsafe fn load_i8(x: i8) -> State {
    vdupq_n_s8(x)
}

/// Duplicates `x` into every 16-bit lane, viewed as 8-bit lanes.
#[inline(always)]
pub unsafe fn load_i16(x: i16) -> State {
    let lanes = vdupq_n_s16(x);
    vreinterpretq_s8_s16(lanes)
}

/// Duplicates `x` into every 32-bit lane, viewed as 8-bit lanes.
#[inline(always)]
pub unsafe fn load_i32(x: i32) -> State {
    let lanes = vdupq_n_s32(x);
    vreinterpretq_s8_s32(lanes)
}

/// Duplicates `x` into both 64-bit lanes, viewed as 8-bit lanes.
#[inline(always)]
pub unsafe fn load_i64(x: i64) -> State {
    let lanes = vdupq_n_s64(x);
    vreinterpretq_s8_s64(lanes)
}

/// Loads the 16 bytes of `x` (in memory order) as the state vector.
#[inline(always)]
pub unsafe fn load_i128(x: i128) -> State {
    // Read straight from the by-value copy of `x` living in this frame.
    vld1q_s8((&x as *const i128).cast::<i8>())
}

/// Unsigned counterpart of [`load_i8`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u8(x: u8) -> State {
    load_i8(x as i8)
}

/// Unsigned counterpart of [`load_i16`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u16(x: u16) -> State {
    load_i16(x as i16)
}

/// Unsigned counterpart of [`load_i32`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u32(x: u32) -> State {
    load_i32(x as i32)
}

/// Unsigned counterpart of [`load_i64`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u64(x: u64) -> State {
    load_i64(x as i64)
}

/// Unsigned counterpart of [`load_i128`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u128(x: u128) -> State {
    load_i128(x as i128)
}
52 changes: 52 additions & 0 deletions src/gxhash/platform/x86_128.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,55 @@ pub unsafe fn finalize(hash: State) -> State {

hash
}

// Integer -> State loaders (128-bit x86 vectors).
//
// The signed loaders hold the actual intrinsic calls. Each unsigned loader
// forwards to its signed sibling: an `as` cast between same-width integers
// keeps the bit pattern, so the resulting vector is bit-for-bit the same.

/// Broadcasts `x` into every 8-bit lane of the state vector.
#[inline(always)]
pub unsafe fn load_i8(x: i8) -> State {
    _mm_set1_epi8(x)
}

/// Broadcasts `x` into every 16-bit lane of the state vector.
#[inline(always)]
pub unsafe fn load_i16(x: i16) -> State {
    _mm_set1_epi16(x)
}

/// Broadcasts `x` into every 32-bit lane of the state vector.
#[inline(always)]
pub unsafe fn load_i32(x: i32) -> State {
    _mm_set1_epi32(x)
}

/// Broadcasts `x` into both 64-bit lanes of the state vector.
#[inline(always)]
pub unsafe fn load_i64(x: i64) -> State {
    _mm_set1_epi64x(x)
}

/// Loads the 16 bytes of `x` (in memory order) as the state vector.
#[inline(always)]
pub unsafe fn load_i128(x: i128) -> State {
    // Unaligned load from the by-value copy of `x` living in this frame.
    _mm_loadu_si128((&x as *const i128).cast::<State>())
}

/// Unsigned counterpart of [`load_i8`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u8(x: u8) -> State {
    load_i8(x as i8)
}

/// Unsigned counterpart of [`load_i16`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u16(x: u16) -> State {
    load_i16(x as i16)
}

/// Unsigned counterpart of [`load_i32`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u32(x: u32) -> State {
    load_i32(x as i32)
}

/// Unsigned counterpart of [`load_i64`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u64(x: u64) -> State {
    load_i64(x as i64)
}

/// Unsigned counterpart of [`load_i128`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u128(x: u128) -> State {
    load_i128(x as i128)
}
54 changes: 54 additions & 0 deletions src/gxhash/platform/x86_256.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,57 @@ pub unsafe fn finalize(hash: State) -> State {
let permuted = _mm256_permute2x128_si256(hash, hash, 0x21);
_mm256_xor_si256(hash, permuted)
}

// Integer -> State loaders (256-bit x86 vectors).
//
// The signed loaders hold the actual intrinsic calls. Each unsigned loader
// forwards to its signed sibling: an `as` cast between same-width integers
// keeps the bit pattern, so the resulting vector is bit-for-bit the same.

/// Broadcasts `x` into every 8-bit lane of the state vector.
#[inline(always)]
pub unsafe fn load_i8(x: i8) -> State {
    _mm256_set1_epi8(x)
}

/// Broadcasts `x` into every 16-bit lane of the state vector.
#[inline(always)]
pub unsafe fn load_i16(x: i16) -> State {
    _mm256_set1_epi16(x)
}

/// Broadcasts `x` into every 32-bit lane of the state vector.
#[inline(always)]
pub unsafe fn load_i32(x: i32) -> State {
    _mm256_set1_epi32(x)
}

/// Broadcasts `x` into every 64-bit lane of the state vector.
#[inline(always)]
pub unsafe fn load_i64(x: i64) -> State {
    _mm256_set1_epi64x(x)
}

/// Loads the 16 bytes of `x` and repeats them in both 128-bit halves.
#[inline(always)]
pub unsafe fn load_i128(x: i128) -> State {
    // Unaligned 128-bit load from the by-value copy of `x` in this frame,
    // then the same half is placed in the high and the low position.
    let half = _mm_loadu_si128((&x as *const i128).cast::<__m128i>());
    _mm256_set_m128i(half, half)
}

/// Unsigned counterpart of [`load_i8`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u8(x: u8) -> State {
    load_i8(x as i8)
}

/// Unsigned counterpart of [`load_i16`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u16(x: u16) -> State {
    load_i16(x as i16)
}

/// Unsigned counterpart of [`load_i32`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u32(x: u32) -> State {
    load_i32(x as i32)
}

/// Unsigned counterpart of [`load_i64`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u64(x: u64) -> State {
    load_i64(x as i64)
}

/// Unsigned counterpart of [`load_i128`]; same bits, same result.
#[inline(always)]
pub unsafe fn load_u128(x: u128) -> State {
    load_i128(x as i128)
}
31 changes: 30 additions & 1 deletion src/hasher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,17 @@ impl GxHasher {
}
}

/// Generates one integer-writing method for a type that has a `state` field.
///
/// * `$method` - name of the generated method (e.g. `write_u32`)
/// * `$int`    - integer type accepted by that method
/// * `$loader` - callable turning a `$int` into the value fed to `compress_fast`
///
/// The generated method folds the loaded value into `self.state` with
/// `compress_fast`, passing the loaded value first and the current state second.
///
/// NOTE(review): being named `write`, this macro shadows the standard `write!`
/// for the rest of the module after this definition.
macro_rules! write {
    ($method:ident, $int:ty, $loader:expr) => {
        #[inline]
        fn $method(&mut self, value: $int) {
            // NOTE(review): soundness depends on the contracts of the loader and
            // of `compress_fast`, neither of which is visible from here.
            let loaded = unsafe { $loader(value) };
            self.state = unsafe { compress_fast(loaded, self.state) };
        }
    };
}

impl Hasher for GxHasher {
#[inline]
fn finish(&self) -> u64 {
Expand All @@ -104,9 +115,19 @@ impl Hasher for GxHasher {

#[inline]
fn write(&mut self, bytes: &[u8]) {
// Improvement: only compress at this stage and finalize in finish
self.state = unsafe { compress_fast(compress_all(bytes), self.state) };
}

write!(write_u8, u8, load_u8);
write!(write_u16, u16, load_u16);
write!(write_u32, u32, load_u32);
write!(write_u64, u64, load_u64);
write!(write_u128, u128, load_u128);
write!(write_i8, i8, load_i8);
write!(write_i16, i16, load_i16);
write!(write_i32, i32, load_i32);
write!(write_i64, i64, load_i64);
write!(write_i128, i128, load_i128);
}

/// A builder for building GxHasher with randomized seeds by default, for improved DOS resistance.
Expand Down Expand Up @@ -160,6 +181,14 @@ mod tests {
assert!(hashset.insert("bye"));
}

/// Inserting an empty byte vector must succeed rather than crash.
#[test]
fn hasher_handles_empty_inputs() {
    // An empty `Vec<u8>` owns no allocation: the pointer it hands out is a
    // non-null placeholder (address 1, per the original note), so hashing it
    // must not read through that pointer or we would SIGSEGV.
    let empty: Vec<u8> = Vec::new();
    let mut set = GxHashSet::<Vec<u8>>::default();
    let newly_inserted = set.insert(empty);
    assert!(newly_inserted);
}

// This is important for DOS resistance
#[test]
fn gxhashset_uses_default_gxhasherbuilder() {
Expand Down

0 comments on commit 8e109d9

Please sign in to comment.