From e270f6fcd8602a4380f1d9a09f59da8bbf59cbfc Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 16:48:14 +0200 Subject: [PATCH 1/9] Go for inline fallthrough --- src/gxhash/mod.rs | 154 +++++++++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 70 deletions(-) diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index c0ac0c8..c51aaab 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -4,7 +4,7 @@ mod platform; pub use platform::*; -#[inline] // To be disabled when profiling +#[inline(always)] // To be disabled when profiling pub fn gxhash0_32(input: &[u8], seed: i32) -> u32 { unsafe { let p = &gxhash::<0>(input, seed) as *const state as *const u32; @@ -12,7 +12,7 @@ pub fn gxhash0_32(input: &[u8], seed: i32) -> u32 { } } -#[inline] // To be disabled when profiling +#[inline(always)] // To be disabled when profiling pub fn gxhash0_64(input: &[u8], seed: i32) -> u64 { unsafe { let p = &gxhash::<0>(input, seed) as *const state as *const u64; @@ -20,7 +20,7 @@ pub fn gxhash0_64(input: &[u8], seed: i32) -> u64 { } } -#[inline] // To be disabled when profiling +#[inline(always)] // To be disabled when profiling pub fn gxhash1_32(input: &[u8], seed: i32) -> u32 { unsafe { let p = &gxhash::<1>(input, seed) as *const state as *const u32; @@ -28,7 +28,7 @@ pub fn gxhash1_32(input: &[u8], seed: i32) -> u32 { } } -#[inline] // To be disabled when profiling +#[inline(always)] // To be disabled when profiling pub fn gxhash1_64(input: &[u8], seed: i32) -> u64 { unsafe { let p = &gxhash::<1>(input, seed) as *const state as *const u64; @@ -36,90 +36,104 @@ pub fn gxhash1_64(input: &[u8], seed: i32) -> u64 { } } -#[inline] +const VECTOR_SIZE: isize = std::mem::size_of::<state>() as isize; + + +#[inline(always)] +unsafe fn compress<const N: usize>(a: state, b: state) -> state { + match N { + 0 => compress_0(a, b), + 1 => compress_1(a, b), + _ => compress_1(a, b) + } +} + +#[inline(always)] fn gxhash<const N: usize>(input: &[u8], seed: i32) -> state { unsafe { - const VECTOR_SIZE: isize = std::mem::size_of::<state>() as isize; - let len: isize = input.len() as isize; let p = input.as_ptr() as *const i8; - let mut v = p as *const state; + let v = p as *const state; + + let hash_vector = if len <= 16 { + get_partial(v, len) + } else if len < 128 { + gxhash_process_1::<N>(v, create_empty(), len) + } else { + gxhash_process_8::<N>(v, create_empty(), len) + }; - // Quick exit - if len <= VECTOR_SIZE { - let partial_vector = get_partial(v, len); - return finalize(partial_vector, seed); - } + finalize(hash_vector, seed) + } } - let mut end_address: usize; - let mut remaining_blocks_count: isize = len / VECTOR_SIZE; - let mut hash_vector: state = create_empty(); +macro_rules! load_unaligned { + ($ptr:ident, $($var:ident),+) => { + $( + #[allow(unused_mut)] + let mut $var = load_unaligned($ptr); + $ptr = $ptr.offset(1); + )+ + }; +} - // Choose compression function depending on version. - // Lower is faster, higher is more collision resistant. - let c = match N { - 0 => compress_0, - 1 => compress_1, - _ => compress_1 - }; +#[inline(always)] +unsafe fn gxhash_process_8<const N: usize>(mut v: *const state, hash_vector: state, remaining_bytes: isize) -> state { - macro_rules! load_unaligned { - ($($var:ident),+) => { - $( - #[allow(unused_mut)] - let mut $var = load_unaligned(v); - v = v.offset(1); - )+ - }; - } + const UNROLL_FACTOR: isize = 8; - const UNROLL_FACTOR: isize = 8; - if len >= VECTOR_SIZE * UNROLL_FACTOR { + let unrollable_blocks_count: isize = remaining_bytes / (VECTOR_SIZE * UNROLL_FACTOR) * UNROLL_FACTOR; + let end_address = v.offset(unrollable_blocks_count as isize) as usize; - let unrollable_blocks_count: isize = (len / (VECTOR_SIZE * UNROLL_FACTOR)) * UNROLL_FACTOR; - end_address = v.offset(unrollable_blocks_count) as usize; - - load_unaligned!(s0, s1, s2, s3, s4, s5, s6, s7); - - while (v as usize) < end_address { - - load_unaligned!(v0, v1, v2, v3, v4, v5, v6, v7); - - prefetch(v); - - s0 = c(s0, v0); - s1 = c(s1, v1); - s2 = c(s2, v2); - s3 = c(s3, v3); - s4 = c(s4, v4); - s5 = c(s5, v5); - s6 = c(s6, v6); - s7 = c(s7, v7); - } + load_unaligned!(v, s0, s1, s2, s3, s4, s5, s6, s7); + + while (v as usize) < end_address { - let a = c(c(s0, s1), c(s2, s3)); - let b = c(c(s4, s5), c(s6, s7)); - hash_vector = c(a, b); + load_unaligned!(v, v0, v1, v2, v3, v4, v5, v6, v7); + + prefetch(v); + + s0 = compress::<N>(s0, v0); + s1 = compress::<N>(s1, v1); + s2 = compress::<N>(s2, v2); + s3 = compress::<N>(s3, v3); + s4 = compress::<N>(s4, v4); + s5 = compress::<N>(s5, v5); + s6 = compress::<N>(s6, v6); + s7 = compress::<N>(s7, v7); + } - remaining_blocks_count -= unrollable_blocks_count; - } + let a = compress::<N>(compress::<N>(s0, s1), compress::<N>(s2, s3)); + let b = compress::<N>(compress::<N>(s4, s5), compress::<N>(s6, s7)); + let hash_vector = compress::<N>(hash_vector, compress::<N>(a, b)); - end_address = v.offset(remaining_blocks_count) as usize; + gxhash_process_1::<N>(v, hash_vector, remaining_bytes - unrollable_blocks_count * VECTOR_SIZE) +} - while likely((v as usize) < end_address) { - load_unaligned!(v0); - hash_vector = c(hash_vector, v0); - } +#[inline(always)] +unsafe fn gxhash_process_1<const N: usize>(mut v: *const state, hash_vector: state, remaining_bytes: isize) -> state { + + let end_address = v.offset((remaining_bytes / VECTOR_SIZE) as isize) as usize; - let remaining_bytes = len & (VECTOR_SIZE - 1); - if likely(remaining_bytes > 0) { - let partial_vector = get_partial(v, remaining_bytes); - hash_vector = c(hash_vector, partial_vector); - } + let mut hash_vector = hash_vector; + while (v as usize) < end_address { + load_unaligned!(v, v0); + hash_vector = compress::<N>(hash_vector, v0); + } - finalize(hash_vector, seed) + let remaining_bytes = remaining_bytes & (VECTOR_SIZE - 1); + if remaining_bytes > 0 { + hash_vector = gxhash_process_last::<N>(v, hash_vector, remaining_bytes); } + hash_vector +} + +#[inline(always)] +unsafe fn gxhash_process_last<const N: usize>(v: *const state, hash_vector: state, remaining_bytes: isize) -> state { + + let partial_vector = get_partial(v, remaining_bytes); + compress::<N>(hash_vector, partial_vector) } #[cfg(test)]
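Note on patch 1: the old code selected the compression function at runtime through a local binding (`let c = match N {...}`); the new `compress::<N>` wrapper moves that choice to compile time. Because `N` is a const generic, the `match N` is resolved during monomorphization and the optimizer keeps exactly one arm, so every `compress::<N>` call site becomes a direct call to `compress_0` or `compress_1`, which is what lets the whole fallthrough chain stay inlined. A minimal, self-contained sketch of the pattern (illustrative names only, not the gxhash code):

#[inline(always)]
fn mix<const N: usize>(a: u64, b: u64) -> u64 {
    // `N` is known at compile time, so each monomorphized copy of `mix`
    // is compiled with exactly one arm of this match; no runtime branch remains.
    match N {
        0 => a ^ b,                                    // fast variant
        _ => (a ^ b).wrapping_mul(0x9E3779B97F4A7C15), // stronger variant
    }
}

fn hash<const N: usize>(words: &[u64]) -> u64 {
    words.iter().fold(0, |acc, &w| mix::<N>(acc, w))
}

fn main() {
    let words = [1u64, 2, 3];
    println!("{:x} {:x}", hash::<0>(&words), hash::<1>(&words));
}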
From 375f205dc59417442404fcc2db3f379d15253921 Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 17:51:35 +0200 Subject: [PATCH 2/9] Parametrize sparse collision test with rstest crate --- Cargo.toml | 1 + src/gxhash/mod.rs | 36 ++++++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ba71849..604e391 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ debug = true rand = "0.8" [dev-dependencies] +rstest = "0.18.2" criterion = { version = "0.5.1" } lazy_static = { version = "1.3" } ahash = "0.8.3" diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index c51aaab..1a8d276 
100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -38,7 +38,6 @@ pub fn gxhash1_64(input: &[u8], seed: i32) -> u64 { const VECTOR_SIZE: isize = std::mem::size_of::<state>() as isize; - #[inline(always)] unsafe fn compress<const N: usize>(a: state, b: state) -> state { match N { @@ -141,6 +140,7 @@ mod tests { use super::*; use rand::Rng; + use rstest::rstest; #[test] fn all_blocks_are_consumed() { @@ -174,13 +174,23 @@ } } - #[test] + #[rstest] + #[case(16, 9)] + #[case(24, 8)] + #[case(32, 7)] + #[case(40, 6)] + #[case(56, 5)] + #[case(72, 5)] + #[case(96, 4)] + #[case(160, 4)] + #[case(256, 3)] + #[case(512, 3)] + #[case(2048, 2)] // Test collisions for all possible inputs of size n bits with m bits set - fn test_collisions_bits() { - let mut bytes = [0u8; 120]; - let bits_to_set = 2; + // Equivalent to SMHasher "Sparse" test + fn test_collisions_bits(#[case] size_bits: usize, #[case] bits_to_set: usize) { + let mut bytes = vec![0u8; size_bits / 8]; - let n = bytes.len() * 8; let mut digits: Vec<usize> = vec![0; bits_to_set]; for i in 0..bits_to_set { @@ -188,7 +198,7 @@ } let mut i = 0; - let mut set = std::collections::HashSet::new(); + let mut set = ahash::AHashSet::new(); 'stop: loop { @@ -199,11 +209,7 @@ } i += 1; - set.insert(gxhash0_64(&bytes, 0)); - // for &byte in bytes.iter() { - // print!("{:08b}", byte); - // } - // println!(); + set.insert(gxhash1_64(&bytes, 0)); // Reset bits for d in digits.iter() { @@ -213,16 +219,18 @@ // Increment the rightmost digit for i in (0..bits_to_set).rev() { digits[i] += 1; - if digits[i] == n - bits_to_set + i + 1 { + if digits[i] == size_bits - bits_to_set + i + 1 { if i == 0 { break 'stop; } + // Reset digit. It will be set to an appropriate value after. digits[i] = 0; } else { break; } } + // Make sure digits are coherent for i in 1..bits_to_set { if digits[i] < digits[i - 1] { digits[i] = digits[i - 1] + 1; } } } - println!("count: {}, collisions: {}", i, i - set.len()); + println!("{}-bit keys with {} bits set. 
Combinations: {}, Collisions: {}", size_bits, bits_to_set, i, i - set.len()); assert_eq!(0, i - set.len(), "Collisions!"); }
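Note on patch 2: the parameterized test reproduces SMHasher's "Sparse" test: for each `(size_bits, bits_to_set)` case it hashes every `size_bits`-bit key with exactly `bits_to_set` bits set (C(n, k) keys in total) and asserts that no two hashes collide. The `digits` vector stores the k set-bit positions in increasing order and is advanced to the next lexicographic combination, odometer-style. A standalone sketch of that enumeration (a hypothetical helper, not code from the patch):

// Visit all k-combinations of positions 0..n in lexicographic order.
fn for_each_combination(n: usize, k: usize, mut visit: impl FnMut(&[usize])) {
    let mut digits: Vec<usize> = (0..k).collect(); // first combination: 0, 1, .., k-1
    loop {
        visit(&digits);
        // Find the rightmost digit that can still be incremented.
        let mut i = k;
        while i > 0 && digits[i - 1] == n - k + (i - 1) {
            i -= 1;
        }
        if i == 0 {
            return; // last combination reached
        }
        digits[i - 1] += 1;
        // Keep the remaining digits coherent (strictly increasing).
        for j in i..k {
            digits[j] = digits[j - 1] + 1;
        }
    }
}

fn main() {
    let mut count = 0u64;
    for_each_combination(16, 2, |_positions| count += 1);
    assert_eq!(count, 120); // C(16, 2) = 120 distinct 16-bit keys with 2 bits set
}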
From b57b78b411999c3a7bad62d599afd8984dd753dd Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 17:51:59 +0200 Subject: [PATCH 3/9] Remove C implementation --- c/gxhash.c | 136 ----------------------------------------------------- c/gxhash.h | 9 ---- 2 files changed, 145 deletions(-) delete mode 100644 c/gxhash.c delete mode 100644 c/gxhash.h diff --git a/c/gxhash.c b/c/gxhash.c deleted file mode 100644 index 2cd0762..0000000 --- a/c/gxhash.c +++ /dev/null @@ -1,136 +0,0 @@ -#include <stdint.h> -#include <arm_neon.h> -#include "gxhash.h" - -typedef int8x16_t state; - -union ReinterpretUnion { - int64x2_t int64; - int32x4_t int32; - uint32x4_t uint32; - int8x16_t int8; - uint8x16_t uint8; -}; - -static inline state create_empty() { - return vdupq_n_s8(0); -} - -static inline void prefetch(const state* p) { - // __pld(p); // Uncomment if needed -} - -static inline state load_unaligned(const state* p) { - return vld1q_s8((const int8_t*)p); -} - -static inline state get_partial(const state* p, int len) { - static const int8_t MASK[32] = { - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - }; - int8x16_t mask = vld1q_s8(&MASK[16 - len]); - return vandq_s8(load_unaligned(p), mask); - } - -static inline uint8x16_t aes_encrypt(uint8x16_t data, uint8x16_t keys) { - uint8x16_t encrypted = vaeseq_u8(data, vdupq_n_u8(0)); - uint8x16_t mixed = vaesmcq_u8(encrypted); - return veorq_u8(mixed, keys); -} - -static inline uint8x16_t aes_encrypt_last(uint8x16_t data, uint8x16_t keys) { - uint8x16_t encrypted = vaeseq_u8(data, vdupq_n_u8(0)); - return veorq_u8(encrypted, keys); -} - -static inline state compress(state a, state b) { - union ReinterpretUnion au = { .int8 = a }; - union ReinterpretUnion bu = { .int8 = b }; - union ReinterpretUnion result = { .uint8 = aes_encrypt_last(au.uint8, bu.uint8) }; - return result.int8; -} - -static inline uint64_t finalize(state hash) { - static const uint32_t salt1_data[4] = {0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85}; - static const uint32_t salt2_data[4] = {0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F}; - static const uint32_t salt3_data[4] = {0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32}; - - uint32x4_t salt1 = vld1q_u32(salt1_data); - uint32x4_t salt2 = vld1q_u32(salt2_data); - uint32x4_t salt3 = vld1q_u32(salt3_data); - - union ReinterpretUnion hash_u = { .int8 = hash }; - hash_u.uint8 = aes_encrypt(hash_u.uint8, vreinterpretq_u8_u32(salt1)); - hash_u.uint8 = aes_encrypt(hash_u.uint8, vreinterpretq_u8_u32(salt2)); - hash_u.uint8 = aes_encrypt_last(hash_u.uint8, vreinterpretq_u8_u32(salt3)); - - return *(uint64_t*)&hash_u.int8; -} - -uint64_t gxhash(const uint8_t* input, int len) { - const int VECTOR_SIZE = sizeof(state); - const state* p = (const state*)input; - const state* v = p; - const state* end_address; - int remaining_blocks_count = len / VECTOR_SIZE; - state hash_vector = create_empty(); - - const int UNROLL_FACTOR = 8; - if (len >= VECTOR_SIZE * UNROLL_FACTOR) { - int unrollable_blocks_count = (len / (VECTOR_SIZE * UNROLL_FACTOR)) * UNROLL_FACTOR; - end_address = v + unrollable_blocks_count; - - state s0 = load_unaligned(v++); - state s1 = load_unaligned(v++); - state s2 = load_unaligned(v++); - state s3 = load_unaligned(v++); - state s4 = load_unaligned(v++); - state s5 = load_unaligned(v++); - state s6 = load_unaligned(v++); - state s7 = load_unaligned(v++); - - while (v < end_address) { - state v0 = load_unaligned(v++); - state v1 = load_unaligned(v++); - state v2 = load_unaligned(v++); - state v3 = load_unaligned(v++); - state v4 = load_unaligned(v++); - state v5 = load_unaligned(v++); - state v6 = load_unaligned(v++); - state v7 = load_unaligned(v++); - - prefetch(v); - - s0 = compress(s0, v0); - s1 = compress(s1, v1); - s2 = compress(s2, v2); - s3 = compress(s3, v3); - s4 = compress(s4, v4); - s5 = compress(s5, v5); - s6 = compress(s6, v6); - s7 = compress(s7, v7); - } - - state a = compress(compress(s0, s1), compress(s2, s3)); - state b = compress(compress(s4, s5), compress(s6, s7)); - hash_vector = compress(a, b); - - remaining_blocks_count -= unrollable_blocks_count; - } - - end_address = v + remaining_blocks_count; - - while (v < end_address) { - state v0 = load_unaligned(v++); - hash_vector = compress(hash_vector, v0); - } - - int remaining_bytes = len % VECTOR_SIZE; - if (remaining_bytes > 0) { - state partial_vector = get_partial(v, remaining_bytes); - hash_vector = compress(hash_vector, partial_vector); - } - - return finalize(hash_vector); -} diff --git a/c/gxhash.h b/c/gxhash.h deleted file mode 100644 index 10c28ea..0000000 --- a/c/gxhash.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef GXHASH_H -#define GXHASH_H - -#include <stdint.h> - -// Function prototype for gxhash -uint64_t gxhash(const uint8_t* input, int len); - -#endif // GXHASH_H
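Note on patch 3: the deleted C port implements the same algorithm as the Rust version, so only one implementation is left to maintain. Its `get_partial` shows the partial-load trick shared by both: a 32-byte constant of sixteen 0xFF bytes followed by sixteen zero bytes, from which `vld1q_s8(&MASK[16 - len])` reads a 16-byte window containing exactly `len` set bytes, so out-of-range lanes are zeroed with a single AND. A portable sketch of that sliding-window masking, without intrinsics:

/// Keep the first `len` bytes of `block` (len <= 16) and zero the rest,
/// mirroring the C code's `vandq_s8(load_unaligned(p), vld1q_s8(&MASK[16 - len]))`.
fn mask_partial(block: [u8; 16], len: usize) -> [u8; 16] {
    const MASK: [u8; 32] = {
        let mut m = [0u8; 32];
        let mut i = 0;
        while i < 16 {
            m[i] = 0xFF; // first half ones, second half zeros
            i += 1;
        }
        m
    };
    let window = &MASK[16 - len..32 - len]; // exactly `len` ones, then zeros
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = block[i] & window[i];
    }
    out
}

fn main() {
    let masked = mask_partial([0xAB; 16], 3);
    assert_eq!(&masked[..3], &[0xAB; 3][..]);
    assert_eq!(&masked[3..], &[0u8; 13][..]);
}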
From c84191e677311b61b1f9e73a345dcdf3abfe5b63 Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 18:00:49 +0200 Subject: [PATCH 4/9] Small cleanup --- src/gxhash/mod.rs | 49 +++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index 1a8d276..12f4304 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -1,8 +1,5 @@ -use std::intrinsics::likely; - mod platform; - -pub use platform::*; +use platform::*; #[inline(always)] // To be disabled when profiling pub fn gxhash0_32(input: &[u8], seed: i32) -> u32 { @@ -51,16 +48,15 @@ unsafe fn compress<const N: usize>(a: state, b: state) -> state { fn gxhash<const N: usize>(input: &[u8], seed: i32) -> state { unsafe { let len: isize = input.len() as isize; + let ptr = input.as_ptr() as *const state; - let p = input.as_ptr() as *const i8; - let v = p as *const state; - - let hash_vector = if len <= 16 { - get_partial(v, len) - } else if len < 128 { - gxhash_process_1::<N>(v, create_empty(), len) + // Lower sizes first, as comparison/branching overhead will become negligible as input size grows. + let hash_vector = if len <= VECTOR_SIZE { + gxhash_process_last::<N>(ptr, create_empty(), len) + } else if len < VECTOR_SIZE * 8 { + gxhash_process_1::<N>(ptr, create_empty(), len) } else { - gxhash_process_8::<N>(v, create_empty(), len) + gxhash_process_8::<N>(ptr, create_empty(), len) }; finalize(hash_vector, seed) } } macro_rules! load_unaligned { ($ptr:ident, $($var:ident),+) => { $( #[allow(unused_mut)] let mut $var = load_unaligned($ptr); $ptr = $ptr.offset(1); )+ }; } #[inline(always)] -unsafe fn gxhash_process_8<const N: usize>(mut v: *const state, hash_vector: state, remaining_bytes: isize) -> state { +unsafe fn gxhash_process_8<const N: usize>(mut ptr: *const state, hash_vector: state, remaining_bytes: isize) -> state { const UNROLL_FACTOR: isize = 8; let unrollable_blocks_count: isize = remaining_bytes / (VECTOR_SIZE * UNROLL_FACTOR) * UNROLL_FACTOR; - let end_address = v.offset(unrollable_blocks_count as isize) as usize; + let end_address = ptr.offset(unrollable_blocks_count as isize) as usize; - load_unaligned!(v, s0, s1, s2, s3, s4, s5, s6, s7); + load_unaligned!(ptr, s0, s1, s2, s3, s4, s5, s6, s7); - while (v as usize) < end_address { + while (ptr as usize) < end_address { - load_unaligned!(v, v0, v1, v2, v3, v4, v5, v6, v7); + load_unaligned!(ptr, v0, v1, v2, v3, v4, v5, v6, v7); - prefetch(v); + prefetch(ptr); s0 = compress::<N>(s0, v0); s1 = compress::<N>(s1, v1); s2 = compress::<N>(s2, v2); s3 = compress::<N>(s3, v3); s4 = compress::<N>(s4, v4); s5 = compress::<N>(s5, v5); s6 = compress::<N>(s6, v6); s7 = compress::<N>(s7, v7); @@ -107,31 +103,30 @@ let a = compress::<N>(compress::<N>(s0, s1), compress::<N>(s2, s3)); let b = compress::<N>(compress::<N>(s4, s5), compress::<N>(s6, s7)); let hash_vector = compress::<N>(hash_vector, compress::<N>(a, b)); - gxhash_process_1::<N>(v, hash_vector, remaining_bytes - unrollable_blocks_count * VECTOR_SIZE) + gxhash_process_1::<N>(ptr, hash_vector, remaining_bytes - unrollable_blocks_count * VECTOR_SIZE) } #[inline(always)] -unsafe fn gxhash_process_1<const N: usize>(mut v: *const state, hash_vector: state, remaining_bytes: isize) -> state { +unsafe fn gxhash_process_1<const N: usize>(mut ptr: *const state, hash_vector: state, remaining_bytes: isize) -> state { - let end_address = v.offset((remaining_bytes / VECTOR_SIZE) as isize) as usize; + let end_address = ptr.offset((remaining_bytes / VECTOR_SIZE) as isize) as usize; let mut hash_vector = hash_vector; - while (v as usize) < end_address { - load_unaligned!(v, v0); + while (ptr as usize) < end_address { + load_unaligned!(ptr, v0); hash_vector = compress::<N>(hash_vector, v0); } let remaining_bytes = remaining_bytes & (VECTOR_SIZE - 1); if remaining_bytes > 0 { - hash_vector = gxhash_process_last::<N>(v, hash_vector, remaining_bytes); + hash_vector = gxhash_process_last::<N>(ptr, hash_vector, remaining_bytes); } hash_vector } #[inline(always)] -unsafe fn gxhash_process_last<const N: usize>(v: *const state, hash_vector: state, remaining_bytes: isize) -> state { - - let partial_vector = get_partial(v, remaining_bytes); +unsafe fn gxhash_process_last<const N: usize>(ptr: *const state, hash_vector: state, remaining_bytes: isize) -> state { + let partial_vector = get_partial(ptr, remaining_bytes); compress::<N>(hash_vector, partial_vector) }
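Note on patch 4: the `load_unaligned!` macro (kept from patch 1) takes the pointer identifier as its first argument, `ptr` here and `v` before, so the same macro works in any function regardless of what the cursor variable is called. Each listed variable becomes one load-and-advance pair. A standalone model of what `load_unaligned!(ptr, v0, v1)` expands to, with a plain integer standing in for the SIMD `state` type:

type State = u128; // stand-in for the SIMD vector type in this sketch

unsafe fn load_unaligned(p: *const State) -> State {
    p.read_unaligned()
}

fn main() {
    let data: [State; 2] = [1, 2];
    unsafe {
        let mut ptr = data.as_ptr();
        // Expansion of `load_unaligned!(ptr, v0, v1)`:
        #[allow(unused_mut)]
        let mut v0 = load_unaligned(ptr);
        ptr = ptr.offset(1); // advance by one whole vector
        #[allow(unused_mut)]
        let mut v1 = load_unaligned(ptr);
        ptr = ptr.offset(1);
        let _ = ptr; // the real macro leaves the cursor ready for the next load
        assert_eq!((v0, v1), (1, 2));
    }
}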
From b0f866b046ab48e83f70f629ec5651bb4ce71c7d Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Mon, 23 Oct 2023 00:21:40 +0200 Subject: [PATCH 5/9] Go for tmp construction --- src/gxhash/mod.rs | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index 12f4304..509bf45 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -53,6 +53,8 @@ fn gxhash<const N: usize>(input: &[u8], seed: i32) -> state { unsafe { // Lower sizes first, as comparison/branching overhead will become negligible as input size grows. let hash_vector = if len <= VECTOR_SIZE { gxhash_process_last::<N>(ptr, create_empty(), len) + } else if len <= VECTOR_SIZE * 2 { + gxhash_process_last::<N>(ptr.offset(1), compress::<N>(*ptr, create_empty()), len - VECTOR_SIZE) } else if len < VECTOR_SIZE * 8 { gxhash_process_1::<N>(ptr, create_empty(), len) } else { gxhash_process_8::<N>(ptr, create_empty(), len) }; @@ -80,28 +82,24 @@ unsafe fn gxhash_process_8<const N: usize>(mut ptr: *const state, hash_vector: s let unrollable_blocks_count: isize = remaining_bytes / (VECTOR_SIZE * UNROLL_FACTOR) * UNROLL_FACTOR; let end_address = ptr.offset(unrollable_blocks_count as isize) as usize; - - load_unaligned!(ptr, s0, s1, s2, s3, s4, s5, s6, s7); - + + let mut hash_vector = hash_vector; while (ptr as usize) < end_address { load_unaligned!(ptr, v0, v1, v2, v3, v4, v5, v6, v7); prefetch(ptr); - s0 = compress::<N>(s0, v0); - s1 = compress::<N>(s1, v1); - s2 = compress::<N>(s2, v2); - s3 = compress::<N>(s3, v3); - s4 = compress::<N>(s4, v4); - s5 = compress::<N>(s5, v5); - s6 = compress::<N>(s6, v6); - s7 = compress::<N>(s7, v7); - } + v0 = compress::<0>(v0, v1); + v0 = compress::<0>(v0, v2); + v0 = compress::<0>(v0, v3); + v0 = compress::<0>(v0, v4); + v0 = compress::<0>(v0, v5); + v0 = compress::<0>(v0, v6); + v0 = compress::<0>(v0, v7); - let a = compress::<N>(compress::<N>(s0, s1), compress::<N>(s2, s3)); - let b = compress::<N>(compress::<N>(s4, s5), compress::<N>(s6, s7)); - let hash_vector = compress::<N>(hash_vector, compress::<N>(a, b)); + hash_vector = compress::<N>(hash_vector, v0); + } gxhash_process_1::<N>(ptr, hash_vector, remaining_bytes - unrollable_blocks_count * VECTOR_SIZE) } From 846a12bd5d3fd20e11c7ed5758c4a8dab4ab08bf Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 23:24:32 +0200 Subject: [PATCH 6/9] Improve blocks consumed test --- src/gxhash/mod.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index 509bf45..0897436 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -135,9 +135,20 @@ mod tests { use rand::Rng; use rstest::rstest; - #[test] - fn all_blocks_are_consumed() { - let mut bytes = [42u8; 1200]; + #[rstest] + #[case(4)] + #[case(16)] + #[case(24)] + #[case(32)] + #[case(56)] + #[case(72)] + #[case(96)] + #[case(160)] + #[case(256)] + #[case(512)] + #[case(1200)] + fn all_blocks_are_consumed(#[case] size: usize) { + let mut bytes = vec![42u8; size]; let ref_hash = gxhash0_32(&bytes, 0);
From 7ccad8cffa8b55abee0b19d83d504fdad139d68f Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Wed, 25 Oct 2023 00:06:17 +0200 Subject: [PATCH 7/9] Add FFI dynamic library --- Cargo.toml | 3 +-- ffi/Cargo.toml | 12 ++++++++++++ ffi/src/lib.rs | 25 +++++++++++++++++++++++++ src/main.rs | 18 ------------------ 4 files changed, 38 insertions(+), 20 deletions(-) create mode 100644 ffi/Cargo.toml create mode 100644 ffi/src/lib.rs delete mode 100644 src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 604e391..78e5fdb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,9 @@ [package] name = "gxhash" +authors = ["Olivier Giniaux"] version = "0.1.0" edition = "2021" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [profile.release] debug = true diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml new file mode 100644 index 0000000..d744be9 --- /dev/null +++ b/ffi/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "gxhash_ffi" +authors = ["Olivier Giniaux"] +version = "0.1.0" +edition = "2021" + +[lib] +name = "gxhash" +crate-type = ["cdylib"] + +[dependencies] +gxhash = { path = "../", default-features = false } \ No newline at end of file diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs new file mode 100644 index 0000000..e271124 --- /dev/null +++ b/ffi/src/lib.rs @@ -0,0 +1,25 @@ +use core::slice; + +#[no_mangle] +pub unsafe extern "C" fn gxhash0_32(buf: *const (), len: usize, seed: i32) -> u32 { + let data: &[u8] = slice::from_raw_parts(buf as *const u8, len); + gxhash::gxhash0_32(data, seed) +} + +#[no_mangle] +pub unsafe extern "C" fn gxhash0_64(buf: *const (), len: usize, seed: i32) -> u64 { + let data: &[u8] = slice::from_raw_parts(buf as *const u8, len); + gxhash::gxhash0_64(data, seed) +} + +#[no_mangle] +pub unsafe extern "C" fn gxhash1_32(buf: *const (), len: usize, seed: i32) -> u32 { + let data: &[u8] = slice::from_raw_parts(buf as *const u8, len); + gxhash::gxhash1_32(data, seed) +} + +#[no_mangle] +pub unsafe extern "C" fn gxhash1_64(buf: *const (), len: usize, seed: i32) -> u64 { + let data: &[u8] = slice::from_raw_parts(buf as *const u8, len); + gxhash::gxhash1_64(data, seed) +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 26961b9..0000000 --- a/src/main.rs +++ /dev/null @@ -1,18 +0,0 @@ -use rand::Rng; - -use gxhash::*; - -fn main() { - - let mut rng = rand::thread_rng(); - let mut random_bytes = [0u8; 16384]; // Create an array of 16 bytes, initialized to 0 - rng.fill(&mut random_bytes[..]); // Fill the array with random bytes - - let mut sum: u32 = 0; - - for _ in 0..100_000_000 { - sum = sum.wrapping_add(gxhash0_32(&random_bytes, 0)); - } - - println!("{}", sum); -} \ No newline at end of file
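Note on patch 7: the `ffi` crate re-exports the four hashes with C linkage (`#[no_mangle] extern "C"`) and builds them into a `cdylib` named `gxhash`, so non-Rust callers can link against it. A hypothetical consumer declaring and calling two of the exports from Rust could look like this (the `link` name being resolvable and the use of `*const u8` for the buffer where the export declares `*const ()` are assumptions of this sketch):

use std::os::raw::c_int;

#[link(name = "gxhash")] // assumes libgxhash is on the linker search path
extern "C" {
    fn gxhash0_64(buf: *const u8, len: usize, seed: c_int) -> u64;
    fn gxhash1_64(buf: *const u8, len: usize, seed: c_int) -> u64;
}

fn main() {
    let data = b"hello world";
    // SAFETY: `data` is valid for `data.len()` bytes for the duration of each call.
    let h0 = unsafe { gxhash0_64(data.as_ptr(), data.len(), 0) };
    let h1 = unsafe { gxhash1_64(data.as_ptr(), data.len(), 0) };
    println!("{h0:016x} {h1:016x}");
}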
From a2e3aaab1a7cd851e4e375a3da4e8a38ee2d6936 Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 29 Oct 2023 00:56:43 +0200 Subject: [PATCH 8/9] Add 128-bit gxhash for x86 --- .cargo/config | 2 + Cargo.toml | 8 ++- benches/fnv.rs | 14 ++++++ benches/throughput.rs | 25 ++++++---- ffi/Cargo.toml | 2 +- src/gxhash/mod.rs | 5 ++ src/gxhash/platform/mod.rs | 13 ++++- src/gxhash/platform/x86_128.rs | 91 ++++++++++++++++++++++++++++++++++ src/gxhash/platform/x86_256.rs | 17 ++++--- 9 files changed, 156 insertions(+), 21 deletions(-) create mode 100644 .cargo/config create mode 100644 benches/fnv.rs create mode 100644 src/gxhash/platform/x86_128.rs diff --git a/.cargo/config b/.cargo/config new file mode 100644 index 0000000..d5135e9 --- /dev/null +++ b/.cargo/config @@ -0,0 +1,2 @@ +[build] +rustflags = ["-C", "target-cpu=native"] \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 78e5fdb..4af710b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,8 +4,11 @@ authors = ["Olivier Giniaux"] version = "0.1.0" edition = "2021" -[profile.release] -debug = true +[features] +# The 256-bit state GxHash is faster for large inputs than the default 128-bit state implementation. +# Please note, however, that the 256-bit GxHash and the 128-bit GxHash don't generate the same hashes for the same input. +# Requires AVX2 and VAES (X86). +256-bit = [] [dependencies] rand = "0.8" [dev-dependencies] rstest = "0.18.2" criterion = { version = "0.5.1" } lazy_static = { version = "1.3" } +# Other hash algorithms, for comparison. ahash = "0.8.3" t1ha = "0.1.0" twox-hash = "1.6.3" diff --git a/benches/fnv.rs b/benches/fnv.rs new file mode 100644 index 0000000..cd4713c --- /dev/null +++ b/benches/fnv.rs @@ -0,0 +1,14 @@ +const INITIAL_STATE: u64 = 0xcbf29ce484222325; +const PRIME: u64 = 0x100000001b3; + +#[inline] +pub const fn fnv_hash(bytes: &[u8]) -> u64 { + let mut hash = INITIAL_STATE; + let mut i = 0; + while i < bytes.len() { + hash = hash ^ (bytes[i] as u64); + hash = hash.wrapping_mul(PRIME); + i += 1; + } + hash +} \ No newline at end of file diff --git a/benches/throughput.rs b/benches/throughput.rs index be5147a..acd3d0b 100644 --- a/benches/throughput.rs +++ b/benches/throughput.rs @@ -1,19 +1,19 @@ -#![feature(build_hasher_simple_hash_one)] - use std::time::Duration; use std::alloc::{alloc, dealloc, Layout}; use std::slice; -use criterion::measurement::{WallTime}; +use criterion::measurement::WallTime; use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput, PlotConfiguration, AxisScale, BenchmarkGroup, BenchmarkId}; -use gxhash::*; use rand::Rng; +use gxhash::*; +mod fnv; + fn benchmark<F>(c: &mut BenchmarkGroup<WallTime>, data: &[u8], name: &str, delegate: F) where F: Fn(&[u8], i32) -> u64 { for i in 1..16 { - let len = usize::pow(2, i); + let len = usize::pow(4, i); c.throughput(Throughput::Bytes(len as u64)); @@ -45,9 +45,9 @@ fn benchmark_all(c: &mut Criterion) { group.plot_config(plot_config); // GxHash0 - benchmark(&mut group, slice, "gxhash0", |data: &[u8], _: i32| -> u64 { - gxhash0_64(data, 0) - }); + // benchmark(&mut group, slice, "gxhash0", |data: &[u8], _: i32| -> u64 { + // gxhash0_64(data, 0) + // }); // GxHash1 benchmark(&mut group, slice, "gxhash1", |data: &[u8], _: i32| -> u64 { @@ -55,9 +55,9 @@ }); // AHash - let build_hasher = ahash::RandomState::with_seeds(0, 0, 0, 0); + let ahash_hasher = ahash::RandomState::with_seeds(0, 0, 0, 0); benchmark(&mut group, slice, "ahash", |data: &[u8], _: i32| -> u64 { - build_hasher.hash_one(data) + ahash_hasher.hash_one(data) }); // T1ha0 @@ -70,6 +70,11 @@ twox_hash::xxh3::hash64_with_seed(data, seed as u64) }); + // FNV-1a + benchmark(&mut group, slice, "fnv-1a", |data: &[u8], _: i32| -> u64 { + fnv::fnv_hash(data) + }); + group.finish(); // Free benchmark data diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index d744be9..c0fdc90 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [lib] name = "gxhash" -crate-type = ["cdylib"] +crate-type = ["cdylib", "staticlib"] [dependencies] gxhash = { path = "../", default-features = false } \ No newline at end of file diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index 0897436..75d57b3 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -30,6 +30,11 @@ pub fn gxhash1_64(input: &[u8], seed: i32) -> u64 { unsafe { let p = &gxhash::<1>(input, seed) as *const state as *const u64; *p + + // An alternative idea is to extract the center, to avoid xoring for the 256-bit state + // let p = &gxhash::<1>(input, seed) as *const state as *const u8; + // let shifted_ptr = p.offset(3) as *const u64; + // *shifted_ptr } } diff --git a/src/gxhash/platform/mod.rs b/src/gxhash/platform/mod.rs index dced5b6..230c2b2 100644 --- a/src/gxhash/platform/mod.rs +++ b/src/gxhash/platform/mod.rs @@ -2,8 +2,19 @@ #[path = "arm_128.rs"] pub mod platform; -#[cfg(target_arch = "x86_64")] +#[cfg(all( + feature = "256-bit", + target_arch = "x86_64", + target_feature = "avx2") +)] #[path = "x86_256.rs"] pub mod platform; 
+#[cfg(all( + not(feature = "256-bit"), + target_arch = "x86_64" +))] +#[path = "x86_128.rs"] +pub mod platform; + pub use platform::*; \ No newline at end of file diff --git a/src/gxhash/platform/x86_128.rs b/src/gxhash/platform/x86_128.rs new file mode 100644 index 0000000..d9f71dc --- /dev/null +++ b/src/gxhash/platform/x86_128.rs @@ -0,0 +1,91 @@ +use core::arch::x86_64::*; +use std::mem::size_of; + +pub type state = __m128i; + +#[inline] +pub unsafe fn create_empty() -> state { + _mm_setzero_si128() +} + +#[inline] +pub unsafe fn prefetch(p: *const state) { + _mm_prefetch(p as *const i8, 3); +} + +#[inline] +pub unsafe fn load_unaligned(p: *const state) -> state { + _mm_loadu_si128(p) +} + +#[inline] +pub unsafe fn get_partial(p: *const state, len: isize) -> state { + let partial_vector: state; + // Safety check + if check_same_page(p) { + let indices = _mm_setr_epi8( + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + ); + + let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + partial_vector = _mm_and_si128(_mm_loadu_si128(p), mask); + } else { + partial_vector = get_partial_safe(p as *const u8, len as usize) + } + // Prevents padded zeroes from introducing bias + _mm_add_epi32(partial_vector, _mm_set1_epi32(len as i32)) +} + +#[inline] +unsafe fn check_same_page(ptr: *const state) -> bool { + let address = ptr as usize; + // Mask to keep only the last 12 bits (the offset within a 4 KiB page) + let offset_within_page = address & 0xFFF; + // Check that a full vector read from the current offset stays within the page + offset_within_page <= (4096 - size_of::<state>() - 1) +} + +#[inline] +unsafe fn get_partial_safe(data: *const u8, len: usize) -> state { + // Temporary buffer filled with zeros + let mut buffer = [0u8; size_of::<state>()]; + // Copy data into the buffer + std::ptr::copy(data, buffer.as_mut_ptr(), len); + // Load the buffer into a __m128i vector + _mm_loadu_si128(buffer.as_ptr() as *const state) +} + +#[inline] +#[allow(overflowing_literals)] +pub unsafe fn compress_1(a: state, b: state) -> state { + let keys_1 = _mm_set_epi32(0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542); + let keys_2 = _mm_set_epi32(0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9); + + // 2+1 rounds of AES for compression + let mut b = _mm_aesenc_si128(b, keys_1); + b = _mm_aesenc_si128(b, keys_2); + return _mm_aesenclast_si128(a, b); +} + +#[inline] +#[allow(overflowing_literals)] +pub unsafe fn compress_0(a: state, b: state) -> state { + return _mm_aesenc_si128(a, b); +} + +#[inline] +#[allow(overflowing_literals)] +pub unsafe fn finalize(hash: state, seed: i32) -> state { + // Hardcoded AES keys + let keys_1 = _mm_set_epi32(0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85); + let keys_2 = _mm_set_epi32(0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F); + let keys_3 = _mm_set_epi32(0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32); + + // 4 rounds of AES + let mut hash = _mm_aesenc_si128(hash, _mm_set1_epi32(seed)); + hash = _mm_aesenc_si128(hash, keys_1); + hash = _mm_aesenc_si128(hash, keys_2); + hash = _mm_aesenclast_si128(hash, keys_3); + + hash +} \ No newline at end of file diff --git a/src/gxhash/platform/x86_256.rs b/src/gxhash/platform/x86_256.rs index 2f557f2..c5ae5ac 100644 --- a/src/gxhash/platform/x86_256.rs +++ b/src/gxhash/platform/x86_256.rs @@ -10,7 +10,7 @@ pub unsafe fn create_empty() -> state { #[inline] pub unsafe fn prefetch(p: *const state) { - _mm_prefetch(p as *const i8, 3); + //_mm_prefetch(p as *const i8, 3); } #[inline] @@ -24,10 +24,7 @@ pub unsafe fn get_partial(p: *const state, len: isize) -> state { 
// Safety check if check_same_page(p) { let indices = _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31 + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); let mask = _mm256_cmpgt_epi8(_mm256_set1_epi8(len as i8), indices); @@ -36,7 +33,7 @@ pub unsafe fn get_partial(p: *const state, len: isize) -> state { partial_vector = get_partial_safe(p as *const u8, len as usize) } // Prevents padded zeroes from introducing bias - _mm256_add_epi32(partial_vector, _mm256_set1_epi32(len as i32)) + _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8)) } @@ -60,7 +57,7 @@ unsafe fn get_partial_safe(data: *const u8, len: usize) -> state { #[inline] #[allow(overflowing_literals)] -pub unsafe fn compress(a: state, b: state) -> state { +pub unsafe fn compress_1(a: state, b: state) -> state { let keys_1 = _mm256_set_epi32(0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542, 0x4155EE07, 0xC897CCE2, 0x780AF2C3, 0x8A72B781); let keys_2 = _mm256_set_epi32(0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9, 0x7A83D76B, 0xB1E8F9F0, 0x028925A8, 0x3B9A4E71); @@ -70,6 +67,12 @@ pub unsafe fn compress_1(a: state, b: state) -> state { return _mm256_aesenclast_epi128(a, b); } +#[inline] +#[allow(overflowing_literals)] +pub unsafe fn compress_0(a: state, b: state) -> state { + return _mm256_aesenc_epi128(a, b); +} + #[inline] #[allow(overflowing_literals)] pub unsafe fn finalize(hash: state, seed: i32) -> state {
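Note on patch 8: `check_same_page` is what makes the branchless masked load in `get_partial` sound. Reading a full 16-byte `__m128i` at `p` may touch up to 15 bytes past the end of the input; that is guaranteed not to fault only if those bytes lie on the same page as `p`, hence the test on the pointer's offset within its page (`address & 0xFFF`, assuming 4 KiB pages). The patch keeps an extra `- 1` of headroom, which is slightly conservative but safe; anything past the threshold falls back to `get_partial_safe`, which copies through a zeroed stack buffer. A scalar sketch of the decision logic:

const PAGE_SIZE: usize = 4096; // assumption: 4 KiB pages, as in the patch
const VECTOR_SIZE: usize = 16; // size of __m128i

/// True when a full 16-byte load at `addr` cannot cross a page boundary,
/// mirroring `check_same_page` (including its conservative extra `- 1`).
fn full_load_is_safe(addr: usize) -> bool {
    let offset_within_page = addr & (PAGE_SIZE - 1);
    offset_within_page <= PAGE_SIZE - VECTOR_SIZE - 1
}

fn main() {
    assert!(full_load_is_safe(0x1000));  // start of a page
    assert!(full_load_is_safe(0x1FEF));  // offset 4079: last accepted position
    assert!(!full_load_is_safe(0x1FF1)); // offset 4081: the load would cross pages
    // Offset 4080 would still be in-page (4080 + 16 = 4096), but the
    // extra `- 1` rejects it and routes it through the safe copy path.
    assert!(!full_load_is_safe(0x1FF0));
}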
From 5032fa439205f84daa9ae99d6e24951f8dc07512 Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 29 Oct 2023 01:08:41 +0200 Subject: [PATCH 9/9] Add highwayhash to benchmark --- Cargo.toml | 1 + benches/throughput.rs | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4af710b..42ac185 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ lazy_static = { version = "1.3" } ahash = "0.8.3" t1ha = "0.1.0" twox-hash = "1.6.3" +highway = "1.1.0" [[bench]] name = "throughput" diff --git a/benches/throughput.rs b/benches/throughput.rs index acd3d0b..48f9609 100644 --- a/benches/throughput.rs +++ b/benches/throughput.rs @@ -12,20 +12,19 @@ fn benchmark<F>(c: &mut BenchmarkGroup<WallTime>, data: &[u8], name: &str, delegate: F) where F: Fn(&[u8], i32) -> u64 { - for i in 1..16 { + for i in 1.. { let len = usize::pow(4, i); + if len > data.len() { + break; + } c.throughput(Throughput::Bytes(len as u64)); - let aligned_slice = &data[0..len]; - c.bench_with_input(BenchmarkId::new(name, len), aligned_slice, |bencher, input| { + let slice = &data[0..len]; // Aligned + // let slice = &data[1..len]; // Unaligned + c.bench_with_input(BenchmarkId::new(name, len), slice, |bencher, input| { bencher.iter(|| black_box(delegate(input, 0))) }); - - // let unaligned_slice = &slice[1..len]; - // group.bench_with_input(format!("{} bytes (unaligned)", len), unaligned_slice, |bencher, input| { - // bencher.iter(|| black_box(gxhash(input))) - // }); } } @@ -50,7 +49,7 @@ fn benchmark_all(c: &mut Criterion) { // }); // GxHash1 - benchmark(&mut group, slice, "gxhash1", |data: &[u8], _: i32| -> u64 { + benchmark(&mut group, slice, "gxhash", |data: &[u8], _: i32| -> u64 { gxhash1_64(data, 0) }); @@ -70,6 +69,12 @@ fn benchmark_all(c: &mut Criterion) { twox_hash::xxh3::hash64_with_seed(data, seed as u64) }); + // HighwayHash + benchmark(&mut group, slice, "highwayhash", |data: &[u8], _: i32| -> u64 { + use highway::{HighwayHasher, HighwayHash}; + HighwayHasher::default().hash64(data) + }); + // FNV-1a benchmark(&mut group, slice, "fnv-1a", |data: &[u8], _: i32| -> u64 { fnv::fnv_hash(data)