From e270f6fcd8602a4380f1d9a09f59da8bbf59cbfc Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 16:48:14 +0200 Subject: [PATCH 1/9] Go for inline fallthrough --- src/gxhash/mod.rs | 154 +++++++++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 70 deletions(-) diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index c0ac0c8..c51aaab 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -4,7 +4,7 @@ mod platform; pub use platform::*; -#[inline] // To be disabled when profiling +#[inline(always)] // To be disabled when profiling pub fn gxhash0_32(input: &[u8], seed: i32) -> u32 { unsafe { let p = &gxhash::<0>(input, seed) as *const state as *const u32; @@ -12,7 +12,7 @@ pub fn gxhash0_32(input: &[u8], seed: i32) -> u32 { } } -#[inline] // To be disabled when profiling +#[inline(always)] // To be disabled when profiling pub fn gxhash0_64(input: &[u8], seed: i32) -> u64 { unsafe { let p = &gxhash::<0>(input, seed) as *const state as *const u64; @@ -20,7 +20,7 @@ pub fn gxhash0_64(input: &[u8], seed: i32) -> u64 { } } -#[inline] // To be disabled when profiling +#[inline(always)] // To be disabled when profiling pub fn gxhash1_32(input: &[u8], seed: i32) -> u32 { unsafe { let p = &gxhash::<1>(input, seed) as *const state as *const u32; @@ -28,7 +28,7 @@ pub fn gxhash1_32(input: &[u8], seed: i32) -> u32 { } } -#[inline] // To be disabled when profiling +#[inline(always)] // To be disabled when profiling pub fn gxhash1_64(input: &[u8], seed: i32) -> u64 { unsafe { let p = &gxhash::<1>(input, seed) as *const state as *const u64; @@ -36,90 +36,104 @@ pub fn gxhash1_64(input: &[u8], seed: i32) -> u64 { } } -#[inline] +const VECTOR_SIZE: isize = std::mem::size_of::<state>() as isize; + + +#[inline(always)] +unsafe fn compress<const N: usize>(a: state, b: state) -> state { + match N { + 0 => compress_0(a, b), + 1 => compress_1(a, b), + _ => compress_1(a, b) + } +} + +#[inline(always)] fn gxhash<const N: usize>(input: &[u8], seed: i32) -> state { unsafe { - const VECTOR_SIZE: isize = std::mem::size_of::<state>() as isize; - let len: isize = input.len() as isize; let p = input.as_ptr() as *const i8; - let mut v = p as *const state; + let v = p as *const state; + + let hash_vector = if len <= 16 { + get_partial(v, len) + } else if len < 128 { + gxhash_process_1::<N>(v, create_empty(), len) + } else { + gxhash_process_8::<N>(v, create_empty(), len) + }; - // Quick exit - if len <= VECTOR_SIZE { - let partial_vector = get_partial(v, len); - return finalize(partial_vector, seed); - } + finalize(hash_vector, seed) + } } - let mut end_address: usize; - let mut remaining_blocks_count: isize = len / VECTOR_SIZE; - let mut hash_vector: state = create_empty(); +macro_rules! load_unaligned { + ($ptr:ident, $($var:ident),+) => { + $( + #[allow(unused_mut)] + let mut $var = load_unaligned($ptr); + $ptr = $ptr.offset(1); + )+ + }; +} - // Choose compression function depending on version. - // Lower is faster, higher is more collision resistant. - let c = match N { - 0 => compress_0, - 1 => compress_1, - _ => compress_1 - }; +#[inline(always)] +unsafe fn gxhash_process_8<const N: usize>(mut v: *const state, hash_vector: state, remaining_bytes: isize) -> state { - macro_rules! load_unaligned { - ($($var:ident),+) => { - $( - #[allow(unused_mut)] - let mut $var = load_unaligned(v); - v = v.offset(1); - )+ - }; - } + const UNROLL_FACTOR: isize = 8; - const UNROLL_FACTOR: isize = 8; - if len >= VECTOR_SIZE * UNROLL_FACTOR { + let unrollable_blocks_count: isize = remaining_bytes / (VECTOR_SIZE * UNROLL_FACTOR) * UNROLL_FACTOR; + let end_address = v.offset(unrollable_blocks_count as isize) as usize; - let unrollable_blocks_count: isize = (len / (VECTOR_SIZE * UNROLL_FACTOR)) * UNROLL_FACTOR; - end_address = v.offset(unrollable_blocks_count) as usize; - - load_unaligned!(s0, s1, s2, s3, s4, s5, s6, s7); - - while (v as usize) < end_address { - - load_unaligned!(v0, v1, v2, v3, v4, v5, v6, v7); - - prefetch(v); - - s0 = c(s0, v0); - s1 = c(s1, v1); - s2 = c(s2, v2); - s3 = c(s3, v3); - s4 = c(s4, v4); - s5 = c(s5, v5); - s6 = c(s6, v6); - s7 = c(s7, v7); - } + load_unaligned!(v, s0, s1, s2, s3, s4, s5, s6, s7); + + while (v as usize) < end_address { - let a = c(c(s0, s1), c(s2, s3)); - let b = c(c(s4, s5), c(s6, s7)); - hash_vector = c(a, b); + load_unaligned!(v, v0, v1, v2, v3, v4, v5, v6, v7); + + prefetch(v); + + s0 = compress::<N>(s0, v0); + s1 = compress::<N>(s1, v1); + s2 = compress::<N>(s2, v2); + s3 = compress::<N>(s3, v3); + s4 = compress::<N>(s4, v4); + s5 = compress::<N>(s5, v5); + s6 = compress::<N>(s6, v6); + s7 = compress::<N>(s7, v7); + } - remaining_blocks_count -= unrollable_blocks_count; - } + let a = compress::<N>(compress::<N>(s0, s1), compress::<N>(s2, s3)); + let b = compress::<N>(compress::<N>(s4, s5), compress::<N>(s6, s7)); + let hash_vector = compress::<N>(hash_vector, compress::<N>(a, b)); - end_address = v.offset(remaining_blocks_count) as usize; + gxhash_process_1::<N>(v, hash_vector, remaining_bytes - unrollable_blocks_count * VECTOR_SIZE) +} - while likely((v as usize) < end_address) { - load_unaligned!(v0); - hash_vector = c(hash_vector, v0); - } +#[inline(always)] +unsafe fn gxhash_process_1<const N: usize>(mut v: *const state, hash_vector: state, remaining_bytes: isize) -> state { + + let end_address = v.offset((remaining_bytes / VECTOR_SIZE) as isize) as usize; - let remaining_bytes = len & (VECTOR_SIZE - 1); - if likely(remaining_bytes > 0) { - let partial_vector = get_partial(v, remaining_bytes); - hash_vector = c(hash_vector, partial_vector); - } + let mut hash_vector = hash_vector; + while (v as usize) < end_address { + load_unaligned!(v, v0); + hash_vector = compress::<N>(hash_vector, v0); + } - finalize(hash_vector, seed) + let remaining_bytes = remaining_bytes & (VECTOR_SIZE - 1); + if remaining_bytes > 0 { + hash_vector = gxhash_process_last::<N>(v, hash_vector, remaining_bytes); } + hash_vector +} + +#[inline(always)] +unsafe fn gxhash_process_last<const N: usize>(v: *const state, hash_vector: state, remaining_bytes: isize) -> state { + + let partial_vector = get_partial(v, remaining_bytes); + compress::<N>(hash_vector, partial_vector) } #[cfg(test)]
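Note on patch 1: the old code selected the compression function at runtime through a local binding (`let c = match N {...}`); the new `compress::<N>` wrapper moves that choice to compile time. Because `N` is a const generic, the `match N` is resolved during monomorphization and the optimizer keeps exactly one arm, so every `compress::<N>` call site becomes a direct call to `compress_0` or `compress_1`, which is what lets the whole fallthrough chain stay inlined. A minimal, self-contained sketch of the pattern (illustrative names only, not the gxhash code):

#[inline(always)]
fn mix<const N: usize>(a: u64, b: u64) -> u64 {
    // `N` is known at compile time, so each monomorphized copy of `mix`
    // is compiled with exactly one arm of this match; no runtime branch remains.
    match N {
        0 => a ^ b,                                    // fast variant
        _ => (a ^ b).wrapping_mul(0x9E3779B97F4A7C15), // stronger variant
    }
}

fn hash<const N: usize>(words: &[u64]) -> u64 {
    words.iter().fold(0, |acc, &w| mix::<N>(acc, w))
}

fn main() {
    let words = [1u64, 2, 3];
    println!("{:x} {:x}", hash::<0>(&words), hash::<1>(&words));
}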
From 375f205dc59417442404fcc2db3f379d15253921 Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 17:51:35 +0200 Subject: [PATCH 2/9] Parametrize sparse collision test with rstest crate --- Cargo.toml | 1 + src/gxhash/mod.rs | 36 ++++++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ba71849..604e391 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ debug = true rand = "0.8" [dev-dependencies] +rstest = "0.18.2" criterion = { version = "0.5.1" } lazy_static = { version = "1.3" } ahash = "0.8.3" diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index c51aaab..1a8d276 
100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -38,7 +38,6 @@ pub fn gxhash1_64(input: &[u8], seed: i32) -> u64 { const VECTOR_SIZE: isize = std::mem::size_of::<state>() as isize; - #[inline(always)] unsafe fn compress<const N: usize>(a: state, b: state) -> state { match N { @@ -141,6 +140,7 @@ mod tests { use super::*; use rand::Rng; + use rstest::rstest; #[test] fn all_blocks_are_consumed() { @@ -174,13 +174,23 @@ } } - #[test] + #[rstest] + #[case(16, 9)] + #[case(24, 8)] + #[case(32, 7)] + #[case(40, 6)] + #[case(56, 5)] + #[case(72, 5)] + #[case(96, 4)] + #[case(160, 4)] + #[case(256, 3)] + #[case(512, 3)] + #[case(2048, 2)] // Test collisions for all possible inputs of size n bits with m bits set - fn test_collisions_bits() { - let mut bytes = [0u8; 120]; - let bits_to_set = 2; + // Equivalent to SMHasher "Sparse" test + fn test_collisions_bits(#[case] size_bits: usize, #[case] bits_to_set: usize) { + let mut bytes = vec![0u8; size_bits / 8]; - let n = bytes.len() * 8; let mut digits: Vec<usize> = vec![0; bits_to_set]; for i in 0..bits_to_set { @@ -188,7 +198,7 @@ } let mut i = 0; - let mut set = std::collections::HashSet::new(); + let mut set = ahash::AHashSet::new(); 'stop: loop { @@ -199,11 +209,7 @@ } i += 1; - set.insert(gxhash0_64(&bytes, 0)); - // for &byte in bytes.iter() { - // print!("{:08b}", byte); - // } - // println!(); + set.insert(gxhash1_64(&bytes, 0)); // Reset bits for d in digits.iter() { @@ -213,16 +219,18 @@ // Increment the rightmost digit for i in (0..bits_to_set).rev() { digits[i] += 1; - if digits[i] == n - bits_to_set + i + 1 { + if digits[i] == size_bits - bits_to_set + i + 1 { if i == 0 { break 'stop; } + // Reset digit. It will be set to an appropriate value after. digits[i] = 0; } else { break; } } + // Make sure digits are coherent for i in 1..bits_to_set { if digits[i] < digits[i - 1] { digits[i] = digits[i - 1] + 1; } } } - println!("count: {}, collisions: {}", i, i - set.len()); + println!("{}-bit keys with {} bits set. 
Combinations: {}, Collisions: {}", size_bits, bits_to_set, i, i - set.len()); assert_eq!(0, i - set.len(), "Collisions!"); }
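Note on patch 2: the parameterized test reproduces SMHasher's "Sparse" test: for each `(size_bits, bits_to_set)` case it hashes every `size_bits`-bit key with exactly `bits_to_set` bits set (C(n, k) keys in total) and asserts that no two hashes collide. The `digits` vector stores the k set-bit positions in increasing order and is advanced to the next lexicographic combination, odometer-style. A standalone sketch of that enumeration (a hypothetical helper, not code from the patch):

// Visit all k-combinations of positions 0..n in lexicographic order.
fn for_each_combination(n: usize, k: usize, mut visit: impl FnMut(&[usize])) {
    let mut digits: Vec<usize> = (0..k).collect(); // first combination: 0, 1, .., k-1
    loop {
        visit(&digits);
        // Find the rightmost digit that can still be incremented.
        let mut i = k;
        while i > 0 && digits[i - 1] == n - k + (i - 1) {
            i -= 1;
        }
        if i == 0 {
            return; // last combination reached
        }
        digits[i - 1] += 1;
        // Keep the remaining digits coherent (strictly increasing).
        for j in i..k {
            digits[j] = digits[j - 1] + 1;
        }
    }
}

fn main() {
    let mut count = 0u64;
    for_each_combination(16, 2, |_positions| count += 1);
    assert_eq!(count, 120); // C(16, 2) = 120 distinct 16-bit keys with 2 bits set
}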
From b57b78b411999c3a7bad62d599afd8984dd753dd Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 17:51:59 +0200 Subject: [PATCH 3/9] Remove C implementation --- c/gxhash.c | 136 ----------------------------------------------------- c/gxhash.h | 9 ---- 2 files changed, 145 deletions(-) delete mode 100644 c/gxhash.c delete mode 100644 c/gxhash.h diff --git a/c/gxhash.c b/c/gxhash.c deleted file mode 100644 index 2cd0762..0000000 --- a/c/gxhash.c +++ /dev/null @@ -1,136 +0,0 @@ -#include <stdint.h> -#include <arm_neon.h> -#include "gxhash.h" - -typedef int8x16_t state; - -union ReinterpretUnion { - int64x2_t int64; - int32x4_t int32; - uint32x4_t uint32; - int8x16_t int8; - uint8x16_t uint8; -}; - -static inline state create_empty() { - return vdupq_n_s8(0); -} - -static inline void prefetch(const state* p) { - // __pld(p); // Uncomment if needed -} - -static inline state load_unaligned(const state* p) { - return vld1q_s8((const int8_t*)p); -} - -static inline state get_partial(const state* p, int len) { - static const int8_t MASK[32] = { - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - }; - int8x16_t mask = vld1q_s8(&MASK[16 - len]); - return vandq_s8(load_unaligned(p), mask); - } - -static inline uint8x16_t aes_encrypt(uint8x16_t data, uint8x16_t keys) { - uint8x16_t encrypted = vaeseq_u8(data, vdupq_n_u8(0)); - uint8x16_t mixed = vaesmcq_u8(encrypted); - return veorq_u8(mixed, keys); -} - -static inline uint8x16_t aes_encrypt_last(uint8x16_t data, uint8x16_t keys) { - uint8x16_t encrypted = vaeseq_u8(data, vdupq_n_u8(0)); - return veorq_u8(encrypted, keys); -} - -static inline state compress(state a, state b) { - union ReinterpretUnion au = { .int8 = a }; - union ReinterpretUnion bu = { .int8 = b }; - union ReinterpretUnion result = { .uint8 = aes_encrypt_last(au.uint8, bu.uint8) }; - return result.int8; -} - -static inline uint64_t finalize(state hash) { - static const uint32_t salt1_data[4] = {0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85}; - static const uint32_t salt2_data[4] = {0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F}; - static const uint32_t salt3_data[4] = {0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32}; - - uint32x4_t salt1 = vld1q_u32(salt1_data); - uint32x4_t salt2 = vld1q_u32(salt2_data); - uint32x4_t salt3 = vld1q_u32(salt3_data); - - union ReinterpretUnion hash_u = { .int8 = hash }; - hash_u.uint8 = aes_encrypt(hash_u.uint8, vreinterpretq_u8_u32(salt1)); - hash_u.uint8 = aes_encrypt(hash_u.uint8, vreinterpretq_u8_u32(salt2)); - hash_u.uint8 = aes_encrypt_last(hash_u.uint8, vreinterpretq_u8_u32(salt3)); - - return *(uint64_t*)&hash_u.int8; -} - -uint64_t gxhash(const uint8_t* input, int len) { - const int VECTOR_SIZE = sizeof(state); - const state* p = (const state*)input; - const state* v = p; - const state* end_address; - int remaining_blocks_count = len / VECTOR_SIZE; - state hash_vector = create_empty(); - - const int UNROLL_FACTOR = 8; - if (len >= VECTOR_SIZE * UNROLL_FACTOR) { - int unrollable_blocks_count = (len / (VECTOR_SIZE * UNROLL_FACTOR)) * UNROLL_FACTOR; - end_address = v + unrollable_blocks_count; - - state s0 = load_unaligned(v++); - state s1 = load_unaligned(v++); - state s2 = load_unaligned(v++); - state s3 = load_unaligned(v++); - state s4 = load_unaligned(v++); - state s5 = load_unaligned(v++); - state s6 = load_unaligned(v++); - state s7 = load_unaligned(v++); - - while (v < end_address) { - state v0 = load_unaligned(v++); - state v1 = load_unaligned(v++); - state v2 = load_unaligned(v++); - state v3 = load_unaligned(v++); - state v4 = load_unaligned(v++); - state v5 = load_unaligned(v++); - state v6 = load_unaligned(v++); - state v7 = load_unaligned(v++); - - prefetch(v); - - s0 = compress(s0, v0); - s1 = compress(s1, v1); - s2 = compress(s2, v2); - s3 = compress(s3, v3); - s4 = compress(s4, v4); - s5 = compress(s5, v5); - s6 = compress(s6, v6); - s7 = compress(s7, v7); - } - - state a = compress(compress(s0, s1), compress(s2, s3)); - state b = compress(compress(s4, s5), compress(s6, s7)); - hash_vector = compress(a, b); - - remaining_blocks_count -= unrollable_blocks_count; - } - - end_address = v + remaining_blocks_count; - - while (v < end_address) { - state v0 = load_unaligned(v++); - hash_vector = compress(hash_vector, v0); - } - - int remaining_bytes = len % VECTOR_SIZE; - if (remaining_bytes > 0) { - state partial_vector = get_partial(v, remaining_bytes); - hash_vector = compress(hash_vector, partial_vector); - } - - return finalize(hash_vector); -} diff --git a/c/gxhash.h b/c/gxhash.h deleted file mode 100644 index 10c28ea..0000000 --- a/c/gxhash.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef GXHASH_H -#define GXHASH_H - -#include <stdint.h> - -// Function prototype for gxhash -uint64_t gxhash(const uint8_t* input, int len); - -#endif // GXHASH_H
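Note on patch 3: the deleted C port implements the same algorithm as the Rust version, so only one implementation is left to maintain. Its `get_partial` shows the partial-load trick shared by both: a 32-byte constant of sixteen 0xFF bytes followed by sixteen zero bytes, from which `vld1q_s8(&MASK[16 - len])` reads a 16-byte window containing exactly `len` set bytes, so out-of-range lanes are zeroed with a single AND. A portable sketch of that sliding-window masking, without intrinsics:

/// Keep the first `len` bytes of `block` (len <= 16) and zero the rest,
/// mirroring the C code's `vandq_s8(load_unaligned(p), vld1q_s8(&MASK[16 - len]))`.
fn mask_partial(block: [u8; 16], len: usize) -> [u8; 16] {
    const MASK: [u8; 32] = {
        let mut m = [0u8; 32];
        let mut i = 0;
        while i < 16 {
            m[i] = 0xFF; // first half ones, second half zeros
            i += 1;
        }
        m
    };
    let window = &MASK[16 - len..32 - len]; // exactly `len` ones, then zeros
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = block[i] & window[i];
    }
    out
}

fn main() {
    let masked = mask_partial([0xAB; 16], 3);
    assert_eq!(&masked[..3], &[0xAB; 3][..]);
    assert_eq!(&masked[3..], &[0u8; 13][..]);
}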
From c84191e677311b61b1f9e73a345dcdf3abfe5b63 Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 18:00:49 +0200 Subject: [PATCH 4/9] Small cleanup --- src/gxhash/mod.rs | 49 +++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index 1a8d276..12f4304 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -1,8 +1,5 @@ -use std::intrinsics::likely; - mod platform; - -pub use platform::*; +use platform::*; #[inline(always)] // To be disabled when profiling pub fn gxhash0_32(input: &[u8], seed: i32) -> u32 { @@ -51,16 +48,15 @@ unsafe fn compress<const N: usize>(a: state, b: state) -> state { fn gxhash<const N: usize>(input: &[u8], seed: i32) -> state { unsafe { let len: isize = input.len() as isize; + let ptr = input.as_ptr() as *const state; - let p = input.as_ptr() as *const i8; - let v = p as *const state; - - let hash_vector = if len <= 16 { - get_partial(v, len) - } else if len < 128 { - gxhash_process_1::<N>(v, create_empty(), len) + // Lower sizes first, as comparison/branching overhead will become negligible as input size grows. + let hash_vector = if len <= VECTOR_SIZE { + gxhash_process_last::<N>(ptr, create_empty(), len) + } else if len < VECTOR_SIZE * 8 { + gxhash_process_1::<N>(ptr, create_empty(), len) } else { - gxhash_process_8::<N>(v, create_empty(), len) + gxhash_process_8::<N>(ptr, create_empty(), len) }; finalize(hash_vector, seed) } } macro_rules! load_unaligned { ($ptr:ident, $($var:ident),+) => { $( #[allow(unused_mut)] let mut $var = load_unaligned($ptr); $ptr = $ptr.offset(1); )+ }; } #[inline(always)] -unsafe fn gxhash_process_8<const N: usize>(mut v: *const state, hash_vector: state, remaining_bytes: isize) -> state { +unsafe fn gxhash_process_8<const N: usize>(mut ptr: *const state, hash_vector: state, remaining_bytes: isize) -> state { const UNROLL_FACTOR: isize = 8; let unrollable_blocks_count: isize = remaining_bytes / (VECTOR_SIZE * UNROLL_FACTOR) * UNROLL_FACTOR; - let end_address = v.offset(unrollable_blocks_count as isize) as usize; + let end_address = ptr.offset(unrollable_blocks_count as isize) as usize; - load_unaligned!(v, s0, s1, s2, s3, s4, s5, s6, s7); + load_unaligned!(ptr, s0, s1, s2, s3, s4, s5, s6, s7); - while (v as usize) < end_address { + while (ptr as usize) < end_address { - load_unaligned!(v, v0, v1, v2, v3, v4, v5, v6, v7); + load_unaligned!(ptr, v0, v1, v2, v3, v4, v5, v6, v7); - prefetch(v); + prefetch(ptr); s0 = compress::<N>(s0, v0); s1 = compress::<N>(s1, v1); s2 = compress::<N>(s2, v2); s3 = compress::<N>(s3, v3); s4 = compress::<N>(s4, v4); s5 = compress::<N>(s5, v5); s6 = compress::<N>(s6, v6); s7 = compress::<N>(s7, v7); @@ -107,31 +103,30 @@ let a = compress::<N>(compress::<N>(s0, s1), compress::<N>(s2, s3)); let b = compress::<N>(compress::<N>(s4, s5), compress::<N>(s6, s7)); let hash_vector = compress::<N>(hash_vector, compress::<N>(a, b)); - gxhash_process_1::<N>(v, hash_vector, remaining_bytes - unrollable_blocks_count * VECTOR_SIZE) + gxhash_process_1::<N>(ptr, hash_vector, remaining_bytes - unrollable_blocks_count * VECTOR_SIZE) } #[inline(always)] -unsafe fn gxhash_process_1<const N: usize>(mut v: *const state, hash_vector: state, remaining_bytes: isize) -> state { +unsafe fn gxhash_process_1<const N: usize>(mut ptr: *const state, hash_vector: state, remaining_bytes: isize) -> state { - let end_address = v.offset((remaining_bytes / VECTOR_SIZE) as isize) as usize; + let end_address = ptr.offset((remaining_bytes / VECTOR_SIZE) as isize) as usize; let mut hash_vector = hash_vector; - while (v as usize) < end_address { - load_unaligned!(v, v0); + while (ptr as usize) < end_address { + load_unaligned!(ptr, v0); hash_vector = compress::<N>(hash_vector, v0); } let remaining_bytes = remaining_bytes & (VECTOR_SIZE - 1); if remaining_bytes > 0 { - hash_vector = gxhash_process_last::<N>(v, hash_vector, remaining_bytes); + hash_vector = gxhash_process_last::<N>(ptr, hash_vector, remaining_bytes); } hash_vector } #[inline(always)] -unsafe fn gxhash_process_last<const N: usize>(v: *const state, hash_vector: state, remaining_bytes: isize) -> state { - - let partial_vector = get_partial(v, remaining_bytes); +unsafe fn gxhash_process_last<const N: usize>(ptr: *const state, hash_vector: state, remaining_bytes: isize) -> state { + let partial_vector = get_partial(ptr, remaining_bytes); compress::<N>(hash_vector, partial_vector) }
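Note on patch 4: the `load_unaligned!` macro (kept from patch 1) takes the pointer identifier as its first argument, `ptr` here and `v` before, so the same macro works in any function regardless of what the cursor variable is called. Each listed variable becomes one load-and-advance pair. A standalone model of what `load_unaligned!(ptr, v0, v1)` expands to, with a plain integer standing in for the SIMD `state` type:

type State = u128; // stand-in for the SIMD vector type in this sketch

unsafe fn load_unaligned(p: *const State) -> State {
    p.read_unaligned()
}

fn main() {
    let data: [State; 2] = [1, 2];
    unsafe {
        let mut ptr = data.as_ptr();
        // Expansion of `load_unaligned!(ptr, v0, v1)`:
        #[allow(unused_mut)]
        let mut v0 = load_unaligned(ptr);
        ptr = ptr.offset(1); // advance by one whole vector
        #[allow(unused_mut)]
        let mut v1 = load_unaligned(ptr);
        ptr = ptr.offset(1);
        let _ = ptr; // the real macro leaves the cursor ready for the next load
        assert_eq!((v0, v1), (1, 2));
    }
}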
From b0f866b046ab48e83f70f629ec5651bb4ce71c7d Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Mon, 23 Oct 2023 00:21:40 +0200 Subject: [PATCH 5/9] Go for tmp construction --- src/gxhash/mod.rs | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index 12f4304..509bf45 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -53,6 +53,8 @@ fn gxhash<const N: usize>(input: &[u8], seed: i32) -> state { unsafe { // Lower sizes first, as comparison/branching overhead will become negligible as input size grows. let hash_vector = if len <= VECTOR_SIZE { gxhash_process_last::<N>(ptr, create_empty(), len) + } else if len <= VECTOR_SIZE * 2 { + gxhash_process_last::<N>(ptr.offset(1), compress::<N>(*ptr, create_empty()), len - VECTOR_SIZE) } else if len < VECTOR_SIZE * 8 { gxhash_process_1::<N>(ptr, create_empty(), len) } else { gxhash_process_8::<N>(ptr, create_empty(), len) }; @@ -80,28 +82,24 @@ unsafe fn gxhash_process_8<const N: usize>(mut ptr: *const state, hash_vector: s let unrollable_blocks_count: isize = remaining_bytes / (VECTOR_SIZE * UNROLL_FACTOR) * UNROLL_FACTOR; let end_address = ptr.offset(unrollable_blocks_count as isize) as usize; - - load_unaligned!(ptr, s0, s1, s2, s3, s4, s5, s6, s7); - + + let mut hash_vector = hash_vector; while (ptr as usize) < end_address { load_unaligned!(ptr, v0, v1, v2, v3, v4, v5, v6, v7); prefetch(ptr); - s0 = compress::<N>(s0, v0); - s1 = compress::<N>(s1, v1); - s2 = compress::<N>(s2, v2); - s3 = compress::<N>(s3, v3); - s4 = compress::<N>(s4, v4); - s5 = compress::<N>(s5, v5); - s6 = compress::<N>(s6, v6); - s7 = compress::<N>(s7, v7); - } + v0 = compress::<0>(v0, v1); + v0 = compress::<0>(v0, v2); + v0 = compress::<0>(v0, v3); + v0 = compress::<0>(v0, v4); + v0 = compress::<0>(v0, v5); + v0 = compress::<0>(v0, v6); + v0 = compress::<0>(v0, v7); - let a = compress::<N>(compress::<N>(s0, s1), compress::<N>(s2, s3)); - let b = compress::<N>(compress::<N>(s4, s5), compress::<N>(s6, s7)); - let hash_vector = compress::<N>(hash_vector, compress::<N>(a, b)); + hash_vector = compress::<N>(hash_vector, v0); + } gxhash_process_1::<N>(ptr, hash_vector, remaining_bytes - unrollable_blocks_count * VECTOR_SIZE) } From 846a12bd5d3fd20e11c7ed5758c4a8dab4ab08bf Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 23:24:32 +0200 Subject: [PATCH 6/9] Improve blocks consumed test --- src/gxhash/mod.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index 509bf45..0897436 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -135,9 +135,20 @@ mod tests { use rand::Rng; use rstest::rstest; - #[test] - fn all_blocks_are_consumed() { - let mut bytes = [42u8; 1200]; + #[rstest] + #[case(4)] + #[case(16)] + #[case(24)] + #[case(32)] + #[case(56)] + #[case(72)] + #[case(96)] + #[case(160)] + #[case(256)] + #[case(512)] + #[case(1200)] + fn all_blocks_are_consumed(#[case] size: usize) { + let mut bytes = vec![42u8; size]; let ref_hash = gxhash0_32(&bytes, 0);
From 7ccad8cffa8b55abee0b19d83d504fdad139d68f Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Wed, 25 Oct 2023 00:06:17 +0200 Subject: [PATCH 7/9] Add FFI dynamic library --- Cargo.toml | 3 +-- ffi/Cargo.toml | 12 ++++++++++++ ffi/src/lib.rs | 25 +++++++++++++++++++++++++ src/main.rs | 18 ------------------ 4 files changed, 38 insertions(+), 20 deletions(-) create mode 100644 ffi/Cargo.toml create mode 100644 ffi/src/lib.rs delete mode 100644 src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 604e391..78e5fdb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,9 @@ [package] name = "gxhash" +authors = ["Olivier Giniaux"] version = "0.1.0" edition = "2021" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [profile.release] debug = true diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml new file mode 100644 index 0000000..d744be9 --- /dev/null +++ b/ffi/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "gxhash_ffi" +authors = ["Olivier Giniaux"] +version = "0.1.0" +edition = "2021" + +[lib] +name = "gxhash" +crate-type = ["cdylib"] + +[dependencies] +gxhash = { path = "../", default-features = false } \ No newline at end of file diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs new file mode 100644 index 0000000..e271124 --- /dev/null +++ b/ffi/src/lib.rs @@ -0,0 +1,25 @@ +use core::slice; + +#[no_mangle] +pub unsafe extern "C" fn gxhash0_32(buf: *const (), len: usize, seed: i32) -> u32 { + let data: &[u8] = slice::from_raw_parts(buf as *const u8, len); + gxhash::gxhash0_32(data, seed) +} + +#[no_mangle] +pub unsafe extern "C" fn gxhash0_64(buf: *const (), len: usize, seed: i32) -> u64 { + let data: &[u8] = slice::from_raw_parts(buf as *const u8, len); + gxhash::gxhash0_64(data, seed) +} + +#[no_mangle] +pub unsafe extern "C" fn gxhash1_32(buf: *const (), len: usize, seed: i32) -> u32 { + let data: &[u8] = slice::from_raw_parts(buf as *const u8, len); + gxhash::gxhash1_32(data, seed) +} + +#[no_mangle] +pub unsafe extern "C" fn gxhash1_64(buf: *const (), len: usize, seed: i32) -> u64 { + let data: &[u8] = slice::from_raw_parts(buf as *const u8, len); + gxhash::gxhash1_64(data, seed) +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 26961b9..0000000 --- a/src/main.rs +++ /dev/null @@ -1,18 +0,0 @@ -use rand::Rng; - -use gxhash::*; - -fn main() { - - let mut rng = rand::thread_rng(); - let mut random_bytes = [0u8; 16384]; // Create an array of 16 bytes, initialized to 0 - rng.fill(&mut random_bytes[..]); // Fill the array with random bytes - - let mut sum: u32 = 0; - - for _ in 0..100_000_000 { - sum = sum.wrapping_add(gxhash0_32(&random_bytes, 0)); - } - - println!("{}", sum); -} \ No newline at end of file
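Note on patch 7: the `ffi` crate re-exports the four hashes with C linkage (`#[no_mangle] extern "C"`) and builds them into a `cdylib` named `gxhash`, so non-Rust callers can link against it. A hypothetical consumer declaring and calling two of the exports from Rust could look like this (the `link` name being resolvable and the use of `*const u8` for the buffer where the export declares `*const ()` are assumptions of this sketch):

use std::os::raw::c_int;

#[link(name = "gxhash")] // assumes libgxhash is on the linker search path
extern "C" {
    fn gxhash0_64(buf: *const u8, len: usize, seed: c_int) -> u64;
    fn gxhash1_64(buf: *const u8, len: usize, seed: c_int) -> u64;
}

fn main() {
    let data = b"hello world";
    // SAFETY: `data` is valid for `data.len()` bytes for the duration of each call.
    let h0 = unsafe { gxhash0_64(data.as_ptr(), data.len(), 0) };
    let h1 = unsafe { gxhash1_64(data.as_ptr(), data.len(), 0) };
    println!("{h0:016x} {h1:016x}");
}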
From a2e3aaab1a7cd851e4e375a3da4e8a38ee2d6936 Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 29 Oct 2023 00:56:43 +0200 Subject: [PATCH 8/9] Add 128-bit gxhash for x86 --- .cargo/config | 2 + Cargo.toml | 8 ++- benches/fnv.rs | 14 ++++++ benches/throughput.rs | 25 ++++++---- ffi/Cargo.toml | 2 +- src/gxhash/mod.rs | 5 ++ src/gxhash/platform/mod.rs | 13 ++++- src/gxhash/platform/x86_128.rs | 91 ++++++++++++++++++++++++++++++++++ src/gxhash/platform/x86_256.rs | 17 ++++--- 9 files changed, 156 insertions(+), 21 deletions(-) create mode 100644 .cargo/config create mode 100644 benches/fnv.rs create mode 100644 src/gxhash/platform/x86_128.rs diff --git a/.cargo/config b/.cargo/config new file mode 100644 index 0000000..d5135e9 --- /dev/null +++ b/.cargo/config @@ -0,0 +1,2 @@ +[build] +rustflags = ["-C", "target-cpu=native"] \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 78e5fdb..4af710b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,8 +4,11 @@ authors = ["Olivier Giniaux"] version = "0.1.0" edition = "2021" -[profile.release] -debug = true +[features] +# The 256-bit state GxHash is faster for large inputs than the default 128-bit state implementation. +# Please note, however, that the 256-bit GxHash and the 128-bit GxHash don't generate the same hashes for the same input. +# Requires AVX2 and VAES (X86). +256-bit = [] [dependencies] rand = "0.8" [dev-dependencies] rstest = "0.18.2" criterion = { version = "0.5.1" } lazy_static = { version = "1.3" } +# Other hash algorithms, for comparison. ahash = "0.8.3" t1ha = "0.1.0" twox-hash = "1.6.3" diff --git a/benches/fnv.rs b/benches/fnv.rs new file mode 100644 index 0000000..cd4713c --- /dev/null +++ b/benches/fnv.rs @@ -0,0 +1,14 @@ +const INITIAL_STATE: u64 = 0xcbf29ce484222325; +const PRIME: u64 = 0x100000001b3; + +#[inline] +pub const fn fnv_hash(bytes: &[u8]) -> u64 { + let mut hash = INITIAL_STATE; + let mut i = 0; + while i < bytes.len() { + hash = hash ^ (bytes[i] as u64); + hash = hash.wrapping_mul(PRIME); + i += 1; + } + hash +} \ No newline at end of file diff --git a/benches/throughput.rs b/benches/throughput.rs index be5147a..acd3d0b 100644 --- a/benches/throughput.rs +++ b/benches/throughput.rs @@ -1,19 +1,19 @@ -#![feature(build_hasher_simple_hash_one)] - use std::time::Duration; use std::alloc::{alloc, dealloc, Layout}; use std::slice; -use criterion::measurement::{WallTime}; +use criterion::measurement::WallTime; use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput, PlotConfiguration, AxisScale, BenchmarkGroup, BenchmarkId}; -use gxhash::*; use rand::Rng; +use gxhash::*; +mod fnv; + fn benchmark<F>(c: &mut BenchmarkGroup<WallTime>, data: &[u8], name: &str, delegate: F) where F: Fn(&[u8], i32) -> u64 { for i in 1..16 { - let len = usize::pow(2, i); + let len = usize::pow(4, i); c.throughput(Throughput::Bytes(len as u64)); @@ -45,9 +45,9 @@ fn benchmark_all(c: &mut Criterion) { group.plot_config(plot_config); // GxHash0 - benchmark(&mut group, slice, "gxhash0", |data: &[u8], _: i32| -> u64 { - gxhash0_64(data, 0) - }); + // benchmark(&mut group, slice, "gxhash0", |data: &[u8], _: i32| -> u64 { + // gxhash0_64(data, 0) + // }); // GxHash1 benchmark(&mut group, slice, "gxhash1", |data: &[u8], _: i32| -> u64 { @@ -55,9 +55,9 @@ }); // AHash - let build_hasher = ahash::RandomState::with_seeds(0, 0, 0, 0); + let ahash_hasher = ahash::RandomState::with_seeds(0, 0, 0, 0); benchmark(&mut group, slice, "ahash", |data: &[u8], _: i32| -> u64 { - build_hasher.hash_one(data) + ahash_hasher.hash_one(data) }); // T1ha0 @@ -70,6 +70,11 @@ twox_hash::xxh3::hash64_with_seed(data, seed as u64) }); + // FNV-1a + benchmark(&mut group, slice, "fnv-1a", |data: &[u8], _: i32| -> u64 { + fnv::fnv_hash(data) + }); + group.finish(); // Free benchmark data diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index d744be9..c0fdc90 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [lib] name = "gxhash" -crate-type = ["cdylib"] +crate-type = ["cdylib", "staticlib"] [dependencies] gxhash = { path = "../", default-features = false } \ No newline at end of file diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index 0897436..75d57b3 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -30,6 +30,11 @@ pub fn gxhash1_64(input: &[u8], seed: i32) -> u64 { unsafe { let p = &gxhash::<1>(input, seed) as *const state as *const u64; *p + + // An alternative idea is to extract the center, to avoid xoring for the 256-bit state + // let p = &gxhash::<1>(input, seed) as *const state as *const u8; + // let shifted_ptr = p.offset(3) as *const u64; + // *shifted_ptr } } diff --git a/src/gxhash/platform/mod.rs b/src/gxhash/platform/mod.rs index dced5b6..230c2b2 100644 --- a/src/gxhash/platform/mod.rs +++ b/src/gxhash/platform/mod.rs @@ -2,8 +2,19 @@ #[path = "arm_128.rs"] pub mod platform; -#[cfg(target_arch = "x86_64")] +#[cfg(all( + feature = "256-bit", + target_arch = "x86_64", + target_feature = "avx2") +)] #[path = "x86_256.rs"] pub mod platform; 
+#[cfg(all( + not(feature = "256-bit"), + target_arch = "x86_64" +))] +#[path = "x86_128.rs"] +pub mod platform; + pub use platform::*; \ No newline at end of file diff --git a/src/gxhash/platform/x86_128.rs b/src/gxhash/platform/x86_128.rs new file mode 100644 index 0000000..d9f71dc --- /dev/null +++ b/src/gxhash/platform/x86_128.rs @@ -0,0 +1,91 @@ +use core::arch::x86_64::*; +use std::mem::size_of; + +pub type state = __m128i; + +#[inline] +pub unsafe fn create_empty() -> state { + _mm_setzero_si128() +} + +#[inline] +pub unsafe fn prefetch(p: *const state) { + _mm_prefetch(p as *const i8, 3); +} + +#[inline] +pub unsafe fn load_unaligned(p: *const state) -> state { + _mm_loadu_si128(p) +} + +#[inline] +pub unsafe fn get_partial(p: *const state, len: isize) -> state { + let partial_vector: state; + // Safety check + if check_same_page(p) { + let indices = _mm_setr_epi8( + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + ); + + let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + partial_vector = _mm_and_si128(_mm_loadu_si128(p), mask); + } else { + partial_vector = get_partial_safe(p as *const u8, len as usize) + } + // Prevents padded zeroes from introducing bias + _mm_add_epi32(partial_vector, _mm_set1_epi32(len as i32)) +} + +#[inline] +unsafe fn check_same_page(ptr: *const state) -> bool { + let address = ptr as usize; + // Mask to keep only the last 12 bits (the offset within a 4 KiB page) + let offset_within_page = address & 0xFFF; + // Check that a full vector read from the current offset stays within the page + offset_within_page <= (4096 - size_of::<state>() - 1) +} + +#[inline] +unsafe fn get_partial_safe(data: *const u8, len: usize) -> state { + // Temporary buffer filled with zeros + let mut buffer = [0u8; size_of::<state>()]; + // Copy data into the buffer + std::ptr::copy(data, buffer.as_mut_ptr(), len); + // Load the buffer into a __m128i vector + _mm_loadu_si128(buffer.as_ptr() as *const state) +} + +#[inline] +#[allow(overflowing_literals)] +pub unsafe fn compress_1(a: state, b: state) -> state { + let keys_1 = _mm_set_epi32(0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542); + let keys_2 = _mm_set_epi32(0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9); + + // 2+1 rounds of AES for compression + let mut b = _mm_aesenc_si128(b, keys_1); + b = _mm_aesenc_si128(b, keys_2); + return _mm_aesenclast_si128(a, b); +} + +#[inline] +#[allow(overflowing_literals)] +pub unsafe fn compress_0(a: state, b: state) -> state { + return _mm_aesenc_si128(a, b); +} + +#[inline] +#[allow(overflowing_literals)] +pub unsafe fn finalize(hash: state, seed: i32) -> state { + // Hardcoded AES keys + let keys_1 = _mm_set_epi32(0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85); + let keys_2 = _mm_set_epi32(0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F); + let keys_3 = _mm_set_epi32(0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32); + + // 4 rounds of AES + let mut hash = _mm_aesenc_si128(hash, _mm_set1_epi32(seed)); + hash = _mm_aesenc_si128(hash, keys_1); + hash = _mm_aesenc_si128(hash, keys_2); + hash = _mm_aesenclast_si128(hash, keys_3); + + hash +} \ No newline at end of file diff --git a/src/gxhash/platform/x86_256.rs b/src/gxhash/platform/x86_256.rs index 2f557f2..c5ae5ac 100644 --- a/src/gxhash/platform/x86_256.rs +++ b/src/gxhash/platform/x86_256.rs @@ -10,7 +10,7 @@ pub unsafe fn create_empty() -> state { #[inline] pub unsafe fn prefetch(p: *const state) { - _mm_prefetch(p as *const i8, 3); + //_mm_prefetch(p as *const i8, 3); } #[inline] @@ -24,10 +24,7 @@ pub unsafe fn get_partial(p: *const state, len: isize) -> state { 
// Safety check if check_same_page(p) { let indices = _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31 + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); let mask = _mm256_cmpgt_epi8(_mm256_set1_epi8(len as i8), indices); @@ -36,7 +33,7 @@ pub unsafe fn get_partial(p: *const state, len: isize) -> state { partial_vector = get_partial_safe(p as *const u8, len as usize) } // Prevents padded zeroes from introducing bias - _mm256_add_epi32(partial_vector, _mm256_set1_epi32(len as i32)) + _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8)) } @@ -60,7 +57,7 @@ unsafe fn get_partial_safe(data: *const u8, len: usize) -> state { #[inline] #[allow(overflowing_literals)] -pub unsafe fn compress(a: state, b: state) -> state { +pub unsafe fn compress_1(a: state, b: state) -> state { let keys_1 = _mm256_set_epi32(0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542, 0x4155EE07, 0xC897CCE2, 0x780AF2C3, 0x8A72B781); let keys_2 = _mm256_set_epi32(0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9, 0x7A83D76B, 0xB1E8F9F0, 0x028925A8, 0x3B9A4E71); @@ -70,6 +67,12 @@ pub unsafe fn compress_1(a: state, b: state) -> state { return _mm256_aesenclast_epi128(a, b); } +#[inline] +#[allow(overflowing_literals)] +pub unsafe fn compress_0(a: state, b: state) -> state { + return _mm256_aesenc_epi128(a, b); +} + #[inline] #[allow(overflowing_literals)] pub unsafe fn finalize(hash: state, seed: i32) -> state {
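Note on patch 8: `check_same_page` is what makes the branchless masked load in `get_partial` sound. Reading a full 16-byte `__m128i` at `p` may touch up to 15 bytes past the end of the input; that is guaranteed not to fault only if those bytes lie on the same page as `p`, hence the test on the pointer's offset within its page (`address & 0xFFF`, assuming 4 KiB pages). The patch keeps an extra `- 1` of headroom, which is slightly conservative but safe; anything past the threshold falls back to `get_partial_safe`, which copies through a zeroed stack buffer. A scalar sketch of the decision logic:

const PAGE_SIZE: usize = 4096; // assumption: 4 KiB pages, as in the patch
const VECTOR_SIZE: usize = 16; // size of __m128i

/// True when a full 16-byte load at `addr` cannot cross a page boundary,
/// mirroring `check_same_page` (including its conservative extra `- 1`).
fn full_load_is_safe(addr: usize) -> bool {
    let offset_within_page = addr & (PAGE_SIZE - 1);
    offset_within_page <= PAGE_SIZE - VECTOR_SIZE - 1
}

fn main() {
    assert!(full_load_is_safe(0x1000));  // start of a page
    assert!(full_load_is_safe(0x1FEF));  // offset 4079: last accepted position
    assert!(!full_load_is_safe(0x1FF1)); // offset 4081: the load would cross pages
    // Offset 4080 would still be in-page (4080 + 16 = 4096), but the
    // extra `- 1` rejects it and routes it through the safe copy path.
    assert!(!full_load_is_safe(0x1FF0));
}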
From 5032fa439205f84daa9ae99d6e24951f8dc07512 Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 29 Oct 2023 01:08:41 +0200 Subject: [PATCH 9/9] Add highwayhash to benchmark --- Cargo.toml | 1 + benches/throughput.rs | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4af710b..42ac185 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ lazy_static = { version = "1.3" } ahash = "0.8.3" t1ha = "0.1.0" twox-hash = "1.6.3" +highway = "1.1.0" [[bench]] name = "throughput" diff --git a/benches/throughput.rs b/benches/throughput.rs index acd3d0b..48f9609 100644 --- a/benches/throughput.rs +++ b/benches/throughput.rs @@ -12,20 +12,19 @@ fn benchmark<F>(c: &mut BenchmarkGroup<WallTime>, data: &[u8], name: &str, delegate: F) where F: Fn(&[u8], i32) -> u64 { - for i in 1..16 { + for i in 1.. { let len = usize::pow(4, i); + if len > data.len() { + break; + } c.throughput(Throughput::Bytes(len as u64)); - let aligned_slice = &data[0..len]; - c.bench_with_input(BenchmarkId::new(name, len), aligned_slice, |bencher, input| { + let slice = &data[0..len]; // Aligned + // let slice = &data[1..len]; // Unaligned + c.bench_with_input(BenchmarkId::new(name, len), slice, |bencher, input| { bencher.iter(|| black_box(delegate(input, 0))) }); - - // let unaligned_slice = &slice[1..len]; - // group.bench_with_input(format!("{} bytes (unaligned)", len), unaligned_slice, |bencher, input| { - // bencher.iter(|| black_box(gxhash(input))) - // }); } } @@ -50,7 +49,7 @@ fn benchmark_all(c: &mut Criterion) { // }); // GxHash1 - benchmark(&mut group, slice, "gxhash1", |data: &[u8], _: i32| -> u64 { + benchmark(&mut group, slice, "gxhash", |data: &[u8], _: i32| -> u64 { gxhash1_64(data, 0) }); @@ -70,6 +69,12 @@ fn benchmark_all(c: &mut Criterion) { twox_hash::xxh3::hash64_with_seed(data, seed as u64) }); + // HighwayHash + benchmark(&mut group, slice, "highwayhash", |data: &[u8], _: i32| -> u64 { + use highway::{HighwayHasher, HighwayHash}; + HighwayHasher::default().hash64(data) + }); + // FNV-1a benchmark(&mut group, slice, "fnv-1a", |data: &[u8], _: i32| -> u64 { fnv::fnv_hash(data)