diff --git a/src/common/src/array/data_chunk.rs b/src/common/src/array/data_chunk.rs index 18ad353ec1b80..22d5bb55711ce 100644 --- a/src/common/src/array/data_chunk.rs +++ b/src/common/src/array/data_chunk.rs @@ -344,18 +344,20 @@ impl DataChunk { Ok(outputs) } - /// Compute hash values for each row. + /// Compute hash values for each row. The number of returned `HashCode`s is `self.capacity()`. + /// Invisible rows are skipped while hashing, so the `HashCode` of an invisible row is arbitrary. pub fn get_hash_values( &self, column_idxes: &[usize], hasher_builder: H, ) -> Vec> { - let mut states = Vec::with_capacity(self.capacity()); - states.resize_with(self.capacity(), || hasher_builder.build_hasher()); + let len = self.capacity(); + let mut states = Vec::with_capacity(len); + states.resize_with(len, || hasher_builder.build_hasher()); // Compute hash for the specified columns. for column_idx in column_idxes { let array = self.column_at(*column_idx); - array.hash_vec(&mut states[..]); + array.hash_vec(&mut states[..], self.visibility()); } finalize_hashers(&states[..]) .into_iter() diff --git a/src/common/src/array/mod.rs b/src/common/src/array/mod.rs index f1012782bf9a6..fd8d408d5f813 100644 --- a/src/common/src/array/mod.rs +++ b/src/common/src/array/mod.rs @@ -281,10 +281,10 @@ pub trait Array: } } - fn hash_vec(&self, hashers: &mut [H]) { + fn hash_vec(&self, hashers: &mut [H], vis: &Bitmap) { assert_eq!(hashers.len(), self.len()); - for (idx, state) in hashers.iter_mut().enumerate() { - self.hash_at(idx, state); + for idx in vis.iter_ones() { + self.hash_at(idx, &mut hashers[idx]); } } @@ -554,8 +554,8 @@ impl ArrayImpl { dispatch_array_variants!(self, inner, { inner.hash_at(idx, state) }) } - pub fn hash_vec(&self, hashers: &mut [H]) { - dispatch_array_variants!(self, inner, { inner.hash_vec(hashers) }) + pub fn hash_vec(&self, hashers: &mut [H], vis: &Bitmap) { + dispatch_array_variants!(self, inner, { inner.hash_vec(hashers, vis) }) } /// Select some elements 
from `Array` based on `visibility` bitmap. @@ -711,6 +711,7 @@ mod test_util { use std::hash::{BuildHasher, Hasher}; use super::Array; + use crate::buffer::Bitmap; use crate::util::iter_util::ZipEqFast; pub fn hash_finish(hashers: &[H]) -> Vec { @@ -732,8 +733,9 @@ mod test_util { arr.hash_at(i, state) } }); + let vis = Bitmap::ones(len); arrs.iter() - .for_each(|arr| arr.hash_vec(&mut states_vec[..])); + .for_each(|arr| arr.hash_vec(&mut states_vec[..], &vis)); itertools::cons_tuples( expects .iter() diff --git a/src/common/src/hash/key_v2.rs b/src/common/src/hash/key_v2.rs index 227944d07b3bc..6d64106d556d2 100644 --- a/src/common/src/hash/key_v2.rs +++ b/src/common/src/hash/key_v2.rs @@ -318,14 +318,9 @@ impl HashKey for HashKeyImpl { // Dispatch types once to accelerate the inner call. dispatch_array_variants!(array, array, { - for ((scalar, visible), serializer) in array - .iter() - .zip_eq_fast(data_chunk.visibility().iter()) - .zip_eq_fast(&mut serializers) - { - if visible { - serializer.serialize(scalar); - } + for i in data_chunk.visibility().iter_ones() { + // SAFETY(value_at_unchecked): the idx is always in bound. + unsafe { serializers[i].serialize(array.value_at_unchecked(i)) } } }); } @@ -382,22 +377,16 @@ impl DataChunk { } }) } - - let mut sizes = self - .visibility() - .iter() - .map(|visible| if visible { exact_size } else { 0 }) - .collect_vec(); + let mut sizes = vec![exact_size; self.capacity()]; for i in estimated_column_indices { dispatch_array_variants!(&*self.columns()[i], col, { - for ((datum, visible), size) in col - .iter() - .zip_eq_fast(self.visibility().iter()) - .zip_eq_fast(&mut sizes) - { - if visible && let Some(scalar) = datum { - *size += HashKeySer::estimated_size(scalar); + for i in self.visibility().iter_ones() { + // SAFETY(value_at_unchecked): the idx is always in bound. + unsafe { + if let Some(scalar) = col.value_at_unchecked(i) { + sizes[i] += HashKeySer::estimated_size(scalar); + } } } })