diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index bb0fb5e91be2..d71542c1b714 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -195,7 +195,16 @@ where K: ArrowDictionaryKeyType, T: ByteArrayType, { - fn get_or_insert_key(&mut self, value: impl AsRef) -> Result { + /// Only insert value **without inserting key** and get the key of that value (value index) + /// + /// This should be used for user optimization + /// + /// Returns an error if the new index would overflow the key type. + /// + /// # Safety + /// This function is unsafe as calling this without calling one of the `append_key` functions will result in an undefined behavior + /// + pub unsafe fn get_or_insert_key(&mut self, value: impl AsRef) -> Result { let value_native: &T::Native = value.as_ref(); let value_bytes: &[u8] = value_native.as_ref(); @@ -222,15 +231,68 @@ where Ok(key) } + + /// Only insert key that uses a value in a specific index + /// + /// This should be used for user optimization + /// + /// # Panic + /// this will panic if the value index is out of bounds + /// + pub fn append_key_for_existing_value(&mut self, value_index: K::Native) { + assert!(value_index.as_usize() < self.values_builder.len(), "value_index is outside values bound"); + unsafe { + self.append_key_for_existing_value_unchecked(value_index); + } + } + + /// Only insert key that uses a value in a specific index without checking if `value_index` is in the bounds + /// + /// This should be used for user optimization + /// + /// # Safety + /// The user must ensure that the value index is valid + /// + pub unsafe fn append_key_for_existing_value_unchecked(&mut self, value_index: K::Native) { + self.keys_builder.append_value(value_index); + } + + /// Only insert key that uses a value in a specific index N times + /// + /// This should be used for user optimization + /// + /// # Panic + /// this will panic if the value index is out of bounds + /// + pub fn append_key_n_for_existing_value(&mut self, value_index: K::Native, count: usize) { + assert!(value_index.as_usize() < self.values_builder.len(), "value_index is outside values bound"); + unsafe { + self.append_key_n_for_existing_value_unchecked(value_index, count); + } + } + + /// Only insert key that uses a value in a specific index `N` times without checking if `value_index` is in the bounds + /// + /// This should be used for user optimization + /// + /// # Safety + /// The user must ensure that the value index is valid + /// + pub unsafe fn append_key_n_for_existing_value_unchecked(&mut self, value_index: K::Native, count: usize) { + self.keys_builder.append_value_n(value_index, count); + } + /// Append a value to the array. Return an existing index /// if already present in the values array or a new index if the /// value is appended to the values array. /// /// Returns an error if the new index would overflow the key type. pub fn append(&mut self, value: impl AsRef) -> Result { - let key = self.get_or_insert_key(value)?; - self.keys_builder.append_value(key); - Ok(key) + unsafe { + let key = self.get_or_insert_key(value)?; + self.append_key_for_existing_value_unchecked(key); + Ok(key) + } } /// Append a value multiple times to the array. @@ -242,9 +304,11 @@ where value: impl AsRef, count: usize, ) -> Result { - let key = self.get_or_insert_key(value)?; - self.keys_builder.append_value_n(key, count); - Ok(key) + unsafe { + let key = self.get_or_insert_key(value)?; + self.append_key_n_for_existing_value_unchecked(key, count); + Ok(key) + } } /// Infallibly append a value to this builder @@ -446,7 +510,7 @@ mod tests { use crate::array::Int8Array; use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; - use crate::{BinaryArray, StringArray}; + use crate::{BinaryArray, GenericStringArray, StringArray}; fn test_bytes_dictionary_builder(values: Vec<&T::Native>) where @@ -664,4 +728,44 @@ mod tests { assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]); assert_eq!(dict.values().len(), 4); } + + #[test] + #[should_panic(expected = "value_index is outside values bound")] + fn append_key_for_existing_value_should_panic_on_invalid_index() { + let mut builder = + GenericByteDictionaryBuilder::>::with_capacity(257, 257, 257); + + builder.append_key_for_existing_value(0); + } + + #[test] + #[should_panic(expected = "value_index is outside values bound")] + fn append_key_n_for_existing_value_should_panic_on_invalid_index() { + let mut builder = + GenericByteDictionaryBuilder::>::with_capacity(257, 257, 257); + + builder.append_key_n_for_existing_value(0, 2); + } + + #[test] + fn manual_appending() { + let mut builder = + GenericByteDictionaryBuilder::>::with_capacity(257, 257, 257); + + let value = "123"; + let key = unsafe { builder.get_or_insert_key(value).unwrap() }; + builder.append_key_for_existing_value(key); + + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [Some(value)]); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index ac40f8a469d3..3ba915c268df 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -209,8 +209,18 @@ where K: ArrowDictionaryKeyType, V: ArrowPrimitiveType, { + + /// Only insert value **without inserting key** and get the key of that value (value index) + /// + /// This should be used for user optimization + /// + /// Returns an error if the new index would overflow the key type. + /// + /// # Safety + /// This function is unsafe as calling this without calling one of the `append_key` functions will result in an undefined behavior + /// #[inline] - fn get_or_insert_key(&mut self, value: V::Native) -> Result { + pub unsafe fn get_or_insert_key(&mut self, value: V::Native) -> Result { match self.map.get(&Value(value)) { Some(&key) => { Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?) @@ -224,14 +234,66 @@ where } } + /// Only insert key that uses a value in a specific index + /// + /// This should be used for user optimization + /// + /// # Panic + /// this will panic if the value index is out of bounds + /// + pub fn append_key_for_existing_value(&mut self, value_index: K::Native) { + assert!(value_index.as_usize() < self.values_builder.len(), "value_index is outside values bound"); + unsafe { + self.append_key_for_existing_value_unchecked(value_index); + } + } + + /// Only insert key that uses a value in a specific index without checking if `value_index` is in the bounds + /// + /// This should be used for user optimization + /// + /// # Safety + /// The user must ensure that the value index is valid + /// + pub unsafe fn append_key_for_existing_value_unchecked(&mut self, value_index: K::Native) { + self.keys_builder.append_value(value_index); + } + + /// Only insert key that uses a value in a specific index N times + /// + /// This should be used for user optimization + /// + /// # Panic + /// this will panic if the value index is out of bounds + /// + pub fn append_key_n_for_existing_value(&mut self, value_index: K::Native, count: usize) { + assert!(value_index.as_usize() < self.values_builder.len(), "value_index is outside values bound"); + unsafe { + self.append_key_n_for_existing_value_unchecked(value_index, count); + } + } + + /// Only insert key that uses a value in a specific index `N` times without checking if `value_index` is in the bounds + /// + /// This should be used for user optimization + /// + /// # Safety + /// The user must ensure that the value index is valid + /// + pub unsafe fn append_key_n_for_existing_value_unchecked(&mut self, value_index: K::Native, count: usize) { + self.keys_builder.append_value_n(value_index, count); + } + /// Append a primitive value to the array. Return an existing index /// if already present in the values array or a new index if the /// value is appended to the values array. #[inline] pub fn append(&mut self, value: V::Native) -> Result { - let key = self.get_or_insert_key(value)?; - self.keys_builder.append_value(key); - Ok(key) + unsafe { + let key = self.get_or_insert_key(value)?; + self.append_key_for_existing_value_unchecked(key); + Ok(key) + } } /// Append a value multiple times to the array. @@ -239,9 +301,11 @@ where /// /// Returns an error if the new index would overflow the key type. pub fn append_n(&mut self, value: V::Native, count: usize) -> Result { - let key = self.get_or_insert_key(value)?; - self.keys_builder.append_value_n(key, count); - Ok(key) + unsafe { + let key = self.get_or_insert_key(value)?; + self.append_key_n_for_existing_value_unchecked(key, count); + Ok(key) + } } /// Infallibly append a value to this builder @@ -443,4 +507,44 @@ mod tests { ) ); } + + #[test] + #[should_panic(expected = "value_index is outside values bound")] + fn append_key_for_existing_value_should_panic_on_invalid_index() { + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(257, 257); + + builder.append_key_for_existing_value(0); + } + + #[test] + #[should_panic(expected = "value_index is outside values bound")] + fn append_key_n_for_existing_value_should_panic_on_invalid_index() { + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(257, 257); + + builder.append_key_n_for_existing_value(0, 2); + } + + #[test] + fn manual_appending() { + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(257, 257); + + let value = 123; + let key = unsafe { builder.get_or_insert_key(value).unwrap() }; + builder.append_key_for_existing_value(key); + + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [Some(value)]); + } }