-
Notifications
You must be signed in to change notification settings - Fork 833
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
impl std::fmt::Write for StringViewBuilder / BinaryViewBuilder #6777
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,20 +15,18 @@ | |
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
use std::any::Any; | ||
use std::marker::PhantomData; | ||
use std::sync::Arc; | ||
|
||
use crate::builder::ArrayBuilder; | ||
use crate::types::bytes::ByteArrayNativeType; | ||
use crate::types::{BinaryViewType, ByteViewType, StringViewType}; | ||
use crate::{ArrayRef, GenericByteViewArray}; | ||
use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; | ||
use arrow_data::ByteView; | ||
use arrow_schema::ArrowError; | ||
use hashbrown::hash_table::Entry; | ||
use hashbrown::HashTable; | ||
|
||
use crate::builder::ArrayBuilder; | ||
use crate::types::bytes::ByteArrayNativeType; | ||
use crate::types::{BinaryViewType, ByteViewType, StringViewType}; | ||
use crate::{ArrayRef, GenericByteViewArray}; | ||
use std::any::Any; | ||
use std::marker::PhantomData; | ||
use std::sync::Arc; | ||
|
||
const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB | ||
const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB | ||
|
@@ -88,6 +86,9 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> { | |
/// map `<string hash> -> <index to the views>` | ||
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>, | ||
phantom: PhantomData<T>, | ||
/// temporary space used by [`ByteViewFormatter`] | ||
/// to avoid extra allocations | ||
temp_buffer: Option<String>, | ||
} | ||
|
||
impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { | ||
|
@@ -108,6 +109,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { | |
}, | ||
string_tracker: None, | ||
phantom: Default::default(), | ||
temp_buffer: None, | ||
} | ||
} | ||
|
||
|
@@ -281,7 +283,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { | |
/// - String length exceeds `u32::MAX` | ||
#[inline] | ||
pub fn append_value(&mut self, value: impl AsRef<T::Native>) { | ||
let v: &[u8] = value.as_ref().as_ref(); | ||
// SAFETY: the value is a valid native type (e.g. `&str` or `&[u8]`) | ||
unsafe { self.append_bytes(value.as_ref().as_ref()) } | ||
} | ||
|
||
/// Appends a &str into the builder | ||
/// | ||
/// # Panics | ||
/// | ||
/// Panics if | ||
/// - String buffer count exceeds `u32::MAX` | ||
/// - String length exceeds `u32::MAX` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You would likely run out of stack space first :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe heap space 🤔 |
||
#[inline] | ||
pub fn append_str(&mut self, value: &str) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There may be a fancier way to write the generics to avoid introdicing a |
||
// SAFETY: UTF8 bytes are valid for both string and binary | ||
unsafe { self.append_bytes(value.as_bytes()) } | ||
} | ||
|
||
/// Appends bytes into this builder. | ||
/// | ||
/// Safety: caller must ensure value is a valid native type (e.g. valid | ||
/// UTF-8 for `StringViewType`). | ||
unsafe fn append_bytes(&mut self, v: &[u8]) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could potentially mark this as |
||
let length: u32 = v.len().try_into().unwrap(); | ||
if length <= 12 { | ||
let mut view_buffer = [0; 16]; | ||
|
@@ -406,6 +429,12 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { | |
}; | ||
buffer_size + in_progress + tracker + views + null | ||
} | ||
|
||
/// Return a structure that implements `std::fmt::Write` to write strings | ||
/// directly to the builder. See example on [`StringViewBuilder`] | ||
pub fn formatter(&mut self) -> ByteViewFormatter<'_, T> { | ||
ByteViewFormatter::new(self) | ||
} | ||
} | ||
|
||
impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> { | ||
|
@@ -466,29 +495,69 @@ impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>> | |
/// Array builder for [`StringViewArray`][crate::StringViewArray] | ||
/// | ||
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with | ||
/// [`GenericByteViewBuilder::append_null`] as normal. | ||
/// [`GenericByteViewBuilder::append_null`]. | ||
/// | ||
/// # Example | ||
/// Calling [`Self::formatter`] returns a object that implements | ||
/// [`std::fmt::Write`] which allows using [`std::fmt::Display`] with standard | ||
/// Rust idioms like `write!` and `writeln!` to write data directly to the | ||
/// builder without additional intermediate allocations. | ||
/// | ||
/// # Example writing strings with `append_value` | ||
/// ``` | ||
/// # use arrow_array::builder::StringViewBuilder; | ||
/// # use arrow_array::StringViewArray; | ||
/// let mut builder = StringViewBuilder::new(); | ||
/// builder.append_value("hello"); | ||
/// builder.append_null(); | ||
/// builder.append_value("world"); | ||
/// builder.append_value("hello"); // row 0 is a value | ||
/// builder.append_null(); // row 1 is null | ||
/// builder.append_value("world"); // row 2 is a value | ||
/// let array = builder.finish(); | ||
/// | ||
/// let expected = vec![Some("hello"), None, Some("world")]; | ||
/// let actual: Vec<_> = array.iter().collect(); | ||
/// assert_eq!(expected, actual); | ||
/// ``` | ||
/// | ||
/// /// # Example incrementally writing strings using [`Self::formatter`] and `write!` | ||
/// ``` | ||
/// # use std::fmt::Write; | ||
/// # use arrow_array::builder::StringViewBuilder; | ||
/// # use arrow_array::Array; | ||
/// let mut builder = StringViewBuilder::new(); | ||
/// // Write data in multiple `write!` calls | ||
/// let mut formatter = builder.formatter(); | ||
/// write!(formatter, "foo").unwrap(); | ||
/// write!(formatter, "bar").unwrap(); | ||
/// // Dropping the formatter writes the value to the builder | ||
/// drop(formatter); | ||
/// | ||
/// // Write second value with a single write call. | ||
/// // This does not require any new allocations. | ||
/// let mut formatter = builder.formatter(); | ||
/// write!(formatter, "v2").unwrap(); | ||
/// // finish the value by dropping the formatter | ||
/// drop(formatter); | ||
/// | ||
/// // Note that simply creating a formatter creates a new value | ||
/// // even if no data is written | ||
/// let mut formatter = builder.formatter(); | ||
/// drop(formatter); | ||
/// | ||
/// let array = builder.finish(); | ||
/// assert_eq!(array.value(0), "foobar"); | ||
/// assert_eq!(array.value(1), "v2"); | ||
/// assert_eq!(array.value(2), ""); | ||
/// assert_eq!(array.len(), 3); | ||
/// ``` | ||
pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>; | ||
|
||
/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] | ||
/// | ||
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with | ||
/// [`GenericByteViewBuilder::append_null`] as normal. | ||
/// | ||
/// Similarly to [`StringViewBuilder`], calling [`Self::formatter`] returns a | ||
/// object that implements [`std::fmt::Write`]. | ||
/// | ||
/// # Example | ||
/// ``` | ||
/// # use arrow_array::builder::BinaryViewBuilder; | ||
|
@@ -504,6 +573,23 @@ pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>; | |
/// assert_eq!(expected, actual); | ||
/// ``` | ||
/// | ||
/// # Example incrementally writing strings using [`Self::formatter`] and `write!` | ||
/// (See [`StringViewBuilder`] for more details) | ||
/// ``` | ||
/// # use std::fmt::Write; | ||
/// # use arrow_array::builder::BinaryViewBuilder; | ||
/// # use arrow_array::Array; | ||
/// let mut builder = BinaryViewBuilder::new(); | ||
/// // Write data in multiple `write!` calls | ||
/// let mut formatter = builder.formatter(); | ||
/// write!(formatter, "foo").unwrap(); | ||
/// write!(formatter, "bar").unwrap(); | ||
/// // Dropping the formatter writes the value to the builder | ||
/// drop(formatter); | ||
/// | ||
/// let array = builder.finish(); | ||
/// assert_eq!(array.value(0), b"foobar"); | ||
/// ``` | ||
pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>; | ||
|
||
/// Creates a view from a fixed length input (the compiler can generate | ||
|
@@ -553,6 +639,56 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { | |
} | ||
} | ||
|
||
/// See [GenericByteViewBuilder::formatter] for more details | ||
#[derive(Debug)] | ||
pub struct ByteViewFormatter<'a, T> | ||
where | ||
T: ByteViewType + ?Sized, | ||
{ | ||
/// reference to the builder | ||
inner: &'a mut GenericByteViewBuilder<T>, | ||
/// buffer where the in progress string is written | ||
buffer: String, | ||
} | ||
|
||
impl<'a, T> ByteViewFormatter<'a, T> | ||
where | ||
T: ByteViewType + ?Sized, | ||
{ | ||
fn new(inner: &'a mut GenericByteViewBuilder<T>) -> Self { | ||
let buffer = match std::mem::take(&mut inner.temp_buffer) { | ||
Some(mut buffer) => { | ||
buffer.clear(); | ||
buffer | ||
} | ||
None => String::new(), | ||
}; | ||
|
||
Self { inner, buffer } | ||
} | ||
} | ||
|
||
impl<'a, T> std::fmt::Write for ByteViewFormatter<'a, T> | ||
where | ||
T: ByteViewType + ?Sized, | ||
{ | ||
fn write_str(&mut self, s: &str) -> std::fmt::Result { | ||
self.buffer.write_str(s) | ||
} | ||
} | ||
|
||
/// When a StringViewWriter is dropped, it writes the value to the StringViewBuilder | ||
impl<'a, T> Drop for ByteViewFormatter<'a, T> | ||
where | ||
T: ByteViewType + ?Sized, | ||
{ | ||
fn drop(&mut self) { | ||
self.inner.append_str(&self.buffer); | ||
// save the buffer to avoid an allocation on subsequent writes | ||
self.inner.temp_buffer = Some(std::mem::take(&mut self.buffer)); | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use core::str; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This implementation allocates a single String which is used for all
write!
calls to the same StringViewBuilder, which should keep allocations to a mininum.However, this implementation will copy the strings twice:
String
For short strings (less than 12 bytes) this copy is likely required anyways (unless we are manipulating the view buffer directly)
For longer strings (greater than 12 bytes), we could avoid the extra copy into the
String
buffer by writing directly into the underlying buffer.I actually tried to avoid the second copy, but found it was quite complex due to:
Thus I went with the simple approach here which should get most of the benefits, and figured we can optimize further if needed.