From 1e9e5a2cbf9b5ba43730cb634ff4b913b0e16324 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Oct 2024 16:02:36 -0400 Subject: [PATCH] Add better documentation, examples and builer-style API to `ByteView` (#6479) * Add better documentation, examples and builer-style API to `ByteView` * simplify tests * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Moar inline * Update arrow-data/src/byte_view.rs Co-authored-by: Xiangpeng Hao * fmt * Update test so it fails at the right place --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Xiangpeng Hao --- arrow-array/src/array/byte_view_array.rs | 29 +++++----- arrow-data/src/byte_view.rs | 72 +++++++++++++++++++++++- 2 files changed, 84 insertions(+), 17 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index c53478d8b057..b1b5580577ab 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -44,8 +44,11 @@ use super::ByteArrayType; /// /// # See Also /// -/// See [`StringViewArray`] for storing utf8 encoded string data and -/// [`BinaryViewArray`] for storing bytes. +/// * [`StringViewArray`] for storing utf8 encoded string data +/// * [`BinaryViewArray`] for storing bytes +/// * [`ByteView`] to interpret `u128`s layout of the views. +/// +/// [`ByteView`]: arrow_data::ByteView /// /// # Notes /// @@ -872,12 +875,9 @@ mod tests { #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")] fn new_with_invalid_view_data() { let v = "large payload over 12 bytes"; - let view = ByteView { - length: 13, - prefix: u32::from_le_bytes(v.as_bytes()[0..4].try_into().unwrap()), - buffer_index: 3, - offset: 1, - }; + let view = ByteView::new(13, &v.as_bytes()[0..4]) + .with_buffer_index(3) + .with_offset(1); let views = ScalarBuffer::from(vec![view.into()]); let buffers = vec![Buffer::from_slice_ref(v)]; StringViewArray::new(views, buffers, None); @@ -888,13 +888,12 @@ mod tests { expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0" )] fn new_with_invalid_utf8_data() { - let v: Vec = vec![0xf0, 0x80, 0x80, 0x80]; - let view = ByteView { - length: v.len() as u32, - prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), - buffer_index: 0, - offset: 0, - }; + let v: Vec = vec![ + // invalid UTF8 + 0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + let view = ByteView::new(v.len() as u32, &v[0..4]); let views = ScalarBuffer::from(vec![view.into()]); let buffers = vec![Buffer::from_slice_ref(v)]; StringViewArray::new(views, buffers, None); diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index 6f6d6d175689..3b3ec6246066 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -21,8 +21,40 @@ use arrow_schema::ArrowError; /// Helper to access views of [`GenericByteViewArray`] (`StringViewArray` and /// `BinaryViewArray`) where the length is greater than 12 bytes. /// -/// See the documentation on [`GenericByteViewArray`] for more information on -/// the layout of the views. +/// See Also: +/// * [`GenericByteViewArray`] for more information on the layout of the views. +/// * [`validate_binary_view`] and [`validate_string_view`] to validate +/// +/// # Example: Create a new u128 view +/// +/// ```rust +/// # use arrow_data::ByteView;; +/// // Create a view for a string of length 20 +/// // first four bytes are "Rust" +/// // stored in buffer 3 +/// // at offset 42 +/// let prefix = "Rust"; +/// let view = ByteView::new(20, prefix.as_bytes()) +/// .with_buffer_index(3) +/// .with_offset(42); +/// +/// // create the final u128 +/// let v = view.as_u128(); +/// assert_eq!(v, 0x2a000000037473755200000014); +/// ``` +/// +/// # Example: decode a `u128` into its constituent fields +/// ```rust +/// # use arrow_data::ByteView; +/// // Convert a u128 to a ByteView +/// // See validate_{string,binary}_view functions to validate +/// let v = ByteView::from(0x2a000000037473755200000014); +/// +/// assert_eq!(v.length, 20); +/// assert_eq!(v.prefix, 0x74737552); +/// assert_eq!(v.buffer_index, 3); +/// assert_eq!(v.offset, 42); +/// ``` /// /// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html #[derive(Debug, Copy, Clone, Default)] @@ -39,6 +71,42 @@ pub struct ByteView { } impl ByteView { + /// Construct a [`ByteView`] for data `length` of bytes with the specified prefix. + /// + /// See example on [`ByteView`] docs + /// + /// Notes: + /// * the length should always be greater than 12 (Data less than 12 + /// bytes is stored as an inline view) + /// * buffer and offset are set to `0` + /// + /// # Panics + /// If the prefix is not exactly 4 bytes + #[inline] + pub fn new(length: u32, prefix: &[u8]) -> Self { + debug_assert!(length > 12); + Self { + length, + prefix: u32::from_le_bytes(prefix.try_into().unwrap()), + buffer_index: 0, + offset: 0, + } + } + + /// Set the [`Self::buffer_index`] field + #[inline] + pub fn with_buffer_index(mut self, buffer_index: u32) -> Self { + self.buffer_index = buffer_index; + self + } + + /// Set the [`Self::offset`] field + #[inline] + pub fn with_offset(mut self, offset: u32) -> Self { + self.offset = offset; + self + } + #[inline(always)] /// Convert `ByteView` to `u128` by concatenating the fields pub fn as_u128(self) -> u128 {