From fc05eca09b57f1a6dc8fa6e8e1390fd77327e8f7 Mon Sep 17 00:00:00 2001 From: Kikkon <19528375+Kikkon@users.noreply.github.com> Date: Wed, 2 Oct 2024 04:05:28 +0800 Subject: [PATCH] Add `GenericListViewArray` for `ListView` & `LargeListView` (#5723) * Add `GenericListViewArray` for `ListView` & `LargeListView` * rebase master * fix lint * remove rom_iter_primitive * fix fixed array * lint * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Weston Pace * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Weston Pace * fix comment * cargo fmt * fix: check size * chore:remove unuse comment * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Matthijs Brobbel * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Matthijs Brobbel * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Matthijs Brobbel * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Matthijs Brobbel * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Matthijs Brobbel * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Matthijs Brobbel * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Matthijs Brobbel * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Matthijs Brobbel * Update arrow-array/src/array/list_view_array.rs Co-authored-by: Matthijs Brobbel * chore: add comment * chore: add comment --------- Co-authored-by: Weston Pace Co-authored-by: Matthijs Brobbel --- arrow-array/src/array/list_view_array.rs | 1035 ++++++++++++++++++++++ arrow-array/src/array/mod.rs | 12 + arrow-array/src/iterator.rs | 5 +- 3 files changed, 1050 insertions(+), 2 deletions(-) create mode 100644 arrow-array/src/array/list_view_array.rs diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs new file mode 100644 index 000000000000..9b9078c23772 --- /dev/null +++ b/arrow-array/src/array/list_view_array.rs @@ -0,0 +1,1035 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_buffer::{NullBuffer, ScalarBuffer}; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, FieldRef}; +use std::any::Any; +use std::ops::Add; +use std::sync::Arc; + +use crate::array::{make_array, print_long_array}; +use crate::iterator::GenericListViewArrayIter; +use crate::{new_empty_array, Array, ArrayAccessor, ArrayRef, FixedSizeListArray, OffsetSizeTrait}; + +/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i32`. +pub type ListViewArray = GenericListViewArray; + +/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i64`. +pub type LargeListViewArray = GenericListViewArray; + +/// +/// Different from [`crate::GenericListArray`] as it stores both an offset and length +/// meaning that take / filter operations can be implemented without copying the underlying data. +/// +/// [Variable-size List Layout: ListView Layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout +#[derive(Clone)] +pub struct GenericListViewArray { + data_type: DataType, + nulls: Option, + values: ArrayRef, + value_offsets: ScalarBuffer, + value_sizes: ScalarBuffer, +} + +impl GenericListViewArray { + /// The data type constructor of listview array. + /// The input is the schema of the child array and + /// the output is the [`DataType`], ListView or LargeListView. + pub const DATA_TYPE_CONSTRUCTOR: fn(FieldRef) -> DataType = if OffsetSize::IS_LARGE { + DataType::LargeListView + } else { + DataType::ListView + }; + + /// Create a new [`GenericListViewArray`] from the provided parts + /// + /// # Errors + /// + /// Errors if + /// + /// * `offsets.len() != sizes.len()` + /// * `offsets.len() != nulls.len()` + /// * `offsets[i] > values.len()` + /// * `!field.is_nullable() && values.is_nullable()` + /// * `field.data_type() != values.data_type()` + /// * `0 <= offsets[i] <= length of the child array` + /// * `0 <= offsets[i] + size[i] <= length of the child array` + pub fn try_new( + field: FieldRef, + offsets: ScalarBuffer, + sizes: ScalarBuffer, + values: ArrayRef, + nulls: Option, + ) -> Result { + let len = offsets.len(); + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}ListViewArray, expected {len} got {}", + OffsetSize::PREFIX, + n.len(), + ))); + } + } + if len != sizes.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {len} and {}", + OffsetSize::PREFIX, sizes.len() + ))); + } + + for (offset, size) in offsets.iter().zip(sizes.iter()) { + let offset = offset.as_usize(); + let size = size.as_usize(); + if offset.checked_add(size).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Overflow in offset + size for {}ListViewArray", + OffsetSize::PREFIX + )) + })? > values.len() + { + return Err(ArrowError::InvalidArgumentError(format!( + "Offset + size for {}ListViewArray must be within the bounds of the child array, got offset: {offset}, size: {size}, child array length: {}", + OffsetSize::PREFIX, + values.len() + ))); + } + } + + if !field.is_nullable() && values.is_nullable() { + return Err(ArrowError::InvalidArgumentError(format!( + "Non-nullable field of {}ListViewArray {:?} cannot contain nulls", + OffsetSize::PREFIX, + field.name() + ))); + } + + if field.data_type() != values.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "{}ListViewArray expected data type {} got {} for {:?}", + OffsetSize::PREFIX, + field.data_type(), + values.data_type(), + field.name() + ))); + } + + Ok(Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls, + values, + value_offsets: offsets, + value_sizes: sizes, + }) + } + + /// Create a new [`GenericListViewArray`] from the provided parts + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new( + field: FieldRef, + offsets: ScalarBuffer, + sizes: ScalarBuffer, + values: ArrayRef, + nulls: Option, + ) -> Self { + Self::try_new(field, offsets, sizes, values, nulls).unwrap() + } + + /// Create a new [`GenericListViewArray`] of length `len` where all values are null + pub fn new_null(field: FieldRef, len: usize) -> Self { + let values = new_empty_array(field.data_type()); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls: Some(NullBuffer::new_null(len)), + value_offsets: ScalarBuffer::from(vec![]), + value_sizes: ScalarBuffer::from(vec![]), + values, + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts( + self, + ) -> ( + FieldRef, + ScalarBuffer, + ScalarBuffer, + ArrayRef, + Option, + ) { + let f = match self.data_type { + DataType::ListView(f) | DataType::LargeListView(f) => f, + _ => unreachable!(), + }; + ( + f, + self.value_offsets, + self.value_sizes, + self.values, + self.nulls, + ) + } + + /// Returns a reference to the offsets of this list + /// + /// Unlike [`Self::value_offsets`] this returns the [`ScalarBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn offsets(&self) -> &ScalarBuffer { + &self.value_offsets + } + + /// Returns a reference to the values of this list + #[inline] + pub fn values(&self) -> &ArrayRef { + &self.values + } + + /// Returns a reference to the sizes of this list + /// + /// Unlike [`Self::value_sizes`] this returns the [`ScalarBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn sizes(&self) -> &ScalarBuffer { + &self.value_sizes + } + + /// Returns a clone of the value type of this list. + pub fn value_type(&self) -> DataType { + self.values.data_type().clone() + } + + /// Returns ith value of this list view array. + /// # Safety + /// Caller must ensure that the index is within the array bounds + pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { + let offset = self.value_offsets().get_unchecked(i).as_usize(); + let length = self.value_sizes().get_unchecked(i).as_usize(); + self.values.slice(offset, length) + } + + /// Returns ith value of this list view array. + /// # Panics + /// Panics if the index is out of bounds + pub fn value(&self, i: usize) -> ArrayRef { + let offset = self.value_offsets()[i].as_usize(); + let length = self.value_sizes()[i].as_usize(); + self.values.slice(offset, length) + } + + /// Returns the offset values in the offsets buffer + #[inline] + pub fn value_offsets(&self) -> &[OffsetSize] { + &self.value_offsets + } + + /// Returns the sizes values in the offsets buffer + #[inline] + pub fn value_sizes(&self) -> &[OffsetSize] { + &self.value_sizes + } + + /// Returns the size for value at index `i`. + #[inline] + pub fn value_size(&self, i: usize) -> OffsetSize { + self.value_sizes[i] + } + + /// Returns the offset for value at index `i`. + pub fn value_offset(&self, i: usize) -> OffsetSize { + self.value_offsets[i] + } + + /// Constructs a new iterator + pub fn iter(&self) -> GenericListViewArrayIter<'_, OffsetSize> { + GenericListViewArrayIter::<'_, OffsetSize>::new(self) + } + + #[inline] + fn get_type(data_type: &DataType) -> Option<&DataType> { + match (OffsetSize::IS_LARGE, data_type) { + (true, DataType::LargeListView(child)) | (false, DataType::ListView(child)) => { + Some(child.data_type()) + } + _ => None, + } + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + values: self.values.clone(), + value_offsets: self.value_offsets.slice(offset, length), + value_sizes: self.value_sizes.slice(offset, length), + } + } +} + +impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListViewArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + GenericListViewArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericListViewArray::value_unchecked(self, index) + } +} + +impl Array for GenericListViewArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn to_data(&self) -> ArrayData { + self.clone().into() + } + + fn into_data(self) -> ArrayData { + self.into() + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(self.slice(offset, length)) + } + + fn len(&self) -> usize { + self.sizes().len() + } + + fn is_empty(&self) -> bool { + self.value_sizes.is_empty() + } + + fn offset(&self) -> usize { + 0 + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.get_buffer_memory_size(); + size += self.value_offsets.inner().capacity(); + size += self.value_sizes.inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); + size += self.value_offsets.inner().capacity(); + size += self.value_sizes.inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } +} + +impl std::fmt::Debug for GenericListViewArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let prefix = OffsetSize::PREFIX; + write!(f, "{prefix}ListViewArray\n[\n")?; + print_long_array(self, f, |array, index, f| { + std::fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl From> for ArrayData { + fn from(array: GenericListViewArray) -> Self { + let len = array.len(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .nulls(array.nulls) + .buffers(vec![ + array.value_offsets.into_inner(), + array.value_sizes.into_inner(), + ]) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } + } +} + +impl From for GenericListViewArray { + fn from(data: ArrayData) -> Self { + Self::try_new_from_array_data(data) + .expect("Expected infallible creation of GenericListViewArray from ArrayDataRef failed") + } +} + +impl From for GenericListViewArray { + fn from(value: FixedSizeListArray) -> Self { + let (field, size) = match value.data_type() { + DataType::FixedSizeList(f, size) => (f, *size as usize), + _ => unreachable!(), + }; + let mut acc = 0_usize; + let iter = std::iter::repeat(size).take(value.len()); + let mut sizes = Vec::with_capacity(iter.size_hint().0); + let mut offsets = Vec::with_capacity(iter.size_hint().0); + + for size in iter { + offsets.push(OffsetSize::usize_as(acc)); + acc = acc.add(size); + sizes.push(OffsetSize::usize_as(size)); + } + let sizes = ScalarBuffer::from(sizes); + let offsets = ScalarBuffer::from(offsets); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), + nulls: value.nulls().cloned(), + values: value.values().clone(), + value_offsets: offsets, + value_sizes: sizes, + } + } +} + +impl GenericListViewArray { + fn try_new_from_array_data(data: ArrayData) -> Result { + if data.buffers().len() != 2 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray data should contain two buffers (value offsets & value sizes), had {}", + data.buffers().len() + ))); + } + + if data.child_data().len() != 1 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray should contain a single child array (values array), had {}", + data.child_data().len() + ))); + } + + let values = data.child_data()[0].clone(); + + if let Some(child_data_type) = Self::get_type(data.data_type()) { + if values.data_type() != child_data_type { + return Err(ArrowError::InvalidArgumentError(format!( + "{}ListViewArray's child datatype {:?} does not \ + correspond to the List's datatype {:?}", + OffsetSize::PREFIX, + values.data_type(), + child_data_type + ))); + } + } else { + return Err(ArrowError::InvalidArgumentError(format!( + "{}ListViewArray's datatype must be {}ListViewArray(). It is {:?}", + OffsetSize::PREFIX, + OffsetSize::PREFIX, + data.data_type() + ))); + } + + let values = make_array(values); + // ArrayData is valid, and verified type above + let value_offsets = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let value_sizes = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); + + Ok(Self { + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), + values, + value_offsets, + value_sizes, + }) + } +} + +#[cfg(test)] +mod tests { + use arrow_buffer::{bit_util, BooleanBuffer, Buffer, ScalarBuffer}; + use arrow_schema::Field; + + use crate::builder::{FixedSizeListBuilder, Int32Builder}; + use crate::cast::AsArray; + use crate::types::Int32Type; + use crate::{Int32Array, Int64Array}; + + use super::*; + + #[test] + fn test_empty_list_view_array() { + // Construct an empty value array + let vec: Vec = vec![]; + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![]); + let offsets = ScalarBuffer::from(vec![]); + let values = Int32Array::from(vec); + let list_array = LargeListViewArray::new(field, offsets, sizes, Arc::new(values), None); + + assert_eq!(list_array.len(), 0) + } + + #[test] + fn test_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![3i32, 3, 2]); + let offsets = ScalarBuffer::from(vec![0i32, 3, 6]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let list_array = ListViewArray::new(field, offsets, sizes, Arc::new(values), None); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_size(2)); + assert_eq!(0, list_array.value(0).as_primitive::().value(0)); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_primitive::() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_large_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![3i64, 3, 2]); + let offsets = ScalarBuffer::from(vec![0i64, 3, 6]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let list_array = LargeListViewArray::new(field, offsets, sizes, Arc::new(values), None); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_size(2)); + assert_eq!(0, list_array.value(0).as_primitive::().value(0)); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_primitive::() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + let buffer = BooleanBuffer::new(Buffer::from(null_bits), 0, 9); + let null_buffer = NullBuffer::new(buffer); + + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![2, 0, 0, 2, 2, 0, 3, 0, 1]); + let offsets = ScalarBuffer::from(vec![0, 2, 2, 2, 4, 6, 6, 9, 9]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + let list_array = + ListViewArray::new(field, offsets, sizes, Arc::new(values), Some(null_buffer)); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_size(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + assert_eq!(2, sliced_list_array.value_size(2)); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + assert_eq!(2, sliced_list_array.value_size(3)); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_sizes()[5]); + assert_eq!(3, sliced_list_array.value_size(5)); + } + + #[test] + fn test_large_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + let buffer = BooleanBuffer::new(Buffer::from(null_bits), 0, 9); + let null_buffer = NullBuffer::new(buffer); + + // Construct a large list view array from the above two + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![2i64, 0, 0, 2, 2, 0, 3, 0, 1]); + let offsets = ScalarBuffer::from(vec![0i64, 2, 2, 2, 4, 6, 6, 9, 9]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + let list_array = + LargeListViewArray::new(field, offsets, sizes, Arc::new(values), Some(null_buffer)); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_size(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_size(2)); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_size(3)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_size(5)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + } + + #[test] + #[should_panic(expected = "index out of bounds: the len is 9 but the index is 10")] + fn test_list_view_array_index_out_of_bound() { + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + let buffer = BooleanBuffer::new(Buffer::from(null_bits), 0, 9); + let null_buffer = NullBuffer::new(buffer); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + // Construct a list array from the above two + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![2i32, 0, 0, 2, 2, 0, 3, 0, 1]); + let offsets = ScalarBuffer::from(vec![0i32, 2, 2, 2, 4, 6, 6, 9, 9]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + let list_array = + ListViewArray::new(field, offsets, sizes, Arc::new(values), Some(null_buffer)); + + assert_eq!(9, list_array.len()); + list_array.value(10); + } + #[test] + #[should_panic( + expected = "ListViewArray data should contain two buffers (value offsets & value sizes), had 0" + )] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_buffer_len() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build_unchecked() + }; + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + #[should_panic( + expected = "ListViewArray data should contain two buffers (value offsets & value sizes), had 1" + )] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_child_array_len() { + let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + fn test_list_view_array_offsets_need_not_start_at_zero() { + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![0i32, 0, 3]); + let offsets = ScalarBuffer::from(vec![2i32, 2, 5]); + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let list_array = ListViewArray::new(field, offsets, sizes, Arc::new(values), None); + + assert_eq!(list_array.value_size(0), 0); + assert_eq!(list_array.value_size(1), 0); + assert_eq!(list_array.value_size(2), 3); + } + + #[test] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_alignment() { + let offset_buf = Buffer::from_slice_ref([0_u64]); + let offset_buf2 = offset_buf.slice(1); + + let size_buf = Buffer::from_slice_ref([0_u64]); + let size_buf2 = size_buf.slice(1); + + let values: [i32; 8] = [0; 8]; + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .add_buffer(Buffer::from_slice_ref(values)) + .build_unchecked() + }; + + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .add_buffer(offset_buf2) + .add_buffer(size_buf2) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + fn test_empty_offsets() { + let f = Arc::new(Field::new("element", DataType::Int32, true)); + let string = ListViewArray::from( + ArrayData::builder(DataType::ListView(f.clone())) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[]); + assert_eq!(string.value_sizes(), &[]); + + let string = LargeListViewArray::from( + ArrayData::builder(DataType::LargeListView(f)) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[]); + assert_eq!(string.value_sizes(), &[]); + } + + #[test] + fn test_try_new() { + let offsets = ScalarBuffer::from(vec![0, 1, 4, 5]); + let sizes = ScalarBuffer::from(vec![1, 3, 1, 0]); + let values = Int32Array::new(vec![1, 2, 3, 4, 5].into(), None); + let values = Arc::new(values) as ArrayRef; + + let field = Arc::new(Field::new("element", DataType::Int32, false)); + ListViewArray::new( + field.clone(), + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ); + + let nulls = NullBuffer::new_null(4); + ListViewArray::new( + field.clone(), + offsets, + sizes.clone(), + values.clone(), + Some(nulls), + ); + + let nulls = NullBuffer::new_null(4); + let offsets = ScalarBuffer::from(vec![0, 1, 2, 3, 4]); + let sizes = ScalarBuffer::from(vec![1, 1, 1, 1, 0]); + let err = LargeListViewArray::try_new( + field, + offsets.clone(), + sizes.clone(), + values.clone(), + Some(nulls), + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Incorrect length of null buffer for LargeListViewArray, expected 5 got 4" + ); + + let field = Arc::new(Field::new("element", DataType::Int64, false)); + let err = LargeListViewArray::try_new( + field.clone(), + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: LargeListViewArray expected data type Int64 got Int32 for \"element\"" + ); + + let nulls = NullBuffer::new_null(7); + let values = Int64Array::new(vec![0; 7].into(), Some(nulls)); + let values = Arc::new(values); + + let err = LargeListViewArray::try_new( + field, + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Non-nullable field of LargeListViewArray \"element\" cannot contain nulls" + ); + } + + #[test] + fn test_from_fixed_size_list() { + let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[0, 0, 0]); + builder.append(false); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + let list: ListViewArray = builder.finish().into(); + let values: Vec<_> = list + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]); + let offsets = list.value_offsets(); + assert_eq!(offsets, &[0, 3, 6]); + let sizes = list.value_sizes(); + assert_eq!(sizes, &[3, 3, 3]); + } + + #[test] + fn test_list_view_array_overlap_lists() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build_unchecked() + }; + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(2) + .add_buffer(Buffer::from_slice_ref([0, 3])) // offsets + .add_buffer(Buffer::from_slice_ref([5, 5])) // sizes + .add_child_data(value_data) + .build_unchecked() + }; + let array = ListViewArray::from(list_data); + + assert_eq!(array.len(), 2); + assert_eq!(array.value_size(0), 5); + assert_eq!(array.value_size(1), 5); + + let values: Vec<_> = array + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!( + values, + vec![Some(vec![0, 1, 2, 3, 4]), Some(vec![3, 4, 5, 6, 7])] + ); + } + + #[test] + fn test_list_view_array_incomplete_offsets() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(50) + .add_buffer(Buffer::from_slice_ref((0..50).collect::>())) + .build_unchecked() + }; + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(Buffer::from_slice_ref([0, 5, 10])) // offsets + .add_buffer(Buffer::from_slice_ref([0, 5, 10])) // sizes + .add_child_data(value_data) + .build_unchecked() + }; + let array = ListViewArray::from(list_data); + + assert_eq!(array.len(), 3); + assert_eq!(array.value_size(0), 0); + assert_eq!(array.value_size(1), 5); + assert_eq!(array.value_size(2), 10); + + let values: Vec<_> = array + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!( + values, + vec![ + Some(vec![]), + Some(vec![5, 6, 7, 8, 9]), + Some(vec![10, 11, 12, 13, 14, 15, 16, 17, 18, 19]) + ] + ); + } + + #[test] + fn test_list_view_array_empty_lists() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(0) + .add_buffer(Buffer::from_slice_ref::(&[])) + .build_unchecked() + }; + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(Buffer::from_slice_ref([0, 0, 0])) // offsets + .add_buffer(Buffer::from_slice_ref([0, 0, 0])) // sizes + .add_child_data(value_data) + .build_unchecked() + }; + let array = ListViewArray::from(list_data); + + assert_eq!(array.len(), 3); + assert_eq!(array.value_size(0), 0); + assert_eq!(array.value_size(1), 0); + assert_eq!(array.value_size(2), 0); + + let values: Vec<_> = array + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![Some(vec![]), Some(vec![]), Some(vec![])]); + } +} diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 50c5699bac32..4b426e7eec80 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -72,6 +72,10 @@ mod byte_view_array; pub use byte_view_array::*; +mod list_view_array; + +pub use list_view_array::*; + /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`] so that it can be @@ -590,6 +594,12 @@ impl PartialEq for GenericListArray { } } +impl PartialEq for GenericListViewArray { + fn eq(&self, other: &Self) -> bool { + self.to_data().eq(&other.to_data()) + } +} + impl PartialEq for MapArray { fn eq(&self, other: &Self) -> bool { self.to_data().eq(&other.to_data()) @@ -678,6 +688,8 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Utf8View => Arc::new(StringViewArray::from(data)) as ArrayRef, DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef, DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef, + DataType::ListView(_) => Arc::new(ListViewArray::from(data)) as ArrayRef, + DataType::LargeListView(_) => Arc::new(LargeListViewArray::from(data)) as ArrayRef, DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef, DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef, DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef, diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 3f9cc0d525c1..6708da3d5dd6 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -21,7 +21,7 @@ use crate::array::{ ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, GenericStringArray, PrimitiveArray, }; -use crate::{FixedSizeListArray, MapArray}; +use crate::{FixedSizeListArray, GenericListViewArray, MapArray}; use arrow_buffer::NullBuffer; /// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] @@ -143,7 +143,8 @@ pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>; pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; /// an iterator that returns Some(T) or None, that can be used on any MapArray pub type MapArrayIter<'a> = ArrayIter<&'a MapArray>; - +/// an iterator that returns Some(T) or None, that can be used on any ListArray +pub type GenericListViewArrayIter<'a, O> = ArrayIter<&'a GenericListViewArray>; #[cfg(test)] mod tests { use std::sync::Arc;