From cefcdafbb463ac8a8f80164435d76d01544b016e Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Thu, 31 Oct 2024 14:29:55 -0600 Subject: [PATCH 01/20] wip it kinda works but I just need to save state --- arrow-data/src/data.rs | 19 +- arrow-flight/src/encode.rs | 49 +- arrow-ipc/src/writer.rs | 895 ++++++++++++++++++++----------------- 3 files changed, 529 insertions(+), 434 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 8af2a91cf159..daf1acd4afe3 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -450,24 +450,27 @@ impl ArrayData { result += buffer_size; } BufferSpec::VariableWidth => { - let buffer_len: usize; - match self.data_type { + result += match &self.data_type { DataType::Utf8 | DataType::Binary => { let offsets = self.typed_offsets::()?; - buffer_len = (offsets[self.len] - offsets[0] ) as usize; + (offsets[self.len] - offsets[0]) as usize } DataType::LargeUtf8 | DataType::LargeBinary => { let offsets = self.typed_offsets::()?; - buffer_len = (offsets[self.len] - offsets[0]) as usize; + (offsets[self.len] - offsets[0]) as usize } - _ => { + // `FlatBufferSizeTracker::write_array_data` just writes these just as raw + // buffer data as opposed to using offsets or anything + DataType::BinaryView | DataType::Utf8View => self.buffers() + .iter() + .map(|buf| buf.len()) + .sum::(), + dt => { return Err(ArrowError::NotYetImplemented(format!( - "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", - self.data_type + "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {dt}", ))) } }; - result += buffer_len; } BufferSpec::BitMap => { let buffer_size = bit_util::ceil(self.len, 8); diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index acfbd9b53030..f9cbcb40e11c 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -573,20 +573,37 @@ fn split_batch_for_grpc_response( batch: RecordBatch, max_flight_data_size: usize, ) -> Vec { - let size = batch + let overhead_guess = arrow_ipc::writer::msg_overhead_guess(&batch); + + println!("🐈 overhead_guess: {overhead_guess}"); + + // Some of the tests pass in really small values for max_flight_data_size, so we need to make + // sure we still try our best and don't panic if the max size is too small to fit anything into + // and we need to make sure it's more than 0 or else we'll get a divide-by-0 error below + let max_batch_size = max_flight_data_size.saturating_sub(overhead_guess).max(1); + + let total_size = batch .columns() .iter() - .map(|col| col.get_buffer_memory_size()) + .map(|col| { + col.to_data() + .get_slice_memory_size() + .unwrap_or_else(|_| col.get_buffer_memory_size()) + }) .sum::(); - let n_batches = - (size / max_flight_data_size + usize::from(size % max_flight_data_size != 0)).max(1); - let rows_per_batch = (batch.num_rows() / n_batches).max(1); - let mut out = Vec::with_capacity(n_batches + 1); + let n_rows = batch.num_rows(); + + if total_size == 0 { + return vec![]; + } + let n_batches = (total_size / max_batch_size) + (total_size % max_batch_size).min(1); + let rows_per_batch = (n_rows / n_batches).max(1); + let mut out = Vec::with_capacity(n_batches); let mut offset = 0; - while offset < batch.num_rows() { - let length = (rows_per_batch).min(batch.num_rows() - offset); + while offset < n_rows { + let length = rows_per_batch.min(n_rows - offset); out.push(batch.slice(offset, length)); offset += length; @@ -1531,7 +1548,7 @@ mod tests { assert_eq!(a, 
b); } - #[test] + /*#[test] fn test_split_batch_for_grpc_response_sizes() { // 2000 8 byte entries into 2k pieces: 8 chunks of 250 rows verify_split(2000, 2 * 1024, vec![250, 250, 250, 250, 250, 250, 250, 250]); @@ -1570,7 +1587,7 @@ mod tests { assert_eq!(sizes, expected_sizes, "mismatch for {batch:?}"); assert_eq!(input_rows, output_rows, "mismatch for {batch:?}"); - } + }*/ // test sending record batches // test sending record batches with multiple different dictionaries @@ -1582,7 +1599,7 @@ mod tests { let s2 = StringArray::from_iter_values(std::iter::repeat("6bytes").take(1024)); let i2 = Int64Array::from_iter_values(0..1024); - let batch = RecordBatch::try_from_iter(vec![ + let batch = RecordBatch::try_from_iter([ ("s1", Arc::new(s1) as _), ("i1", Arc::new(i1) as _), ("s2", Arc::new(s2) as _), @@ -1590,14 +1607,16 @@ mod tests { ]) .unwrap(); - verify_encoded_split(batch, 112).await; + verify_encoded_split(batch, 88).await; + + panic!("I want output"); } #[tokio::test] async fn flight_data_size_uneven_variable_lengths() { // each row has a longer string than the last with increasing lengths 0 --> 1024 let array = StringArray::from_iter_values((0..1024).map(|i| "*".repeat(i))); - let batch = RecordBatch::try_from_iter(vec![("data", Arc::new(array) as _)]).unwrap(); + let batch = RecordBatch::try_from_iter([("data", Arc::new(array) as _)]).unwrap(); // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 @@ -1653,7 +1672,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split(batch, 160).await; + verify_encoded_split(batch, 72).await; } #[tokio::test] @@ -1760,6 +1779,8 @@ mod tests { let actual_overage = actual_data_size.saturating_sub(max_flight_data_size); + println!("🐈 actual_overage: {actual_overage}"); + assert!( actual_overage <= allowed_overage, "encoded data[{i}]: actual size {actual_data_size}, \ diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index b5c4dd95ed9f..1ed41638a678 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -25,7 +25,7 @@ use std::collections::HashMap; use std::io::{BufWriter, Write}; use std::sync::Arc; -use flatbuffers::FlatBufferBuilder; +use flatbuffers::{FlatBufferBuilder, Push, UnionWIPOffset, Vector, WIPOffset, SIZE_UOFFSET}; use arrow_array::builder::BufferBuilder; use arrow_array::cast::*; @@ -38,7 +38,63 @@ use arrow_schema::*; use crate::compression::CompressionCodec; use crate::convert::IpcSchemaEncoder; -use crate::CONTINUATION_MARKER; +use crate::{ + BodyCompressionBuilder, BodyCompressionMethod, DictionaryBatchBuilder, MessageBuilder, + MessageHeader, RecordBatchBuilder, CONTINUATION_MARKER, +}; + +pub fn msg_overhead_guess(batch: &arrow_array::RecordBatch) -> usize { + use std::mem::size_of; + + // mostly taken calculations from arrow-ipc::writer::IpcDataGenerator::record_batch_to_bytes + + let opts = IpcWriteOptions::default(); + + const LEN_SIZE: usize = size_of::(); + + fn array_data_size(data: &ArrayData, opts: &IpcWriteOptions) -> usize { + // at the beginning of `write_array_data`, the null_buffer is added (if + // has_validity_bitmap) and that requires a count field as well + let null_buf_len = if has_validity_bitmap(data.data_type(), opts) { + LEN_SIZE + } else { + 0 + }; + // `write_buffer` writes 2 8-byte lengths per buffer per column (one for the compressed + // length, one for the uncompressed length) + let buf_lens = data.buffers().len() * LEN_SIZE * 2; + // Also added at the beginning of 
`write_array_data` + let field_node_len = LEN_SIZE * 2; + + let child_sizes = data + .child_data() + .iter() + .map(|child| array_data_size(child, opts)) + .sum::(); + + null_buf_len + buf_lens + field_node_len + child_sizes + } + + let column_sizes = batch + .columns() + .iter() + .map(|col| array_data_size(&col.to_data(), &opts)) + .sum::(); + + // `MessageBuilder::add_version` + let overhead = size_of::() + + // `MessageBuilder::add_header_type` + size_of::() + + // `MessageBuilder::add_bodyLength` + LEN_SIZE + + // `RecordBatchBuilder::add_length` is normally called with batch.num_rows() + LEN_SIZE + + opts.batch_compression_type.map_or(0, |ty| size_of_val(&ty) + size_of::()) + + column_sizes; + + // pad to the default alignment for writing + pad_to_alignment(opts.alignment, overhead) + overhead +} /// IPC write options used to control the behaviour of the [`IpcDataGenerator`] #[derive(Debug, Clone)] @@ -157,7 +213,7 @@ impl IpcWriteOptions { impl Default for IpcWriteOptions { fn default() -> Self { Self { - alignment: 64, + alignment: DEFAULT_ALIGNMENT, write_legacy_ipc_format: false, metadata_version: crate::MetadataVersion::V5, batch_compression_type: None, @@ -474,100 +530,10 @@ impl IpcDataGenerator { )?; } - let encoded_message = self.record_batch_to_bytes(batch, write_options)?; + let encoded_message = chunked_encoded_batch_bytes(batch, write_options)?; Ok((encoded_dictionaries, encoded_message)) } - /// Write a `RecordBatch` into two sets of bytes, one for the header (crate::Message) and the - /// other for the batch's data - fn record_batch_to_bytes( - &self, - batch: &RecordBatch, - write_options: &IpcWriteOptions, - ) -> Result { - let mut fbb = FlatBufferBuilder::new(); - - let mut nodes: Vec = vec![]; - let mut buffers: Vec = vec![]; - let mut arrow_data: Vec = vec![]; - let mut offset = 0; - - // get the type of compression - let batch_compression_type = write_options.batch_compression_type; - - let compression = batch_compression_type.map(|batch_compression_type| { - let mut c = crate::BodyCompressionBuilder::new(&mut fbb); - c.add_method(crate::BodyCompressionMethod::BUFFER); - c.add_codec(batch_compression_type); - c.finish() - }); - - let compression_codec: Option = - batch_compression_type.map(TryInto::try_into).transpose()?; - - let mut variadic_buffer_counts = vec![]; - - for array in batch.columns() { - let array_data = array.to_data(); - offset = write_array_data( - &array_data, - &mut buffers, - &mut arrow_data, - &mut nodes, - offset, - array.len(), - array.null_count(), - compression_codec, - write_options, - )?; - - append_variadic_buffer_counts(&mut variadic_buffer_counts, &array_data); - } - // pad the tail of body data - let len = arrow_data.len(); - let pad_len = pad_to_alignment(write_options.alignment, len); - arrow_data.extend_from_slice(&PADDING[..pad_len]); - - // write data - let buffers = fbb.create_vector(&buffers); - let nodes = fbb.create_vector(&nodes); - let variadic_buffer = if variadic_buffer_counts.is_empty() { - None - } else { - Some(fbb.create_vector(&variadic_buffer_counts)) - }; - - let root = { - let mut batch_builder = crate::RecordBatchBuilder::new(&mut fbb); - batch_builder.add_length(batch.num_rows() as i64); - batch_builder.add_nodes(nodes); - batch_builder.add_buffers(buffers); - if let Some(c) = compression { - batch_builder.add_compression(c); - } - - if let Some(v) = variadic_buffer { - batch_builder.add_variadicBufferCounts(v); - } - let b = batch_builder.finish(); - b.as_union_value() - }; - // create an crate::Message - 
let mut message = crate::MessageBuilder::new(&mut fbb); - message.add_version(write_options.metadata_version); - message.add_header_type(crate::MessageHeader::RecordBatch); - message.add_bodyLength(arrow_data.len() as i64); - message.add_header(root); - let root = message.finish(); - fbb.finish(root, None); - let finished_data = fbb.finished_data(); - - Ok(EncodedData { - ipc_message: finished_data.to_vec(), - arrow_data, - }) - } - /// Write dictionary values into two sets of bytes, one for the header (crate::Message) and the /// other for the data fn dictionary_batch_to_bytes( @@ -576,92 +542,21 @@ impl IpcDataGenerator { array_data: &ArrayData, write_options: &IpcWriteOptions, ) -> Result { - let mut fbb = FlatBufferBuilder::new(); - - let mut nodes: Vec = vec![]; - let mut buffers: Vec = vec![]; - let mut arrow_data: Vec = vec![]; - - // get the type of compression - let batch_compression_type = write_options.batch_compression_type; - - let compression = batch_compression_type.map(|batch_compression_type| { - let mut c = crate::BodyCompressionBuilder::new(&mut fbb); - c.add_method(crate::BodyCompressionMethod::BUFFER); - c.add_codec(batch_compression_type); - c.finish() - }); - - let compression_codec: Option = batch_compression_type - .map(|batch_compression_type| batch_compression_type.try_into()) - .transpose()?; - - write_array_data( - array_data, - &mut buffers, - &mut arrow_data, - &mut nodes, - 0, - array_data.len(), - array_data.null_count(), - compression_codec, + encode_array_datas( + // TODO: We can abstract this clone away, right? + [array_data.clone()], + array_data.len() as i64, + |fbb, offset| { + fbb.with_builder(DictionaryBatchBuilder::new, |mut bldr| { + bldr.add_data(DictionaryBatchBuilder::add_id, dict_id); + + bldr.builder.add_data(offset); + bldr.builder.finish().as_union_value() + }) + }, + MessageHeader::DictionaryBatch, write_options, - )?; - - let mut variadic_buffer_counts = vec![]; - append_variadic_buffer_counts(&mut variadic_buffer_counts, array_data); - - // pad the tail of body data - let len = arrow_data.len(); - let pad_len = pad_to_alignment(write_options.alignment, len); - arrow_data.extend_from_slice(&PADDING[..pad_len]); - - // write data - let buffers = fbb.create_vector(&buffers); - let nodes = fbb.create_vector(&nodes); - let variadic_buffer = if variadic_buffer_counts.is_empty() { - None - } else { - Some(fbb.create_vector(&variadic_buffer_counts)) - }; - - let root = { - let mut batch_builder = crate::RecordBatchBuilder::new(&mut fbb); - batch_builder.add_length(array_data.len() as i64); - batch_builder.add_nodes(nodes); - batch_builder.add_buffers(buffers); - if let Some(c) = compression { - batch_builder.add_compression(c); - } - if let Some(v) = variadic_buffer { - batch_builder.add_variadicBufferCounts(v); - } - batch_builder.finish() - }; - - let root = { - let mut batch_builder = crate::DictionaryBatchBuilder::new(&mut fbb); - batch_builder.add_id(dict_id); - batch_builder.add_data(root); - batch_builder.finish().as_union_value() - }; - - let root = { - let mut message_builder = crate::MessageBuilder::new(&mut fbb); - message_builder.add_version(write_options.metadata_version); - message_builder.add_header_type(crate::MessageHeader::DictionaryBatch); - message_builder.add_bodyLength(arrow_data.len() as i64); - message_builder.add_header(root); - message_builder.finish() - }; - - fbb.finish(root, None); - let finished_data = fbb.finished_data(); - - Ok(EncodedData { - ipc_message: finished_data.to_vec(), - arrow_data, - }) + ) } } @@ 
-1279,6 +1174,7 @@ pub struct EncodedData { /// Arrow buffers to be written, should be an empty vec for schema messages pub arrow_data: Vec, } + /// Write a message's IPC data and buffers, returning metadata and buffer data lengths written pub fn write_message( mut writer: W, @@ -1467,275 +1363,450 @@ fn get_list_array_buffers(data: &ArrayData) -> (Buffer, Arra (offsets, child_data) } -/// Write array data to a vector of bytes -#[allow(clippy::too_many_arguments)] -fn write_array_data( - array_data: &ArrayData, - buffers: &mut Vec, - arrow_data: &mut Vec, - nodes: &mut Vec, - offset: i64, - num_rows: usize, - null_count: usize, - compression_codec: Option, +const DEFAULT_ALIGNMENT: u8 = 64; +const PADDING: [u8; DEFAULT_ALIGNMENT as usize] = [0; DEFAULT_ALIGNMENT as usize]; + +/// Calculate an alignment boundary and return the number of bytes needed to pad to the alignment boundary +#[inline] +fn pad_to_alignment(alignment: u8, len: usize) -> usize { + let a = usize::from(alignment - 1); + ((len + a) & !a) - len +} + +pub fn create_vector<'fbb, T>( + overhead: &mut usize, + fbb: &mut FlatBufferBuilder<'fbb>, + items: &[T], +) -> WIPOffset> +where + T: Push, +{ + *overhead += SIZE_UOFFSET; + fbb.create_vector(items) +} + +fn chunked_encoded_batch_bytes( + batch: &RecordBatch, write_options: &IpcWriteOptions, -) -> Result { - let mut offset = offset; - if !matches!(array_data.data_type(), DataType::Null) { - nodes.push(crate::FieldNode::new(num_rows as i64, null_count as i64)); - } else { - // NullArray's null_count equals to len, but the `null_count` passed in is from ArrayData - // where null_count is always 0. - nodes.push(crate::FieldNode::new(num_rows as i64, num_rows as i64)); - } - if has_validity_bitmap(array_data.data_type(), write_options) { - // write null buffer if exists - let null_buffer = match array_data.nulls() { - None => { - // create a buffer and fill it with valid bits - let num_bytes = bit_util::ceil(num_rows, 8); - let buffer = MutableBuffer::new(num_bytes); - let buffer = buffer.with_bitset(num_bytes, true); - buffer.into() - } - Some(buffer) => buffer.inner().sliced(), - }; +) -> Result { + encode_array_datas( + batch.columns().iter().map(ArrayRef::to_data), + batch.num_rows() as i64, + |_, offset| offset.as_union_value(), + MessageHeader::RecordBatch, + write_options, + ) +} + +fn dry_run_header_size( + arr_datas: impl IntoIterator, + n_rows: i64, + encode_root: impl FnOnce( + &mut FlatBufferSizeTracker, + WIPOffset, + ) -> WIPOffset, + header_type: MessageHeader, + write_options: &IpcWriteOptions +) -> Result { +} - offset = write_buffer( - null_buffer.as_slice(), - buffers, - arrow_data, - offset, - compression_codec, - write_options.alignment, +fn encode_array_datas( + arr_datas: impl IntoIterator, + n_rows: i64, + encode_root: impl FnOnce( + &mut FlatBufferSizeTracker, + WIPOffset, + ) -> WIPOffset, + header_type: MessageHeader, + write_options: &IpcWriteOptions, +) -> Result { + let mut fbb = FlatBufferSizeTracker::default(); + + let batch_compression_type = write_options.batch_compression_type; + + let compression = batch_compression_type.map(|compression_type| { + fbb.with_builder(BodyCompressionBuilder::new, |mut builder| { + builder.add_data( + BodyCompressionBuilder::add_method, + BodyCompressionMethod::BUFFER, + ); + builder.add_data(BodyCompressionBuilder::add_codec, compression_type); + builder.builder.finish() + }) + }); + + let mut variadic_buffer_counts = Vec::::default(); + let mut offset = 0; + + // TODO: split these up here to only fit as many as 
can be fit + // - get `fbb.fbb.unfinished_data` after writing a fake piece of data for `buffers`. that + // determines the real overhead + // - split up the array datas based on how much space they take up alone + for array in arr_datas { + fbb.write_array_data( + &array, + &mut offset, + array.len(), + array.null_count(), + write_options, )?; + + append_variadic_buffer_counts(&mut variadic_buffer_counts, &array); } - let data_type = array_data.data_type(); - if matches!(data_type, DataType::Binary | DataType::Utf8) { - let (offsets, values) = get_byte_array_buffers::(array_data); - for buffer in [offsets, values] { - offset = write_buffer( - buffer.as_slice(), - buffers, - arrow_data, - offset, - compression_codec, - write_options.alignment, - )?; - } - } else if matches!(data_type, DataType::BinaryView | DataType::Utf8View) { - // Slicing the views buffer is safe and easy, - // but pruning unneeded data buffers is much more nuanced since it's complicated to prove that no views reference the pruned buffers + // pad the tail of the body data + let pad_len = pad_to_alignment(write_options.alignment, fbb.arrow_data.len()); + fbb.arrow_data.extend_from_slice(&PADDING[..pad_len]); + + let buffers = create_vector(&mut fbb.overhead, &mut fbb.fbb, &fbb.buffers); + let nodes = create_vector(&mut fbb.overhead, &mut fbb.fbb, &fbb.nodes); + let variadic_buffer = (!variadic_buffer_counts.is_empty()) + .then(|| create_vector(&mut fbb.overhead, &mut fbb.fbb, &variadic_buffer_counts)); + + let root = fbb.with_builder(RecordBatchBuilder::new, |mut bldr| { + // Now, this is the only piece of data for this builder that didn't already come from + // flatbuffers - the rest in this closure are produced from `create_vector` or a builder's + // `finish()`, so their memory is already accounted for. That's why we use the escape hatch + // for everything else. // - // Current implementation just serialize the raw arrays as given and not try to optimize anything. - // If users wants to "compact" the arrays prior to sending them over IPC, - // they should consider the gc API suggested in #5513 - for buffer in array_data.buffers() { - offset = write_buffer( - buffer.as_slice(), - buffers, - arrow_data, - offset, - compression_codec, - write_options.alignment, - )?; + // See the comment on `add_data` for an explanation of why this is needed. 
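+        //
+        // Concretely: `n_rows` is an `i64`, so this single call grows
+        // `overhead` by size_of::<i64>() = 8 bytes, while `nodes`, `buffers`
+        // and the variadic counts were already accounted for when
+        // `create_vector` built them.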
+ bldr.add_data(RecordBatchBuilder::add_length, n_rows); + + bldr.builder.add_nodes(nodes); + bldr.builder.add_buffers(buffers); + if let Some(c) = compression { + bldr.builder.add_compression(c); } - } else if matches!(data_type, DataType::LargeBinary | DataType::LargeUtf8) { - let (offsets, values) = get_byte_array_buffers::(array_data); - for buffer in [offsets, values] { - offset = write_buffer( - buffer.as_slice(), - buffers, - arrow_data, - offset, - compression_codec, - write_options.alignment, - )?; + if let Some(v) = variadic_buffer { + bldr.builder.add_variadicBufferCounts(v); } - } else if DataType::is_numeric(data_type) - || DataType::is_temporal(data_type) - || matches!( - array_data.data_type(), - DataType::FixedSizeBinary(_) | DataType::Dictionary(_, _) - ) + + bldr.builder.finish() + }); + + // Unions are stored as a tag and offset, which (afaict) are both the same size, and we're + // generally following a policy of accounting for memory as soon as possible, so we need to + // check that here (since `encode_root` returns a Union offset) + fbb.overhead += SIZE_UOFFSET * 2; + let root = encode_root(&mut fbb, root); + + let arrow_len = fbb.arrow_data.len() as i64; + let msg = fbb.with_builder(MessageBuilder::new, |mut bldr| { + bldr.add_data(MessageBuilder::add_version, write_options.metadata_version); + bldr.add_data(MessageBuilder::add_header_type, header_type); + bldr.add_data(MessageBuilder::add_bodyLength, arrow_len); + + // `root` was already tracked when it was turned into a union + bldr.builder.add_header(root); + bldr.builder.finish() + }); + + fbb.fbb.finish(msg, None); + let ipc_message = fbb.fbb.finished_data().to_vec(); + + println!("🐈 ipc_message size: {}", ipc_message.len()); + + Ok(EncodedData { + ipc_message, + arrow_data: fbb.arrow_data, + }) +} + +#[derive(Default)] +struct FlatBufferSizeTracker<'fbb> { + // tracks the amount of data that is going to be encoded into the final FlatBuffer (or is + // already encoded) but is not: + // 1. Already tracked by a different field here. E.g. We know that each node in `self.nodes` + // must be encoded into the final thing, so we don't bother tracking the space that those + // will take up in `overhead`, and instead just check `nodes.len() * size_of()` + // instead. + // 2. raw ArrayData that will be encoded into the final product. This overhead tracking is done + // to figure out how many RecordBatches we can encode into each FlatBuffer, so we just need + // to keep track of everything besides those batches/ArrayData. + pub overhead: usize, + // the builder and backing flatbuffer that we use to write the arrow data into. + fbb: FlatBufferBuilder<'fbb>, + // tracks the data in `arrow_data` - `buffers` contains the offsets and length of different + // buffers encoded within the big chunk that is `arrow_data`. 
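+    // E.g. two 10-byte buffers written with 64-byte alignment end up recorded
+    // as (offset 0, len 10) and (offset 64, len 10), since each buffer is
+    // padded out to the alignment boundary before the next one is written.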
+ buffers: Vec, + // the raw array data that we need to send across the wire + arrow_data: Vec, + nodes: Vec, +} + +impl<'fbb> FlatBufferSizeTracker<'fbb> { + fn with_builder<'s, B, BFn, F, O>(&'s mut self, b_fn: BFn, f: F) -> O + where + BFn: Fn(&'s mut FlatBufferBuilder<'fbb>) -> B, + F: Fn(BuilderTracker<'s, B>) -> O, { - // Truncate values - assert_eq!(array_data.buffers().len(), 1); - - let buffer = &array_data.buffers()[0]; - let layout = layout(data_type); - let spec = &layout.buffers[0]; - - let byte_width = get_buffer_element_width(spec); - let min_length = array_data.len() * byte_width; - let buffer_slice = if buffer_need_truncate(array_data.offset(), buffer, spec, min_length) { - let byte_offset = array_data.offset() * byte_width; - let buffer_length = min(min_length, buffer.len() - byte_offset); - &buffer.as_slice()[byte_offset..(byte_offset + buffer_length)] - } else { - buffer.as_slice() + let builder = b_fn(&mut self.fbb); + let tracker = BuilderTracker { + overhead: &mut self.overhead, + builder, }; - offset = write_buffer( - buffer_slice, - buffers, - arrow_data, - offset, - compression_codec, - write_options.alignment, - )?; - } else if matches!(data_type, DataType::Boolean) { - // Bools are special because the payload (= 1 bit) is smaller than the physical container elements (= bytes). - // The array data may not start at the physical boundary of the underlying buffer, so we need to shift bits around. - assert_eq!(array_data.buffers().len(), 1); - - let buffer = &array_data.buffers()[0]; - let buffer = buffer.bit_slice(array_data.offset(), array_data.len()); - offset = write_buffer( - &buffer, - buffers, - arrow_data, - offset, - compression_codec, - write_options.alignment, - )?; - } else if matches!( - data_type, - DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) - ) { - assert_eq!(array_data.buffers().len(), 1); - assert_eq!(array_data.child_data().len(), 1); - - // Truncate offsets and the child data to avoid writing unnecessary data - let (offsets, sliced_child_data) = match data_type { - DataType::List(_) => get_list_array_buffers::(array_data), - DataType::Map(_, _) => get_list_array_buffers::(array_data), - DataType::LargeList(_) => get_list_array_buffers::(array_data), - _ => unreachable!(), - }; - offset = write_buffer( - offsets.as_slice(), - buffers, - arrow_data, - offset, - compression_codec, - write_options.alignment, - )?; - offset = write_array_data( - &sliced_child_data, - buffers, - arrow_data, - nodes, - offset, - sliced_child_data.len(), - sliced_child_data.null_count(), - compression_codec, - write_options, - )?; - return Ok(offset); - } else { - for buffer in array_data.buffers() { - offset = write_buffer( - buffer, - buffers, - arrow_data, + + f(tracker) + } + + pub fn total_overhead(&self) -> usize { + self.overhead + + (self.nodes.len() * size_of::<::Output>()) + + (self.buffers.len() * size_of::<::Output>()) + } + + fn write_array_data( + &mut self, + array_data: &ArrayData, + offset: &mut i64, + num_rows: usize, + null_count: usize, + write_options: &IpcWriteOptions, + ) -> Result<(), ArrowError> { + let compression_codec: Option = write_options + .batch_compression_type + .map(TryInto::try_into) + .transpose()?; + + if !matches!(array_data.data_type(), DataType::Null) { + self.nodes + .push(crate::FieldNode::new(num_rows as i64, null_count as i64)); + } else { + // NullArray's null_count equals to len, but the `null_count` passed in is from ArrayData + // where null_count is always 0. 
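+            // E.g. a NullArray of length 5 is recorded here as
+            // FieldNode { length: 5, null_count: 5 }, not with a null_count
+            // of 0.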
+ self.nodes + .push(crate::FieldNode::new(num_rows as i64, num_rows as i64)); + } + + if has_validity_bitmap(array_data.data_type(), write_options) { + // write null buffer if exists + let null_buffer = match array_data.nulls() { + None => { + // create a buffer and fill it with valid bits + let num_bytes = bit_util::ceil(num_rows, 8); + MutableBuffer::new(num_bytes) + .with_bitset(num_bytes, true) + .into() + } + Some(buffer) => buffer.inner().sliced(), + }; + + self.write_buffer( + null_buffer.as_slice(), offset, compression_codec, write_options.alignment, )?; } - } - match array_data.data_type() { - DataType::Dictionary(_, _) => {} - DataType::RunEndEncoded(_, _) => { - // unslice the run encoded array. - let arr = unslice_run_array(array_data.clone())?; - // recursively write out nested structures - for data_ref in arr.child_data() { - // write the nested data (e.g list data) - offset = write_array_data( - data_ref, - buffers, - arrow_data, - nodes, + let mut write_byte_array_byffers = |(offsets, values): (Buffer, Buffer)| { + for buffer in [offsets, values] { + self.write_buffer( + buffer.as_slice(), + offset, + compression_codec, + write_options.alignment, + )?; + } + Ok::<_, ArrowError>(()) + }; + + match array_data.data_type() { + DataType::Binary | DataType::Utf8 => { + write_byte_array_byffers(get_byte_array_buffers::(array_data))? + } + DataType::LargeBinary | DataType::LargeUtf8 => { + write_byte_array_byffers(get_byte_array_buffers::(array_data))? + } + DataType::BinaryView | DataType::Utf8View => { + // Slicing the views buffer is safe and easy, + // but pruning unneeded data buffers is much more nuanced since it's complicated + // to prove that no views reference the pruned buffers + // + // Current implementation just serialize the raw arrays as given and not try to optimize anything. + // If users wants to "compact" the arrays prior to sending them over IPC, + // they should consider the gc API suggested in #5513 + for buffer in array_data.buffers() { + self.write_buffer( + buffer.as_slice(), + offset, + compression_codec, + write_options.alignment, + )?; + } + } + dt if DataType::is_numeric(dt) + || DataType::is_temporal(dt) + || matches!( + dt, + DataType::FixedSizeBinary(_) | DataType::Dictionary(_, _) + ) => + { + // Truncate values + let [buffer] = array_data.buffers() else { + panic!("Temporal, Numeric, FixedSizeBinary, and Dictionary data types must contain only one buffer"); + }; + + let layout = layout(dt); + let spec = &layout.buffers[0]; + + let byte_width = get_buffer_element_width(spec); + let min_length = array_data.len() * byte_width; + let mut buffer_slice = buffer.as_slice(); + + if buffer_need_truncate(array_data.offset(), buffer, spec, min_length) { + let byte_offset = array_data.offset() * byte_width; + let buffer_length = min(min_length, buffer.len() - byte_offset); + buffer_slice = &buffer_slice[byte_offset..(byte_offset + buffer_length)]; + } + + self.write_buffer( + buffer_slice, + offset, + compression_codec, + write_options.alignment, + )?; + } + DataType::Boolean => { + // Bools are special because the payload (= 1 bit) is smaller than the physical container elements (= bytes). + // The array data may not start at the physical boundary of the underlying buffer, + // so we need to shift bits around. 
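+                // E.g. a BooleanArray sliced to start at logical index 3
+                // begins at bit 3 of byte 0; `bit_slice` below copies the
+                // bits into a fresh buffer starting at bit 0 so it can be
+                // written out verbatim.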
+ let [single_buf] = array_data.buffers() else { + panic!("ArrayData of type Boolean should only contain 1 buffer"); + }; + + let buffer = &single_buf.bit_slice(array_data.offset(), array_data.len()); + self.write_buffer(buffer, offset, compression_codec, write_options.alignment)?; + } + dt @ (DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _)) => { + assert_eq!(array_data.buffers().len(), 1); + assert_eq!(array_data.child_data().len(), 1); + + // Truncate offsets and the child data to avoid writing unnecessary data + let (offsets, sliced_child_data) = match dt { + DataType::List(_) | DataType::Map(_, _) => { + get_list_array_buffers::(array_data) + } + DataType::LargeList(_) => get_list_array_buffers::(array_data), + _ => unreachable!(), + }; + self.write_buffer( + offsets.as_slice(), offset, - data_ref.len(), - data_ref.null_count(), compression_codec, + write_options.alignment, + )?; + self.write_array_data( + &sliced_child_data, + offset, + sliced_child_data.len(), + sliced_child_data.null_count(), write_options, )?; + println!("🐈 total_overhead: {}", self.total_overhead()); + return Ok(()); + } + _ => { + for buffer in array_data.buffers() { + self.write_buffer(buffer, offset, compression_codec, write_options.alignment)?; + } } } - _ => { - // recursively write out nested structures - for data_ref in array_data.child_data() { - // write the nested data (e.g list data) - offset = write_array_data( + + let mut write_arr = |arr: &ArrayData| { + for data_ref in arr.child_data() { + self.write_array_data( data_ref, - buffers, - arrow_data, - nodes, offset, data_ref.len(), data_ref.null_count(), - compression_codec, write_options, )?; } - } + Ok::<_, ArrowError>(()) + }; + + let res = match array_data.data_type() { + DataType::Dictionary(_, _) => Ok(()), + // unslice the run encoded array. + DataType::RunEndEncoded(_, _) => write_arr(&unslice_run_array(array_data.clone())?), + // recursively write out nested structures + _ => write_arr(array_data), + }; + println!("🐈 total_overhead: {}", self.total_overhead()); + res } - Ok(offset) -} -/// Write a buffer into `arrow_data`, a vector of bytes, and adds its -/// [`crate::Buffer`] to `buffers`. Returns the new offset in `arrow_data` -/// -/// -/// From -/// Each constituent buffer is first compressed with the indicated -/// compressor, and then written with the uncompressed length in the first 8 -/// bytes as a 64-bit little-endian signed integer followed by the compressed -/// buffer bytes (and then padding as required by the protocol). The -/// uncompressed length may be set to -1 to indicate that the data that -/// follows is not compressed, which can be useful for cases where -/// compression does not yield appreciable savings. -fn write_buffer( - buffer: &[u8], // input - buffers: &mut Vec, // output buffer descriptors - arrow_data: &mut Vec, // output stream - offset: i64, // current output stream offset - compression_codec: Option, - alignment: u8, -) -> Result { - let len: i64 = match compression_codec { - Some(compressor) => compressor.compress_to_vec(buffer, arrow_data)?, - None => { - arrow_data.extend_from_slice(buffer); - buffer.len() + /// Write a buffer into `arrow_data`, a vector of bytes, and adds its + /// [`crate::Buffer`] to `buffers`. Modifies the offset passed in to respect the new value. 
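+    ///
+    /// For example, writing an uncompressed 10-byte buffer at `offset` 0 with
+    /// the default 64-byte alignment records a buffer entry of (offset 0,
+    /// len 10) and advances `offset` to 64 (10 data bytes plus 54 padding
+    /// bytes).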
+ /// + /// From + /// Each constituent buffer is first compressed with the indicated + /// compressor, and then written with the uncompressed length in the first 8 + /// bytes as a 64-bit little-endian signed integer followed by the compressed + /// buffer bytes (and then padding as required by the protocol). The + /// uncompressed length may be set to -1 to indicate that the data that + /// follows is not compressed, which can be useful for cases where + /// compression does not yield appreciable savings. + fn write_buffer( + &mut self, + // input + buffer: &[u8], + // current output stream offset + offset: &mut i64, + compression_codec: Option, + alignment: u8, + ) -> Result<(), ArrowError> { + let len: i64 = match compression_codec { + Some(compressor) => compressor.compress_to_vec(buffer, &mut self.arrow_data)?, + None => { + self.arrow_data.extend_from_slice(buffer); + buffer.len() + } } + .try_into() + .map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Could not convert compressed size to i64: {e}" + )) + })?; + + // make new index entry + self.buffers.push(crate::Buffer::new(*offset, len)); + // padding and make offset aligned + let pad_len = pad_to_alignment(alignment, len as usize); + self.arrow_data.extend_from_slice(&PADDING[..pad_len]); + + // We don't add pad_len to self.overhead here mainly because it feels over-zealous to do + // so - Once compression is incorporated into this sort of schema, where you're trying to + // count bits to fit as much data into these fields as possible, + + *offset += len + (pad_len as i64); + Ok(()) } - .try_into() - .map_err(|e| { - ArrowError::InvalidArgumentError(format!("Could not convert compressed size to i64: {e}")) - })?; - - // make new index entry - buffers.push(crate::Buffer::new(offset, len)); - // padding and make offset aligned - let pad_len = pad_to_alignment(alignment, len as usize); - arrow_data.extend_from_slice(&PADDING[..pad_len]); - - Ok(offset + len + (pad_len as i64)) } -const PADDING: [u8; 64] = [0; 64]; +struct BuilderTracker<'tracker, B> { + overhead: &'tracker mut usize, + pub builder: B, +} -/// Calculate an alignment boundary and return the number of bytes needed to pad to the alignment boundary -#[inline] -fn pad_to_alignment(alignment: u8, len: usize) -> usize { - let a = usize::from(alignment - 1); - ((len + a) & !a) - len +impl<'tracker, B> BuilderTracker<'tracker, B> { + // This is a method you have to be careful about if you want exact accounting - Specifically, + // there's no way (as far as I can figure out, at least) for this method to not count overhead + // when the data being added was already pulled from the flatbuffer it's being added to. For + // example, if we call `create_vector` on our FlatBufferSizeTracker, then add that to a `B` + // using this method, the `Vector`'s size has already been accounted for, so we don't want to + // track it again here. Same thing if we produce a value from a Builder's `finish()` method, + // then add it with another builder. + // + // Now, we could to like `D: FlatBufferSized` and then impl that trait for everything and add + // specializations for `flatbuffers::WIPOffset` and such... IF we had specialization. Alas, any + // sort of blanket-impl-plus-caveats solution requires specialization. 
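+    //
+    // In practice: only pass values that originate outside the flatbuffer
+    // (row counts, enum tags, dictionary ids) through `add_data`. Offsets
+    // returned by `create_vector` or a builder's `finish()` are already
+    // accounted for and must go through the raw `self.builder` escape hatch
+    // instead, as `encode_array_datas` does.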
+ fn add_data(&mut self, f: F, data: D) { + f(&mut self.builder, data); + *self.overhead += std::mem::size_of::(); + } } #[cfg(test)] From 6411e2acf03b2c7de26dff28aa465b7ba00d764b Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Fri, 1 Nov 2024 14:26:36 -0600 Subject: [PATCH 02/20] Got normal encoding working but dictionaries are still failing --- arrow-data/src/data.rs | 72 +- arrow-flight/src/encode.rs | 158 ++--- arrow-flight/src/utils.rs | 34 +- .../integration_test.rs | 13 +- .../integration_test.rs | 16 +- arrow-ipc/src/reader.rs | 2 +- arrow-ipc/src/writer.rs | 634 +++++++++--------- 7 files changed, 472 insertions(+), 457 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index daf1acd4afe3..245ea0d4009e 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -452,12 +452,12 @@ impl ArrayData { BufferSpec::VariableWidth => { result += match &self.data_type { DataType::Utf8 | DataType::Binary => { - let offsets = self.typed_offsets::()?; - (offsets[self.len] - offsets[0]) as usize + let (buf1, buf2) = self.get_byte_array_buffers::(); + buf1.len() + buf2.len() } DataType::LargeUtf8 | DataType::LargeBinary => { - let offsets = self.typed_offsets::()?; - (offsets[self.len] - offsets[0]) as usize + let (buf1, buf2) = self.get_byte_array_buffers::(); + buf1.len() + buf2.len() } // `FlatBufferSizeTracker::write_array_data` just writes these just as raw // buffer data as opposed to using offsets or anything @@ -492,6 +492,64 @@ impl ArrayData { Ok(result) } + /// Returns the offsets and values [`Buffer`]s for a ByteArray with offset type `O` + /// + /// In particular, this handles re-encoding the offsets if they don't start at `0`, + /// slicing the values buffer as appropriate. This helps reduce the encoded + /// size of sliced arrays, as values that have been sliced away are not encoded + pub fn get_byte_array_buffers(&self) -> (Buffer, Buffer) { + if self.is_empty() { + return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); + } + + // get the buffer of offsets, now shifted so they are shifted to be accurate to the slice + // of values that we'll be taking (e.g. if they previously said [0, 3, 5, 7], but we slice + // to only get the last two, they'll be shifted to be [0, 2], since those would be the + // offsets for the last two values in this shifted slice). + let (offsets, original_start_offset, len) = self.reencode_offsets::(); + let values = self.buffers()[1].slice_with_length(original_start_offset, len); + (offsets, values) + } + + /// Common functionality for re-encoding offsets. Returns the new offsets as well as + /// original start offset and length for use in slicing child data. 
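+    /// For example, offsets `[3, 5, 7]` are re-encoded as `[0, 2, 4]` and
+    /// returned together with a start offset of `3` and a length of `4`
+    /// (`7 - 3`) so the caller can slice the values buffer to match.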
+ /// + /// # Panics + /// + /// Will panic if you call this on an `ArrayData` that does not have a buffer of offsets as the + /// very first buffer (like most variable length arrays) + pub fn reencode_offsets(&self) -> (Buffer, usize, usize) { + // first we want to see: what is the offset of this `ArrayData` into the buffer (which is a + // buffer of offsets into the buffer of data) + let orig_offset = self.offset(); + let o_size = mem::size_of::(); + // and then slice the buffer to only get the part we care about, keeping in mind the size + // of the type that we're treating it as (since it's just encoded as a bunch of u8s as the + // base level) + let mut offsets = + self.buffers()[0].slice_with_length(orig_offset * o_size, self.len * o_size); + + // then we have to turn it into a typed slice that we can read and manipulate below if + // needed + let offsets_slice = offsets.typed_data::(); + + // and now we can see what the very first offset and the very last offset is + let start_offset_o = offsets_slice.first().unwrap(); + let start_offset = start_offset_o.as_usize(); + let end_offset = offsets_slice.last().unwrap().as_usize(); + + // if the start offset is just 0, i.e. it points to the very beginning of the values of + // this `ArrayData`, then we don't need to shift anything to be a 'correct' offset 'cause + // all the offsets in this buffer are already offset by 0. + // But if it's not 0, then we need to shift them all so that the offsets don't start at + // some weird value. + if start_offset != 0 { + offsets = offsets_slice.iter().map(|x| *x - *start_offset_o).collect(); + }; + + (offsets, start_offset, end_offset - start_offset) + } + /// Returns the total number of bytes of memory occupied /// physically by this [`ArrayData`] and all its [`Buffer`]s and /// children. (See also diagram on [`ArrayData`]). @@ -526,7 +584,9 @@ impl ArrayData { /// /// Panics if `offset + length > self.len()`. pub fn slice(&self, offset: usize, length: usize) -> ArrayData { - assert!((offset + length) <= self.len()); + if (offset + length) > self.len() { + panic!("Attempting to slice an array with offset {offset}, len {length} when self.len is {}", self.len); + } if let DataType::Struct(_) = self.data_type() { // Slice into children @@ -540,7 +600,7 @@ impl ArrayData { child_data: self .child_data() .iter() - .map(|data| data.slice(offset, length)) + .map(|data| data.slice(offset, length.min(data.len()))) .collect(), nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)), }; diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index f9cbcb40e11c..ef849b1cdc01 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -338,12 +338,13 @@ impl FlightDataEncoder { DictionaryHandling::Hydrate => hydrate_dictionaries(&batch, schema)?, }; - for batch in split_batch_for_grpc_response(batch, self.max_flight_data_size) { - let (flight_dictionaries, flight_batch) = self.encoder.encode_batch(&batch)?; + // ASK: Is there some sort of order that these need to go through in? Does this mess it up? + let (flight_dictionaries, flight_batches) = self + .encoder + .encode_batch(&batch, self.max_flight_data_size)?; - self.queue_messages(flight_dictionaries); - self.queue_message(flight_batch); - } + self.queue_messages(flight_dictionaries); + self.queue_messages(flight_batches); Ok(()) } @@ -563,55 +564,6 @@ fn prepare_schema_for_flight( Schema::new(fields).with_metadata(schema.metadata().clone()) } -/// Split [`RecordBatch`] so it hopefully fits into a gRPC response. 
-/// -/// Data is zero-copy sliced into batches. -/// -/// Note: this method does not take into account already sliced -/// arrays: -fn split_batch_for_grpc_response( - batch: RecordBatch, - max_flight_data_size: usize, -) -> Vec { - let overhead_guess = arrow_ipc::writer::msg_overhead_guess(&batch); - - println!("🐈 overhead_guess: {overhead_guess}"); - - // Some of the tests pass in really small values for max_flight_data_size, so we need to make - // sure we still try our best and don't panic if the max size is too small to fit anything into - // and we need to make sure it's more than 0 or else we'll get a divide-by-0 error below - let max_batch_size = max_flight_data_size.saturating_sub(overhead_guess).max(1); - - let total_size = batch - .columns() - .iter() - .map(|col| { - col.to_data() - .get_slice_memory_size() - .unwrap_or_else(|_| col.get_buffer_memory_size()) - }) - .sum::(); - - let n_rows = batch.num_rows(); - - if total_size == 0 { - return vec![]; - } - let n_batches = (total_size / max_batch_size) + (total_size % max_batch_size).min(1); - let rows_per_batch = (n_rows / n_batches).max(1); - let mut out = Vec::with_capacity(n_batches); - - let mut offset = 0; - while offset < n_rows { - let length = rows_per_batch.min(n_rows - offset); - out.push(batch.slice(offset, length)); - - offset += length; - } - - out -} - /// The data needed to encode a stream of flight data, holding on to /// shared Dictionaries. /// @@ -644,15 +596,22 @@ impl FlightIpcEncoder { /// Convert a `RecordBatch` to a Vec of `FlightData` representing /// dictionaries and a `FlightData` representing the batch - fn encode_batch(&mut self, batch: &RecordBatch) -> Result<(Vec, FlightData)> { - let (encoded_dictionaries, encoded_batch) = - self.data_gen - .encoded_batch(batch, &mut self.dictionary_tracker, &self.options)?; + fn encode_batch( + &mut self, + batch: &RecordBatch, + max_flight_data_size: usize, + ) -> Result<(Vec, Vec)> { + let (encoded_dictionaries, encoded_batches) = self.data_gen.encoded_batch_with_size( + batch, + &mut self.dictionary_tracker, + &self.options, + max_flight_data_size, + )?; let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); - let flight_batch = encoded_batch.into(); + let flight_batches = encoded_batches.into_iter().map(Into::into).collect(); - Ok((flight_dictionaries, flight_batch)) + Ok((flight_dictionaries, flight_batches)) } } @@ -701,14 +660,17 @@ fn hydrate_dictionary(array: &ArrayRef, data_type: &DataType) -> Result fn test_encode_flight_data() { @@ -728,12 +693,13 @@ mod tests { .expect("cannot create record batch"); let schema = batch.schema_ref(); - let (_, baseline_flight_batch) = make_flight_data(&batch, &options); + let (_, baseline_flight_batch) = utils::flight_data_from_arrow_batch(&batch, &options); let big_batch = batch.slice(0, batch.num_rows() - 1); let optimized_big_batch = hydrate_dictionaries(&big_batch, Arc::clone(schema)).expect("failed to optimize"); - let (_, optimized_big_flight_batch) = make_flight_data(&optimized_big_batch, &options); + let (_, optimized_big_flight_batch) = + utils::flight_data_from_arrow_batch(&optimized_big_batch, &options); assert_eq!( baseline_flight_batch.data_body.len(), @@ -743,7 +709,8 @@ mod tests { let small_batch = batch.slice(0, 1); let optimized_small_batch = hydrate_dictionaries(&small_batch, Arc::clone(schema)).expect("failed to optimize"); - let (_, optimized_small_flight_batch) = make_flight_data(&optimized_small_batch, &options); + let (_, optimized_small_flight_batch) = + 
utils::flight_data_from_arrow_batch(&optimized_small_batch, &options); assert!( baseline_flight_batch.data_body.len() > optimized_small_flight_batch.data_body.len() @@ -1502,34 +1469,23 @@ mod tests { hydrate_dictionaries(&batch, batch.schema()).expect("failed to optimize"); } - pub fn make_flight_data( - batch: &RecordBatch, - options: &IpcWriteOptions, - ) -> (Vec, FlightData) { - let data_gen = IpcDataGenerator::default(); - let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); - - let (encoded_dictionaries, encoded_batch) = data_gen - .encoded_batch(batch, &mut dictionary_tracker, options) - .expect("DictionaryTracker configured above to not error on replacement"); - - let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); - let flight_batch = encoded_batch.into(); - - (flight_dictionaries, flight_batch) - } - #[test] fn test_split_batch_for_grpc_response() { let max_flight_data_size = 1024; + let write_opts = IpcWriteOptions::default(); + let mut dict_tracker = DictionaryTracker::new(false); + let gen = IpcDataGenerator {}; // no split let c = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(c) as ArrayRef)]) .expect("cannot create record batch"); - let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size); + let split = gen + .encoded_batch_with_size(&batch, &mut dict_tracker, &write_opts, max_flight_data_size) + .unwrap() + .1; assert_eq!(split.len(), 1); - assert_eq!(batch, split[0]); + // assert_eq!(batch, split[0]); // split once let n_rows = max_flight_data_size + 1; @@ -1537,17 +1493,27 @@ mod tests { let c = UInt8Array::from((0..n_rows).map(|i| (i % 256) as u8).collect::>()); let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(c) as ArrayRef)]) .expect("cannot create record batch"); - let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size); + let split = gen + .encoded_batch_with_size(&batch, &mut dict_tracker, &write_opts, max_flight_data_size) + .unwrap() + .1; assert_eq!(split.len(), 3); - assert_eq!( + /*assert_eq!( split.iter().map(|batch| batch.num_rows()).sum::(), n_rows ); let a = pretty_format_batches(&split).unwrap().to_string(); let b = pretty_format_batches(&[batch]).unwrap().to_string(); - assert_eq!(a, b); + assert_eq!(a, b);*/ } + // TESTS TO DO: + // - Make sure batches with Utf8View/BinaryView are properly accounted for and don't overflow + // too much + // - Modify `test_split_batch_for_grpc_response` to read the `EncodedData` back into + // `RecordBatches` and compare those + // - Ensure No record batches overflow their alloted size. 
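+    // - Exercise `encoded_batch_with_size` directly with a small cap; sketch:
+    //
+    //       let (_dicts, batches) = gen
+    //           .encoded_batch_with_size(&batch, &mut dict_tracker, &write_opts, 1024)
+    //           .unwrap();
+    //       for encoded in &batches {
+    //           assert!(encoded.arrow_data.len() <= 1024);
+    //       }
+    //
+    //   (the exact bound to assert still needs to account for the flatbuffer
+    //   header overhead)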
+ /*#[test] fn test_split_batch_for_grpc_response_sizes() { // 2000 8 byte entries into 2k pieces: 8 chunks of 250 rows @@ -1607,9 +1573,7 @@ mod tests { ]) .unwrap(); - verify_encoded_split(batch, 88).await; - - panic!("I want output"); + verify_encoded_split(batch, 0).await; } #[tokio::test] @@ -1620,7 +1584,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 4304).await; + verify_encoded_split(batch, 0).await; } #[tokio::test] @@ -1656,7 +1620,7 @@ mod tests { // 5k over limit (which is 2x larger than limit of 5k) // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 5800).await; + verify_encoded_split(batch, 0).await; } #[tokio::test] @@ -1672,7 +1636,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split(batch, 72).await; + verify_encoded_split(batch, 0).await; } #[tokio::test] @@ -1686,7 +1650,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 3328).await; + verify_encoded_split(batch, 0).await; } #[tokio::test] @@ -1700,7 +1664,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 5280).await; + verify_encoded_split(batch, 0).await; } #[tokio::test] @@ -1725,7 +1689,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 4128).await; + verify_encoded_split(batch, 0).await; } /// Return size, in memory of flight data @@ -1779,8 +1743,6 @@ mod tests { let actual_overage = actual_data_size.saturating_sub(max_flight_data_size); - println!("🐈 actual_overage: {actual_overage}"); - assert!( actual_overage <= allowed_overage, "encoded data[{i}]: actual size {actual_data_size}, \ diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index f6129ddfe248..e2ffeecaae93 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -38,18 +38,34 @@ pub fn flight_data_from_arrow_batch( batch: &RecordBatch, options: &IpcWriteOptions, ) -> (Vec, FlightData) { + let (flight_dictionaries, mut flight_batches) = _flight_data_from_arrow_batch(batch, options); + + assert_eq!( + flight_batches.len(), + 1, + "encoded_batch with a max size of usize::MAX should never return more than 1 batch" + ); + let flight_batch = flight_batches.pop().unwrap(); + + (flight_dictionaries, flight_batch) +} + +fn _flight_data_from_arrow_batch( + batch: &RecordBatch, + options: &IpcWriteOptions, +) -> (Vec, Vec) { let data_gen = writer::IpcDataGenerator::default(); let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); - let (encoded_dictionaries, encoded_batch) = data_gen - .encoded_batch(batch, &mut dictionary_tracker, options) + let (encoded_dictionaries, encoded_batches) = data_gen + .encoded_batch_with_size(batch, &mut dictionary_tracker, options, usize::MAX) .expect("DictionaryTracker configured above to not error on replacement"); let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); - let flight_batch = encoded_batch.into(); + let flight_batches = encoded_batches.into_iter().map(Into::into).collect(); - (flight_dictionaries, flight_batch) + (flight_dictionaries, flight_batches) } /// Convert a slice of wire protocol `FlightData`s into a vector of 
`RecordBatch`es @@ -154,11 +170,15 @@ pub fn batches_to_flight_data( writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); for batch in batches.iter() { - let (encoded_dictionaries, encoded_batch) = - data_gen.encoded_batch(batch, &mut dictionary_tracker, &options)?; + let (encoded_dictionaries, encoded_batch) = data_gen.encoded_batch_with_size( + batch, + &mut dictionary_tracker, + &options, + usize::MAX, + )?; dictionaries.extend(encoded_dictionaries.into_iter().map(Into::into)); - flight_data.push(encoded_batch.into()); + flight_data.extend(encoded_batch.into_iter().map(Into::into)); } let mut stream = Vec::with_capacity(1 + dictionaries.len() + flight_data.len()); diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index c8289ff446a0..3b0850ed883d 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -125,16 +125,9 @@ async fn send_batch( batch: &RecordBatch, options: &writer::IpcWriteOptions, ) -> Result { - let data_gen = writer::IpcDataGenerator::default(); - let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, true); - - let (encoded_dictionaries, encoded_batch) = data_gen - .encoded_batch(batch, &mut dictionary_tracker, options) - .expect("DictionaryTracker configured above to not error on replacement"); - - let dictionary_flight_data: Vec = - encoded_dictionaries.into_iter().map(Into::into).collect(); - let mut batch_flight_data: FlightData = encoded_batch.into(); + #[expect(deprecated)] + let (dictionary_flight_data, mut batch_flight_data) = + arrow_flight::utils::flight_data_from_arrow_batch(batch, options); upload_tx .send_all(&mut stream::iter(dictionary_flight_data).map(Ok)) diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index 0f404b2ae289..cbb8de6f0f2e 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -27,7 +27,7 @@ use arrow::{ buffer::Buffer, datatypes::Schema, datatypes::SchemaRef, - ipc::{self, reader, writer}, + ipc::{self, reader}, record_batch::RecordBatch, }; use arrow_flight::{ @@ -127,22 +127,16 @@ impl FlightService for FlightServiceImpl { .iter() .enumerate() .flat_map(|(counter, batch)| { - let data_gen = writer::IpcDataGenerator::default(); - let mut dictionary_tracker = - writer::DictionaryTracker::new_with_preserve_dict_id(false, true); - - let (encoded_dictionaries, encoded_batch) = data_gen - .encoded_batch(batch, &mut dictionary_tracker, &options) - .expect("DictionaryTracker configured above to not error on replacement"); - - let dictionary_flight_data = encoded_dictionaries.into_iter().map(Into::into); - let mut batch_flight_data: FlightData = encoded_batch.into(); + #[expect(deprecated)] + let (dictionary_flight_data, mut batch_flight_data) = + arrow_flight::utils::flight_data_from_arrow_batch(batch, &options); // Only the record batch's FlightData gets app_metadata let metadata = counter.to_string().into(); batch_flight_data.app_metadata = metadata; dictionary_flight_data + .into_iter() .chain(std::iter::once(batch_flight_data)) .map(Ok) }); diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 
0820e3590827..3950f0f1ce02 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1058,7 +1058,7 @@ impl FileReader { /// Try to create a new file reader. /// /// There is no internal buffering. If buffered reads are needed you likely want to use - /// [`FileReader::try_new_buffered`] instead. + /// [`FileReader::try_new_buffered`] instead. /// /// # Errors /// diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 1ed41638a678..6ee64ca2c3ab 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -20,12 +20,15 @@ //! The `FileWriter` and `StreamWriter` have similar interfaces, //! however the `FileWriter` expects a reader that supports `Seek`ing -use std::cmp::min; -use std::collections::HashMap; -use std::io::{BufWriter, Write}; -use std::sync::Arc; +use std::{ + borrow::Borrow, + cmp::min, + collections::HashMap, + io::{BufWriter, Write}, + sync::Arc, +}; -use flatbuffers::{FlatBufferBuilder, Push, UnionWIPOffset, Vector, WIPOffset, SIZE_UOFFSET}; +use flatbuffers::{FlatBufferBuilder, UnionWIPOffset, WIPOffset}; use arrow_array::builder::BufferBuilder; use arrow_array::cast::*; @@ -43,59 +46,6 @@ use crate::{ MessageHeader, RecordBatchBuilder, CONTINUATION_MARKER, }; -pub fn msg_overhead_guess(batch: &arrow_array::RecordBatch) -> usize { - use std::mem::size_of; - - // mostly taken calculations from arrow-ipc::writer::IpcDataGenerator::record_batch_to_bytes - - let opts = IpcWriteOptions::default(); - - const LEN_SIZE: usize = size_of::(); - - fn array_data_size(data: &ArrayData, opts: &IpcWriteOptions) -> usize { - // at the beginning of `write_array_data`, the null_buffer is added (if - // has_validity_bitmap) and that requires a count field as well - let null_buf_len = if has_validity_bitmap(data.data_type(), opts) { - LEN_SIZE - } else { - 0 - }; - // `write_buffer` writes 2 8-byte lengths per buffer per column (one for the compressed - // length, one for the uncompressed length) - let buf_lens = data.buffers().len() * LEN_SIZE * 2; - // Also added at the beginning of `write_array_data` - let field_node_len = LEN_SIZE * 2; - - let child_sizes = data - .child_data() - .iter() - .map(|child| array_data_size(child, opts)) - .sum::(); - - null_buf_len + buf_lens + field_node_len + child_sizes - } - - let column_sizes = batch - .columns() - .iter() - .map(|col| array_data_size(&col.to_data(), &opts)) - .sum::(); - - // `MessageBuilder::add_version` - let overhead = size_of::() + - // `MessageBuilder::add_header_type` - size_of::() + - // `MessageBuilder::add_bodyLength` - LEN_SIZE + - // `RecordBatchBuilder::add_length` is normally called with batch.num_rows() - LEN_SIZE + - opts.batch_compression_type.map_or(0, |ty| size_of_val(&ty) + size_of::()) + - column_sizes; - - // pad to the default alignment for writing - pad_to_alignment(opts.alignment, overhead) + overhead -} - /// IPC write options used to control the behaviour of the [`IpcDataGenerator`] #[derive(Debug, Clone)] pub struct IpcWriteOptions { @@ -513,6 +463,25 @@ impl IpcDataGenerator { dictionary_tracker: &mut DictionaryTracker, write_options: &IpcWriteOptions, ) -> Result<(Vec, EncodedData), ArrowError> { + let (encoded_dictionaries, mut encoded_messages) = + self.encoded_batch_with_size(batch, dictionary_tracker, write_options, usize::MAX)?; + + assert_eq!( + encoded_messages.len(), + 1, + "encoded_batch with max size of usize::MAX should not be able to return more than 1 batch" + ); + + Ok((encoded_dictionaries, encoded_messages.pop().unwrap())) + } + + pub fn 
encoded_batch_with_size( + &self, + batch: &RecordBatch, + dictionary_tracker: &mut DictionaryTracker, + write_options: &IpcWriteOptions, + max_flight_data_size: usize, + ) -> Result<(Vec<EncodedData>, Vec<EncodedData>), ArrowError> { let schema = batch.schema(); let mut encoded_dictionaries = Vec::with_capacity(schema.flattened_fields().len()); @@ -530,7 +499,8 @@ impl IpcDataGenerator { )?; } - let encoded_message = chunked_encoded_batch_bytes(batch, write_options)?; + let encoded_message = + chunked_encoded_batch_bytes(batch, write_options, max_flight_data_size)?; Ok((encoded_dictionaries, encoded_message)) } @@ -542,21 +512,29 @@ impl IpcDataGenerator { array_data: &ArrayData, write_options: &IpcWriteOptions, ) -> Result<EncodedData, ArrowError> { - encode_array_datas( + let mut encoded_datas = encode_array_datas( // TODO: We can abstract this clone away, right? - [array_data.clone()], - array_data.len() as i64, + &[array_data.clone()], + array_data.len(), |fbb, offset| { - fbb.with_builder(DictionaryBatchBuilder::new, |mut bldr| { - bldr.add_data(DictionaryBatchBuilder::add_id, dict_id); - - bldr.builder.add_data(offset); - bldr.builder.finish().as_union_value() - }) + let mut builder = DictionaryBatchBuilder::new(&mut fbb.fbb); + builder.add_id(dict_id); + builder.add_data(offset); + builder.finish().as_union_value() }, MessageHeader::DictionaryBatch, + // ASK: No maximum message size here? + usize::MAX, write_options, - ) + )?; + + assert_eq!( + encoded_datas.len(), + 1, + "encode_array_datas with usize::MAX as the max size should never return more than a single item" + ); + + Ok(encoded_datas.pop().unwrap()) } } @@ -866,10 +844,11 @@ impl<W: Write> FileWriter<W> { )); } - let (encoded_dictionaries, encoded_message) = self.data_gen.encoded_batch( + let (encoded_dictionaries, encoded_messages) = self.data_gen.encoded_batch_with_size( batch, &mut self.dictionary_tracker, &self.write_options, + usize::MAX, )?; for encoded_dictionary in encoded_dictionaries { @@ -881,15 +860,22 @@ impl<W: Write> FileWriter<W> { self.block_offsets += meta + data; } - let (meta, data) = write_message(&mut self.writer, encoded_message, &self.write_options)?; - // add a record block for the footer - let block = crate::Block::new( - self.block_offsets as i64, - meta as i32, // TODO: is this still applicable? - data as i64, - ); - self.record_blocks.push(block); - self.block_offsets += meta + data; + // theoretically, since the maximum size for encoding is usize::MAX, there should never be + // more than 1 encoded message. However, since there's no need to assert that (i.e. if + // someone changes usize::MAX above to a lower value, that's fine), we just assume + // there can be many messages + for encoded_message in encoded_messages { + let (meta, data) = + write_message(&mut self.writer, encoded_message, &self.write_options)?; + // add a record block for the footer + let block = crate::Block::new( + self.block_offsets as i64, + meta as i32, // TODO: is this still applicable?
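+ // (each footer `Block` records one message's offset, metadata length, and body length + // so readers can seek straight to that message - hence one `Block` per encoded message)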
+ data as i64, + ); + self.record_blocks.push(block); + self.block_offsets += meta + data; + } Ok(()) } @@ -1063,16 +1049,23 @@ impl StreamWriter { )); } - let (encoded_dictionaries, encoded_message) = self + let (encoded_dictionaries, encoded_messages) = self .data_gen - .encoded_batch(batch, &mut self.dictionary_tracker, &self.write_options) + .encoded_batch_with_size( + batch, + &mut self.dictionary_tracker, + &self.write_options, + usize::MAX, + ) .expect("StreamWriter is configured to not error on dictionary replacement"); for encoded_dictionary in encoded_dictionaries { write_message(&mut self.writer, encoded_dictionary, &self.write_options)?; } - write_message(&mut self.writer, encoded_message, &self.write_options)?; + for message in encoded_messages { + write_message(&mut self.writer, message, &self.write_options)?; + } Ok(()) } @@ -1167,6 +1160,7 @@ impl RecordBatchWriter for StreamWriter { } } +#[derive(Debug)] /// Stores the encoded data, which is an crate::Message, and optional Arrow data pub struct EncodedData { /// An encoded crate::Message @@ -1310,44 +1304,6 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize { } } -/// Common functionality for re-encoding offsets. Returns the new offsets as well as -/// original start offset and length for use in slicing child data. -fn reencode_offsets( - offsets: &Buffer, - data: &ArrayData, -) -> (Buffer, usize, usize) { - let offsets_slice: &[O] = offsets.typed_data::(); - let offset_slice = &offsets_slice[data.offset()..data.offset() + data.len() + 1]; - - let start_offset = offset_slice.first().unwrap(); - let end_offset = offset_slice.last().unwrap(); - - let offsets = match start_offset.as_usize() { - 0 => offsets.clone(), - _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), - }; - - let start_offset = start_offset.as_usize(); - let end_offset = end_offset.as_usize(); - - (offsets, start_offset, end_offset - start_offset) -} - -/// Returns the values and offsets [`Buffer`] for a ByteArray with offset type `O` -/// -/// In particular, this handles re-encoding the offsets if they don't start at `0`, -/// slicing the values buffer as appropriate. This helps reduce the encoded -/// size of sliced arrays, as values that have been sliced away are not encoded -fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, Buffer) { - if data.is_empty() { - return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); - } - - let (offsets, original_start_offset, len) = reencode_offsets::(&data.buffers()[0], data); - let values = data.buffers()[1].slice_with_length(original_start_offset, len); - (offsets, values) -} - /// Similar logic as [`get_byte_array_buffers()`] but slices the child array instead /// of a values buffer. 
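+ /// + /// For example (an illustrative sketch, not taken from the tests): a list array with + /// offsets `[0, 3, 5, 7]` sliced to its last two elements re-encodes the offsets as + /// `[0, 2, 4]` and slices the child data to the value range `3..7`, so child values that + /// were sliced away are never written.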
fn get_list_array_buffers(data: &ArrayData) -> (Buffer, ArrayData) { @@ -1358,7 +1314,7 @@ fn get_list_array_buffers(data: &ArrayData) -> (Buffer, Arra ); } - let (offsets, original_start_offset, len) = reencode_offsets::(&data.buffers()[0], data); + let (offsets, original_start_offset, len) = data.reencode_offsets::(); let child_data = data.child_data()[0].slice(original_start_offset, len); (offsets, child_data) } @@ -1373,157 +1329,131 @@ fn pad_to_alignment(alignment: u8, len: usize) -> usize { ((len + a) & !a) - len } -pub fn create_vector<'fbb, T>( - overhead: &mut usize, - fbb: &mut FlatBufferBuilder<'fbb>, - items: &[T], -) -> WIPOffset> -where - T: Push, -{ - *overhead += SIZE_UOFFSET; - fbb.create_vector(items) -} - fn chunked_encoded_batch_bytes( batch: &RecordBatch, write_options: &IpcWriteOptions, -) -> Result { + max_flight_data_size: usize, +) -> Result, ArrowError> { encode_array_datas( - batch.columns().iter().map(ArrayRef::to_data), - batch.num_rows() as i64, + &batch + .columns() + .iter() + .map(ArrayRef::to_data) + .collect::>(), + batch.num_rows(), |_, offset| offset.as_union_value(), MessageHeader::RecordBatch, + max_flight_data_size, write_options, ) } -fn dry_run_header_size( - arr_datas: impl IntoIterator, - n_rows: i64, - encode_root: impl FnOnce( - &mut FlatBufferSizeTracker, - WIPOffset, - ) -> WIPOffset, - header_type: MessageHeader, - write_options: &IpcWriteOptions -) -> Result { -} - fn encode_array_datas( - arr_datas: impl IntoIterator, - n_rows: i64, - encode_root: impl FnOnce( + arr_datas: &[ArrayData], + n_rows: usize, + encode_root: impl Fn( &mut FlatBufferSizeTracker, WIPOffset, ) -> WIPOffset, header_type: MessageHeader, + mut max_msg_size: usize, write_options: &IpcWriteOptions, -) -> Result { - let mut fbb = FlatBufferSizeTracker::default(); - - let batch_compression_type = write_options.batch_compression_type; +) -> Result, ArrowError> { + let mut fbb = FlatBufferSizeTracker::for_dry_run(arr_datas.len()); + fbb.encode_array_datas( + arr_datas, + n_rows as i64, + &encode_root, + header_type, + write_options, + )?; - let compression = batch_compression_type.map(|compression_type| { - fbb.with_builder(BodyCompressionBuilder::new, |mut builder| { - builder.add_data( - BodyCompressionBuilder::add_method, - BodyCompressionMethod::BUFFER, - ); - builder.add_data(BodyCompressionBuilder::add_codec, compression_type); - builder.builder.finish() + fn get_arr_batch_size>( + iter: impl Iterator, + alignment: u8, + ) -> Result { + iter.map(|arr| { + arr.borrow() + .get_slice_memory_size() + .map(|size| size + pad_to_alignment(alignment, size)) }) - }); - - let mut variadic_buffer_counts = Vec::::default(); - let mut offset = 0; - - // TODO: split these up here to only fit as many as can be fit - // - get `fbb.fbb.unfinished_data` after writing a fake piece of data for `buffers`. 
that - // determines the real overhead - // - split up the array datas based on how much space they take up alone - for array in arr_datas { - fbb.write_array_data( - &array, - &mut offset, - array.len(), - array.null_count(), - write_options, - )?; - - append_variadic_buffer_counts(&mut variadic_buffer_counts, &array); + .sum() } - // pad the tail of the body data - let pad_len = pad_to_alignment(write_options.alignment, fbb.arrow_data.len()); - fbb.arrow_data.extend_from_slice(&PADDING[..pad_len]); + let header_len = fbb.fbb.finished_data().len(); + max_msg_size = max_msg_size.saturating_sub(header_len).max(1); - let buffers = create_vector(&mut fbb.overhead, &mut fbb.fbb, &fbb.buffers); - let nodes = create_vector(&mut fbb.overhead, &mut fbb.fbb, &fbb.nodes); - let variadic_buffer = (!variadic_buffer_counts.is_empty()) - .then(|| create_vector(&mut fbb.overhead, &mut fbb.fbb, &variadic_buffer_counts)); + let total_size = get_arr_batch_size(arr_datas.iter(), write_options.alignment)?; - let root = fbb.with_builder(RecordBatchBuilder::new, |mut bldr| { - // Now, this is the only piece of data for this builder that didn't already come from - // flatbuffers - the rest in this closure are produced from `create_vector` or a builder's - // `finish()`, so their memory is already accounted for. That's why we use the escape hatch - // for everything else. - // - // See the comment on `add_data` for an explanation of why this is needed. - bldr.add_data(RecordBatchBuilder::add_length, n_rows); + if total_size == 0 { + // If there's nothing of size to encode, then existing tests say we shouldn't even return + // an empty EncodedData - we just return no EncodedData + return Ok(vec![]); + } - bldr.builder.add_nodes(nodes); - bldr.builder.add_buffers(buffers); - if let Some(c) = compression { - bldr.builder.add_compression(c); - } - if let Some(v) = variadic_buffer { - bldr.builder.add_variadicBufferCounts(v); - } + let n_batches = bit_util::ceil(total_size, max_msg_size); + // let rows_per_batch = n_rows / n_batches; + let mut out = Vec::with_capacity(n_batches); - bldr.builder.finish() - }); + println!("🐈 n_rows: {n_rows}, n_batches: {n_batches}, header_len: {header_len}, total_size: {total_size}, max_msg_size: {max_msg_size}"); - // Unions are stored as a tag and offset, which (afaict) are both the same size, and we're - // generally following a policy of accounting for memory as soon as possible, so we need to - // check that here (since `encode_root` returns a Union offset) - fbb.overhead += SIZE_UOFFSET * 2; - let root = encode_root(&mut fbb, root); + let mut offset = 0; + while offset < n_rows { + let length = (1..) + .find(|len| { + // If we've exhausted the available length of the array datas, then just return - + // we've got it. 
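+ // (in effect this is a linear search: `len` grows one row at a time until the slice + // `[offset, offset + len)` no longer fits in `max_msg_size`, and the `- 1` below backs + // off to the last length that did fit)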
+ if arr_datas.iter().any(|arr| arr.len() < offset + len) { + return true; + } - let arrow_len = fbb.arrow_data.len() as i64; - let msg = fbb.with_builder(MessageBuilder::new, |mut bldr| { - bldr.add_data(MessageBuilder::add_version, write_options.metadata_version); - bldr.add_data(MessageBuilder::add_header_type, header_type); - bldr.add_data(MessageBuilder::add_bodyLength, arrow_len); + let arr_iter = arr_datas.iter().map(|arr| arr.slice(offset, *len)); + + // we can unwrap this here b/c this only errors on malformed buffer-type/data-type + // combinations, and if any of these arrays had that, this function would've + // already short-circuited on an earlier call of this function + get_arr_batch_size(arr_iter, write_options.alignment).unwrap() > max_msg_size + }) + // Because this is an unbounded range, this will only panic if we reach usize::MAX and + // still haven't found anything. However, the inner part of this loop slices + // `ArrayData` with the number that we're iterating over, and that will panic if we + // reach usize::MAX because that will be bigger than it can handle to be sliced. + .unwrap() + - 1; - // `root` was already tracked when it was turned into a union - bldr.builder.add_header(root); - bldr.builder.finish() - }); + let new_arrs = arr_datas + .iter() + .map(|arr| arr.slice(offset, length)) + .collect::>(); + + fbb.reset_for_real_run(); + fbb.encode_array_datas( + &new_arrs, + length as i64, + &encode_root, + header_type, + write_options, + )?; - fbb.fbb.finish(msg, None); - let ipc_message = fbb.fbb.finished_data().to_vec(); + let finished_data = fbb.fbb.finished_data(); + println!( + "🐈 arrow_data: {}, finished_data: {} (n_rows: {length})", + fbb.arrow_data.len(), + finished_data.len() + ); - println!("🐈 ipc_message size: {}", ipc_message.len()); + out.push(EncodedData { + ipc_message: finished_data.to_vec(), + arrow_data: fbb.arrow_data.clone(), + }); + offset += length; + } - Ok(EncodedData { - ipc_message, - arrow_data: fbb.arrow_data, - }) + Ok(out) } #[derive(Default)] struct FlatBufferSizeTracker<'fbb> { - // tracks the amount of data that is going to be encoded into the final FlatBuffer (or is - // already encoded) but is not: - // 1. Already tracked by a different field here. E.g. We know that each node in `self.nodes` - // must be encoded into the final thing, so we don't bother tracking the space that those - // will take up in `overhead`, and instead just check `nodes.len() * size_of()` - // instead. - // 2. raw ArrayData that will be encoded into the final product. This overhead tracking is done - // to figure out how many RecordBatches we can encode into each FlatBuffer, so we just need - // to keep track of everything besides those batches/ArrayData. - pub overhead: usize, // the builder and backing flatbuffer that we use to write the arrow data into. 
fbb: FlatBufferBuilder<'fbb>, // tracks the data in `arrow_data` - `buffers` contains the offsets and length of different @@ -1532,27 +1462,113 @@ struct FlatBufferSizeTracker<'fbb> { // the raw array data that we need to send across the wire arrow_data: Vec<u8>, nodes: Vec<crate::FieldNode>, + dry_run: bool, } impl<'fbb> FlatBufferSizeTracker<'fbb> { - fn with_builder<'s, B, BFn, F, O>(&'s mut self, b_fn: BFn, f: F) -> O - where - BFn: Fn(&'s mut FlatBufferBuilder<'fbb>) -> B, - F: Fn(BuilderTracker<'s, B>) -> O, - { - let builder = b_fn(&mut self.fbb); - let tracker = BuilderTracker { - overhead: &mut self.overhead, - builder, - }; + #[must_use] + fn for_dry_run(capacity: usize) -> Self { + Self { + dry_run: true, + buffers: Vec::with_capacity(capacity), + nodes: Vec::with_capacity(capacity), + ..Self::default() + } + } - f(tracker) + fn reset_for_real_run(&mut self) { + self.fbb.reset(); + self.buffers.clear(); + self.arrow_data.clear(); + self.nodes.clear(); + self.dry_run = false; + + // resetting in place (rather than just creating a new `Self`) helps us avoid completely + // re-allocating the buffers. So everything should be allocated correctly now besides + // arrow_data. If we're calling this after only a dry run, `arrow_data` shouldn't have + // anything written into it, but we call this after every real run loop, so we still + // need to clear it. } - pub fn total_overhead(&self) -> usize { - self.overhead - + (self.nodes.len() * size_of::<<crate::FieldNode as Push>::Output>()) - + (self.buffers.len() * size_of::<<crate::Buffer as Push>::Output>()) + fn encode_array_datas( + &mut self, + arr_datas: &[ArrayData], + n_rows: i64, + encode_root: impl FnOnce( + &mut FlatBufferSizeTracker, + WIPOffset, + ) -> WIPOffset, + header_type: MessageHeader, + write_options: &IpcWriteOptions, + ) -> Result<(), ArrowError> { + let batch_compression_type = write_options.batch_compression_type; + + let compression = batch_compression_type.map(|compression_type| { + let mut builder = BodyCompressionBuilder::new(&mut self.fbb); + builder.add_method(BodyCompressionMethod::BUFFER); + builder.add_codec(compression_type); + builder.finish() + }); + + let mut variadic_buffer_counts = Vec::<i64>::default(); + let mut offset = 0; + + // TODO: split these up here to only fit as many as can be fit + // - get `fbb.fbb.unfinished_data` after writing a fake piece of data for `buffers`.
that + // determines the real overhead + // - split up the array datas based on how much space they take up alone + for array in arr_datas { + self.write_array_data( + array, + &mut offset, + array.len(), + array.null_count(), + write_options, + )?; + + append_variadic_buffer_counts(&mut variadic_buffer_counts, array); + } + + // pad the tail of the body data + let pad_len = pad_to_alignment(write_options.alignment, self.arrow_data.len()); + self.arrow_data.extend_from_slice(&PADDING[..pad_len]); + + let buffers = self.fbb.create_vector(&self.buffers); + let nodes = self.fbb.create_vector(&self.nodes); + let variadic_buffer = (!variadic_buffer_counts.is_empty()) + .then(|| self.fbb.create_vector(&variadic_buffer_counts)); + + let root = { + let mut builder = RecordBatchBuilder::new(&mut self.fbb); + + builder.add_length(n_rows); + builder.add_nodes(nodes); + builder.add_buffers(buffers); + if let Some(c) = compression { + builder.add_compression(c); + } + if let Some(v) = variadic_buffer { + builder.add_variadicBufferCounts(v); + } + + builder.finish() + }; + + let root = encode_root(self, root); + + let arrow_len = self.arrow_data.len() as i64; + let msg = { + let mut builder = MessageBuilder::new(&mut self.fbb); + builder.add_version(write_options.metadata_version); + builder.add_header_type(header_type); + builder.add_bodyLength(arrow_len); + + // `root` was already tracked when it was turned into a union + builder.add_header(root); + builder.finish() + }; + + self.fbb.finish(msg, None); + Ok(()) } fn write_array_data( @@ -1591,6 +1607,7 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { Some(buffer) => buffer.inner().sliced(), }; + println!("🐈 writing null buffer"); self.write_buffer( null_buffer.as_slice(), offset, @@ -1600,40 +1617,21 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { } let mut write_byte_array_byffers = |(offsets, values): (Buffer, Buffer)| { - for buffer in [offsets, values] { - self.write_buffer( - buffer.as_slice(), - offset, - compression_codec, - write_options.alignment, - )?; + for (idx, buffer) in [offsets, values].into_iter().enumerate() { + println!("🐈 writing buf {idx} of tuple"); + self.write_buffer(&buffer, offset, compression_codec, write_options.alignment)?; } Ok::<_, ArrowError>(()) }; + println!("🐈 writing array dt: {:?}", array_data.data_type()); + match array_data.data_type() { DataType::Binary | DataType::Utf8 => { - write_byte_array_byffers(get_byte_array_buffers::(array_data))? + write_byte_array_byffers(array_data.get_byte_array_buffers::())? } DataType::LargeBinary | DataType::LargeUtf8 => { - write_byte_array_byffers(get_byte_array_buffers::(array_data))? - } - DataType::BinaryView | DataType::Utf8View => { - // Slicing the views buffer is safe and easy, - // but pruning unneeded data buffers is much more nuanced since it's complicated - // to prove that no views reference the pruned buffers - // - // Current implementation just serialize the raw arrays as given and not try to optimize anything. - // If users wants to "compact" the arrays prior to sending them over IPC, - // they should consider the gc API suggested in #5513 - for buffer in array_data.buffers() { - self.write_buffer( - buffer.as_slice(), - offset, - compression_codec, - write_options.alignment, - )?; - } + write_byte_array_byffers(array_data.get_byte_array_buffers::())? 
} dt if DataType::is_numeric(dt) || DataType::is_temporal(dt) @@ -1703,10 +1701,17 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { sliced_child_data.null_count(), write_options, )?; - println!("🐈 total_overhead: {}", self.total_overhead()); return Ok(()); } _ => { + // This accommodates even the `View` types (e.g. BinaryView and Utf8View): + // Slicing the views buffer is safe and easy, + // but pruning unneeded data buffers is much more nuanced since it's complicated + // to prove that no views reference the pruned buffers + // + // The current implementation just serializes the raw arrays as given and does not try to optimize anything. + // If users want to "compact" the arrays prior to sending them over IPC, + // they should consider the gc API suggested in #5513 for buffer in array_data.buffers() { self.write_buffer(buffer, offset, compression_codec, write_options.alignment)?; } @@ -1714,7 +1719,8 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { } let mut write_arr = |arr: &ArrayData| { - for data_ref in arr.child_data() { + for (idx, data_ref) in arr.child_data().iter().enumerate() { + println!("🐈 writing child array {idx}"); self.write_array_data( data_ref, offset, @@ -1726,15 +1732,13 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { Ok::<_, ArrowError>(()) }; - let res = match array_data.data_type() { + match array_data.data_type() { DataType::Dictionary(_, _) => Ok(()), // unslice the run encoded array. DataType::RunEndEncoded(_, _) => write_arr(&unslice_run_array(array_data.clone())?), // recursively write out nested structures _ => write_arr(array_data), - }; - println!("🐈 total_overhead: {}", self.total_overhead()); - res + } } /// Write a buffer into `arrow_data`, a vector of bytes, and adds its @@ -1757,19 +1761,28 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { compression_codec: Option<CompressionCodec>, alignment: u8, ) -> Result<(), ArrowError> { - let len: i64 = match compression_codec { - Some(compressor) => compressor.compress_to_vec(buffer, &mut self.arrow_data)?, - None => { - self.arrow_data.extend_from_slice(buffer); - buffer.len() + println!("🐈 writing buffer of size {}", buffer.len()); + + let len: i64 = if self.dry_run { + // Flatbuffers will essentially optimize this away if we say the len is 0 for all of + // these, so to make sure the header size is the same in the dry run and in the real + // thing, we need to set this to a non-zero value + 1 + } else { + match compression_codec { + Some(compressor) => compressor.compress_to_vec(buffer, &mut self.arrow_data)?, + None => { + self.arrow_data.extend_from_slice(buffer); + buffer.len() + } } - } - .try_into() - .map_err(|e| { - ArrowError::InvalidArgumentError(format!( - "Could not convert compressed size to i64: {e}" - )) - })? + .try_into() + .map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Could not convert compressed size to i64: {e}" + )) + })?
+ }; // make new index entry self.buffers.push(crate::Buffer::new(*offset, len)); @@ -1777,38 +1790,11 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { let pad_len = pad_to_alignment(alignment, len as usize); self.arrow_data.extend_from_slice(&PADDING[..pad_len]); - // We don't add pad_len to self.overhead here mainly because it feels over-zealous to do - // so - Once compression is incorporated into this sort of schema, where you're trying to - // count bits to fit as much data into these fields as possible, - *offset += len + (pad_len as i64); Ok(()) } } -struct BuilderTracker<'tracker, B> { - overhead: &'tracker mut usize, - pub builder: B, -} - -impl<'tracker, B> BuilderTracker<'tracker, B> { - // This is a method you have to be careful about if you want exact accounting - Specifically, - // there's no way (as far as I can figure out, at least) for this method to not count overhead - // when the data being added was already pulled from the flatbuffer it's being added to. For - // example, if we call `create_vector` on our FlatBufferSizeTracker, then add that to a `B` - // using this method, the `Vector`'s size has already been accounted for, so we don't want to - // track it again here. Same thing if we produce a value from a Builder's `finish()` method, - // then add it with another builder. - // - // Now, we could to like `D: FlatBufferSized` and then impl that trait for everything and add - // specializations for `flatbuffers::WIPOffset` and such... IF we had specialization. Alas, any - // sort of blanket-impl-plus-caveats solution requires specialization. - fn add_data(&mut self, f: F, data: D) { - f(&mut self.builder, data); - *self.overhead += std::mem::size_of::(); - } -} - #[cfg(test)] mod tests { use std::io::Cursor; From ce7d0dab07c7c996cd74b77b1fdda7cb9c7342bf Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Fri, 1 Nov 2024 18:27:43 -0600 Subject: [PATCH 03/20] Oh we're fixing things even more --- .../src/array/fixed_size_list_array.rs | 4 +- arrow-data/src/data.rs | 187 +++++++++++------- arrow-flight/src/encode.rs | 10 +- arrow-ipc/src/writer.rs | 33 ++-- 4 files changed, 142 insertions(+), 92 deletions(-) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 72855cef1f04..37446e234522 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -521,7 +521,9 @@ mod tests { } #[test] - #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")] + #[should_panic( + expected = "Attempting to slice an array with offset 0, len 9 when self.len is 8" + )] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 245ea0d4009e..b704f8545b5e 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -423,73 +423,102 @@ impl ArrayData { size } - /// Returns the total number of the bytes of memory occupied by - /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]). - /// - /// This is approximately the number of bytes if a new - /// [`ArrayData`] was formed by creating new [`Buffer`]s with - /// exactly the data needed. - /// - /// For example, a [`DataType::Int64`] with `100` elements, - /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. 
If - /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its - /// first `20` elements, then [`Self::get_slice_memory_size`] on the - /// sliced [`ArrayData`] would return `20 * 8 = 160`. - pub fn get_slice_memory_size(&self) -> Result { - let mut result: usize = 0; + pub fn get_slice_memory_size_with_alignment(&self, alignment: Option) -> Result { let layout = layout(&self.data_type); - for spec in layout.buffers.iter() { - match spec { + // TODO: I pulled this from arrow-ipc, we should extract it to something where they can + // share code + #[inline] + fn pad_to_alignment(alignment: u8, len: usize) -> usize { + let a = usize::from(alignment - 1); + ((len + a) & !a) - len + } + + let mut result = layout.buffers.iter().map(|spec| { + let size = match spec { BufferSpec::FixedWidth { byte_width, .. } => { - let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| { + let num_elems = match self.data_type { + // On these offsets-plus-values datatypes, their offset buffer (which is + // FixedWidth and thus the one we're looking at right now in this + // FixedWidth arm) contains self.len + 1 elements due to the way the + // offsets are encoded as overlapping pairs of (start, (end+start), + // (end+start), etc). + DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => self.len + 1, + _ => self.len + }; + + num_elems.checked_mul(*byte_width).ok_or_else(|| { ArrowError::ComputeError( "Integer overflow computing buffer size".to_string(), ) - })?; - result += buffer_size; + }) } - BufferSpec::VariableWidth => { - result += match &self.data_type { - DataType::Utf8 | DataType::Binary => { - let (buf1, buf2) = self.get_byte_array_buffers::(); - buf1.len() + buf2.len() - } - DataType::LargeUtf8 | DataType::LargeBinary => { - let (buf1, buf2) = self.get_byte_array_buffers::(); - buf1.len() + buf2.len() - } - // `FlatBufferSizeTracker::write_array_data` just writes these just as raw - // buffer data as opposed to using offsets or anything - DataType::BinaryView | DataType::Utf8View => self.buffers() + BufferSpec::VariableWidth => match &self.data_type { + // UTF8 and Binary have two buffers - one for the offsets, one for the values. + // When calculating size, the offset buffer's size is calculated by the + // FixedWidth buffer arm above, so we just need to count the offsets here. + DataType::Utf8 | DataType::Binary => { + self.typed_offsets::() + .map(|off| (off[self.len] - off[0]) as usize) + } + DataType::LargeUtf8 | DataType::LargeBinary => { + self.typed_offsets::() + .map(|off| (off[self.len] - off[0]) as usize) + } + // `FlatBufferSizeTracker::write_array_data` just writes these just as raw + // buffer data as opposed to using offsets or anything + DataType::BinaryView | DataType::Utf8View => Ok( + self.buffers() .iter() .map(|buf| buf.len()) - .sum::(), - dt => { - return Err(ArrowError::NotYetImplemented(format!( - "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {dt}", - ))) - } - }; - } - BufferSpec::BitMap => { - let buffer_size = bit_util::ceil(self.len, 8); - result += buffer_size; - } - BufferSpec::AlwaysNull => { - // Nothing to do + .sum::() + ), + dt => Err(ArrowError::NotYetImplemented(format!( + "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. 
Got {dt}", + ))), } - } - } + BufferSpec::BitMap => Ok(bit_util::ceil(self.len, 8)), + // Nothing to do when AlwaysNull + BufferSpec::AlwaysNull => Ok(0) + }?; + + Ok(size + alignment.map_or(0, |a| pad_to_alignment(a, size))) + }).sum::>()?; + + println!("🏹 after sum: {result}"); if self.nulls().is_some() { result += bit_util::ceil(self.len, 8); } + if let Some(a) = alignment { + result += pad_to_alignment(a, result); + } + + println!("🏹 after null mask: {result}"); + for child in &self.child_data { - result += child.get_slice_memory_size()?; + result += child.get_slice_memory_size_with_alignment(alignment)?; } + Ok(result) + + } + + /// Returns the total number of the bytes of memory occupied by + /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]). + /// + /// This is approximately the number of bytes if a new + /// [`ArrayData`] was formed by creating new [`Buffer`]s with + /// exactly the data needed. + /// + /// For example, a [`DataType::Int64`] with `100` elements, + /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If + /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its + /// first `20` elements, then [`Self::get_slice_memory_size`] on the + /// sliced [`ArrayData`] would return `20 * 8 = 160`. + pub fn get_slice_memory_size(&self) -> Result { + self.get_slice_memory_size_with_alignment(None) } /// Returns the offsets and values [`Buffer`]s for a ByteArray with offset type `O` @@ -497,6 +526,12 @@ impl ArrayData { /// In particular, this handles re-encoding the offsets if they don't start at `0`, /// slicing the values buffer as appropriate. This helps reduce the encoded /// size of sliced arrays, as values that have been sliced away are not encoded + /// + /// # Panics + /// + /// Panics if self.buffers does not contain at least 2 buffers (this code expects that the + /// first will contain the offsets for this variable-length array and the other will contain + /// the values) pub fn get_byte_array_buffers(&self) -> (Buffer, Buffer) { if self.is_empty() { return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); @@ -504,8 +539,9 @@ impl ArrayData { // get the buffer of offsets, now shifted so they are shifted to be accurate to the slice // of values that we'll be taking (e.g. if they previously said [0, 3, 5, 7], but we slice - // to only get the last two, they'll be shifted to be [0, 2], since those would be the - // offsets for the last two values in this shifted slice). + // to only get the last offset, they'll be shifted to be [0, 2], since that would be the + // offset pair for the last value in this shifted slice). + // also, in this example, original_start_offset would be 5 and len would be 2. let (offsets, original_start_offset, len) = self.reencode_offsets::(); let values = self.buffers()[1].slice_with_length(original_start_offset, len); (offsets, values) @@ -517,25 +553,23 @@ impl ArrayData { /// # Panics /// /// Will panic if you call this on an `ArrayData` that does not have a buffer of offsets as the - /// very first buffer (like most variable length arrays) + /// very first buffer (i.e. 
expects this to be a valid variable-length array) pub fn reencode_offsets(&self) -> (Buffer, usize, usize) { // first we want to see: what is the offset of this `ArrayData` into the buffer (which is a // buffer of offsets into the buffer of data) let orig_offset = self.offset(); - let o_size = mem::size_of::(); - // and then slice the buffer to only get the part we care about, keeping in mind the size - // of the type that we're treating it as (since it's just encoded as a bunch of u8s as the - // base level) - let mut offsets = - self.buffers()[0].slice_with_length(orig_offset * o_size, self.len * o_size); + // and also we need to get the buffer of offsets + let offsets_buf = &self.buffers()[0]; // then we have to turn it into a typed slice that we can read and manipulate below if - // needed - let offsets_slice = offsets.typed_data::(); + // needed, and slice it according to the size that we need to return + // we need to do `self.len + 1` instead of just `self.len` because the offsets are encoded + // as overlapping pairs - e.g. an array of two items, starting at idx 0, and spanning two + // each, would be encoded as [0, 2, 4]. + let offsets_slice = &offsets_buf.typed_data::()[orig_offset..][..self.len + 1]; // and now we can see what the very first offset and the very last offset is - let start_offset_o = offsets_slice.first().unwrap(); - let start_offset = start_offset_o.as_usize(); + let start_offset = offsets_slice.first().unwrap(); let end_offset = offsets_slice.last().unwrap().as_usize(); // if the start offset is just 0, i.e. it points to the very beginning of the values of @@ -543,8 +577,12 @@ impl ArrayData { // all the offsets in this buffer are already offset by 0. // But if it's not 0, then we need to shift them all so that the offsets don't start at // some weird value. - if start_offset != 0 { - offsets = offsets_slice.iter().map(|x| *x - *start_offset_o).collect(); + let (start_offset, offsets) = match start_offset.as_usize() { + 0 => (0, offsets_slice.to_vec().into()), + start => ( + start, + offsets_slice.iter().map(|x| *x - *start_offset).collect(), + ), }; (offsets, start_offset, end_offset - start_offset) @@ -590,11 +628,10 @@ impl ArrayData { if let DataType::Struct(_) = self.data_type() { // Slice into children - let new_offset = self.offset + offset; - let new_data = ArrayData { + ArrayData { data_type: self.data_type().clone(), len: length, - offset: new_offset, + offset: self.offset + offset, buffers: self.buffers.clone(), // Slice child data, to propagate offsets down to them child_data: self @@ -603,9 +640,7 @@ impl ArrayData { .map(|data| data.slice(offset, length.min(data.len()))) .collect(), nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)), - }; - - new_data + } } else { let mut new_data = self.clone(); @@ -942,6 +977,7 @@ impl ArrayData { let required_len = (len + self.offset) * mem::size_of::(); if buffer.len() < required_len { + // panic!("just get it over with"); return Err(ArrowError::InvalidArgumentError(format!( "Buffer {} of {} isn't large enough. 
Expected {} bytes got {}", idx, @@ -951,7 +987,7 @@ impl ArrayData { ))); } - Ok(&buffer.typed_data::()[self.offset..self.offset + len]) + Ok(&buffer.typed_data::()[self.offset..][..len]) } /// Does a cheap sanity check that the `self.len` values in `buffer` are valid @@ -2249,11 +2285,12 @@ mod tests { ) .unwrap(); let string_data_slice = string_data.slice(1, 2); + + let data_len = string_data.get_slice_memory_size().unwrap(); + println!("🥩 checking slice, have {data_len}!"); + let slice_len = string_data_slice.get_slice_memory_size().unwrap(); //4 bytes of offset and 2 bytes of data reduced by slicing. - assert_eq!( - string_data.get_slice_memory_size().unwrap() - 6, - string_data_slice.get_slice_memory_size().unwrap() - ); + assert_eq!(data_len - 6, slice_len); } #[test] diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index ef849b1cdc01..600cf827d4ee 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -983,12 +983,16 @@ mod tests { ))], ); - struct_builder.field_builder::>>>(0).unwrap().append_value(vec![Some("a"), None, Some("b")]); + struct_builder.field_builder::>>>(0) + .unwrap() + .append_value(vec![Some("a"), None, Some("b")]); struct_builder.append(true); let arr1 = struct_builder.finish(); - struct_builder.field_builder::>>>(0).unwrap().append_value(vec![Some("c"), None, Some("d")]); + struct_builder.field_builder::>>>(0) + .unwrap() + .append_value(vec![Some("c"), None, Some("d")]); struct_builder.append(true); let arr2 = struct_builder.finish(); @@ -1497,7 +1501,7 @@ mod tests { .encoded_batch_with_size(&batch, &mut dict_tracker, &write_opts, max_flight_data_size) .unwrap() .1; - assert_eq!(split.len(), 3); + assert_eq!(split.len(), 2); /*assert_eq!( split.iter().map(|batch| batch.num_rows()).sum::(), n_rows diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 6ee64ca2c3ab..3aafea427c2c 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1370,12 +1370,17 @@ fn encode_array_datas( fn get_arr_batch_size>( iter: impl Iterator, - alignment: u8, + write_options: &IpcWriteOptions, ) -> Result { iter.map(|arr| { - arr.borrow() - .get_slice_memory_size() - .map(|size| size + pad_to_alignment(alignment, size)) + let arr = arr.borrow(); + arr.get_slice_memory_size_with_alignment(Some(write_options.alignment)) + .map(|size| if has_validity_bitmap(arr.data_type(), write_options) && arr.nulls().is_none() { + let null_len = bit_util::ceil(arr.len(), 8); + size + null_len + pad_to_alignment(write_options.alignment, null_len) + } else { + size + }) }) .sum() } @@ -1383,24 +1388,25 @@ fn encode_array_datas( let header_len = fbb.fbb.finished_data().len(); max_msg_size = max_msg_size.saturating_sub(header_len).max(1); - let total_size = get_arr_batch_size(arr_datas.iter(), write_options.alignment)?; + let total_size = get_arr_batch_size(arr_datas.iter(), write_options)?; - if total_size == 0 { + /*if total_size == 0 { // If there's nothing of size to encode, then existing tests say we shouldn't even return // an empty EncodedData - we just return no EncodedData return Ok(vec![]); - } + }*/ let n_batches = bit_util::ceil(total_size, max_msg_size); - // let rows_per_batch = n_rows / n_batches; let mut out = Vec::with_capacity(n_batches); println!("🐈 n_rows: {n_rows}, n_batches: {n_batches}, header_len: {header_len}, total_size: {total_size}, max_msg_size: {max_msg_size}"); let mut offset = 0; - while offset < n_rows { + while offset < n_rows.max(1) { let length = (1..) 
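+ // (the range is unbounded, but the closure below returns `true` as soon as any array + // runs out of rows or the slice stops fitting, so the search always terminates)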
.find(|len| { + println!("🛑 break"); + // If we've exhausted the available length of the array datas, then just return - // we've got it. if arr_datas.iter().any(|arr| arr.len() < offset + len) { @@ -1412,14 +1418,13 @@ fn encode_array_datas( // we can unwrap this here b/c this only errors on malformed buffer-type/data-type // combinations, and if any of these arrays had that, this function would've // already short-circuited on an earlier call of this function - get_arr_batch_size(arr_iter, write_options.alignment).unwrap() > max_msg_size + get_arr_batch_size(arr_iter, write_options).unwrap() > max_msg_size }) // Because this is an unbounded range, this will only panic if we reach usize::MAX and // still haven't found anything. However, the inner part of this loop slices // `ArrayData` with the number that we're iterating over, and that will panic if we // reach usize::MAX because that will be bigger than it can handle to be sliced. - .unwrap() - - 1; + .unwrap() - 1; let new_arrs = arr_datas .iter() @@ -1761,7 +1766,7 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { compression_codec: Option, alignment: u8, ) -> Result<(), ArrowError> { - println!("🐈 writing buffer of size {}", buffer.len()); + let start_len = self.arrow_data.len(); let len: i64 = if self.dry_run { // Flatbuffers will essentially optimize this away if we say the len is 0 for all of @@ -1790,6 +1795,8 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { let pad_len = pad_to_alignment(alignment, len as usize); self.arrow_data.extend_from_slice(&PADDING[..pad_len]); + println!("🐈 writing buffer of size {} - arrow_data went {} => {}", buffer.len(), start_len, self.arrow_data.len()); + *offset += len + (pad_len as i64); Ok(()) } From fb5c35fdaaa9217ec1ba9760c7b2a5d3fe9cd183 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Mon, 4 Nov 2024 15:01:27 -0700 Subject: [PATCH 04/20] ok it finally works correctly --- arrow-buffer/src/buffer/immutable.rs | 6 +- arrow-data/src/data.rs | 85 +---------- arrow-flight/src/encode.rs | 58 ++++--- arrow-ipc/src/writer.rs | 221 +++++++++++++++++---------- 4 files changed, 190 insertions(+), 180 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 8d1a46583fca..816ea1ad8371 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -261,11 +261,11 @@ impl Buffer { } /// Returns a slice of this buffer starting at a certain bit offset. - /// If the offset is byte-aligned the returned buffer is a shallow clone, + /// If the offset and length are byte-aligned the returned buffer is a shallow clone, /// otherwise a new buffer is allocated and filled with a copy of the bits in the range. 
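+ /// + /// For example (illustrative): on a 4-byte buffer, `bit_slice(8, 16)` can be returned as + /// a zero-copy slice of bytes `1..3`, while `bit_slice(3, 16)` has to copy because the + /// offset is not byte-aligned.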
pub fn bit_slice(&self, offset: usize, len: usize) -> Self { - if offset % 8 == 0 { - return self.slice(offset / 8); + if offset % 8 == 0 && len % 8 == 0 { + return self.slice_with_length(offset / 8, len / 8); } bitwise_unary_op_helper(self, offset, len, |a| a) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index b704f8545b5e..2537378a9336 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -423,7 +423,10 @@ impl ArrayData { size } - pub fn get_slice_memory_size_with_alignment(&self, alignment: Option) -> Result { + pub fn get_slice_memory_size_with_alignment( + &self, + alignment: Option, + ) -> Result { let layout = layout(&self.data_type); // TODO: I pulled this from arrow-ipc, we should extract it to something where they can @@ -485,24 +488,16 @@ impl ArrayData { Ok(size + alignment.map_or(0, |a| pad_to_alignment(a, size))) }).sum::>()?; - println!("🏹 after sum: {result}"); - if self.nulls().is_some() { - result += bit_util::ceil(self.len, 8); - } - - if let Some(a) = alignment { - result += pad_to_alignment(a, result); + let null_len = bit_util::ceil(self.len, 8); + result += null_len + alignment.map_or(0, |a| pad_to_alignment(a, null_len)); } - println!("🏹 after null mask: {result}"); - for child in &self.child_data { result += child.get_slice_memory_size_with_alignment(alignment)?; } Ok(result) - } /// Returns the total number of the bytes of memory occupied by @@ -521,73 +516,6 @@ impl ArrayData { self.get_slice_memory_size_with_alignment(None) } - /// Returns the offsets and values [`Buffer`]s for a ByteArray with offset type `O` - /// - /// In particular, this handles re-encoding the offsets if they don't start at `0`, - /// slicing the values buffer as appropriate. This helps reduce the encoded - /// size of sliced arrays, as values that have been sliced away are not encoded - /// - /// # Panics - /// - /// Panics if self.buffers does not contain at least 2 buffers (this code expects that the - /// first will contain the offsets for this variable-length array and the other will contain - /// the values) - pub fn get_byte_array_buffers(&self) -> (Buffer, Buffer) { - if self.is_empty() { - return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); - } - - // get the buffer of offsets, now shifted so they are shifted to be accurate to the slice - // of values that we'll be taking (e.g. if they previously said [0, 3, 5, 7], but we slice - // to only get the last offset, they'll be shifted to be [0, 2], since that would be the - // offset pair for the last value in this shifted slice). - // also, in this example, original_start_offset would be 5 and len would be 2. - let (offsets, original_start_offset, len) = self.reencode_offsets::(); - let values = self.buffers()[1].slice_with_length(original_start_offset, len); - (offsets, values) - } - - /// Common functionality for re-encoding offsets. Returns the new offsets as well as - /// original start offset and length for use in slicing child data. - /// - /// # Panics - /// - /// Will panic if you call this on an `ArrayData` that does not have a buffer of offsets as the - /// very first buffer (i.e. 
expects this to be a valid variable-length array) - pub fn reencode_offsets(&self) -> (Buffer, usize, usize) { - // first we want to see: what is the offset of this `ArrayData` into the buffer (which is a - // buffer of offsets into the buffer of data) - let orig_offset = self.offset(); - // and also we need to get the buffer of offsets - let offsets_buf = &self.buffers()[0]; - - // then we have to turn it into a typed slice that we can read and manipulate below if - // needed, and slice it according to the size that we need to return - // we need to do `self.len + 1` instead of just `self.len` because the offsets are encoded - // as overlapping pairs - e.g. an array of two items, starting at idx 0, and spanning two - // each, would be encoded as [0, 2, 4]. - let offsets_slice = &offsets_buf.typed_data::()[orig_offset..][..self.len + 1]; - - // and now we can see what the very first offset and the very last offset is - let start_offset = offsets_slice.first().unwrap(); - let end_offset = offsets_slice.last().unwrap().as_usize(); - - // if the start offset is just 0, i.e. it points to the very beginning of the values of - // this `ArrayData`, then we don't need to shift anything to be a 'correct' offset 'cause - // all the offsets in this buffer are already offset by 0. - // But if it's not 0, then we need to shift them all so that the offsets don't start at - // some weird value. - let (start_offset, offsets) = match start_offset.as_usize() { - 0 => (0, offsets_slice.to_vec().into()), - start => ( - start, - offsets_slice.iter().map(|x| *x - *start_offset).collect(), - ), - }; - - (offsets, start_offset, end_offset - start_offset) - } - /// Returns the total number of bytes of memory occupied /// physically by this [`ArrayData`] and all its [`Buffer`]s and /// children. (See also diagram on [`ArrayData`]). @@ -2287,7 +2215,6 @@ mod tests { let string_data_slice = string_data.slice(1, 2); let data_len = string_data.get_slice_memory_size().unwrap(); - println!("🥩 checking slice, have {data_len}!"); let slice_len = string_data_slice.get_slice_memory_size().unwrap(); //4 bytes of offset and 2 bytes of data reduced by slicing. assert_eq!(data_len - 6, slice_len); diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 600cf827d4ee..7b14c9d18d20 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -327,6 +327,10 @@ impl FlightDataEncoder { /// Encodes batch into one or more `FlightData` messages in self.queue fn encode_batch(&mut self, batch: RecordBatch) -> Result<()> { + if batch.num_rows() == 0 { + return Ok(()); + } + let schema = match &self.schema { Some(schema) => schema.clone(), // encode the schema if this is the first time we have seen it @@ -338,7 +342,6 @@ impl FlightDataEncoder { DictionaryHandling::Hydrate => hydrate_dictionaries(&batch, schema)?, }; - // ASK: Is there some sort of order that these need to go through in? Does this mess it up? 
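+ // Dictionary batches must be sent before the record batches that reference them; + // `encode_batch` returns them separately so they can be queued in that order below.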
let (flight_dictionaries, flight_batches) = self .encoder .encode_batch(&batch, self.max_flight_data_size)?; @@ -595,7 +598,7 @@ impl FlightIpcEncoder { } /// Convert a `RecordBatch` to a Vec of `FlightData` representing - /// dictionaries and a `FlightData` representing the batch + /// dictionaries and a Vec of `FlightData`s representing the batch fn encode_batch( &mut self, batch: &RecordBatch, @@ -1200,6 +1203,11 @@ mod tests { .into_iter() .collect::(); + let mut field_types = union_fields.iter().map(|(_, field)| field.data_type()); + let dict_list_ty = field_types.next().unwrap(); + let struct_ty = field_types.next().unwrap(); + let string_ty = field_types.next().unwrap(); + let struct_fields = vec![Field::new_list( "dict_list", Field::new_dictionary("item", DataType::UInt16, DataType::Utf8, true), @@ -1218,9 +1226,9 @@ mod tests { type_id_buffer, None, vec![ - Arc::new(arr1) as Arc, - new_null_array(union_fields.iter().nth(1).unwrap().1.data_type(), 1), - new_null_array(union_fields.iter().nth(2).unwrap().1.data_type(), 1), + Arc::new(arr1), + new_null_array(struct_ty, 1), + new_null_array(string_ty, 1), ], ) .unwrap(); @@ -1236,9 +1244,9 @@ mod tests { type_id_buffer, None, vec![ - new_null_array(union_fields.iter().next().unwrap().1.data_type(), 1), + new_null_array(dict_list_ty, 1), Arc::new(arr2), - new_null_array(union_fields.iter().nth(2).unwrap().1.data_type(), 1), + new_null_array(string_ty, 1), ], ) .unwrap(); @@ -1249,8 +1257,8 @@ mod tests { type_id_buffer, None, vec![ - new_null_array(union_fields.iter().next().unwrap().1.data_type(), 1), - new_null_array(union_fields.iter().nth(1).unwrap().1.data_type(), 1), + new_null_array(dict_list_ty, 1), + new_null_array(struct_ty, 1), Arc::new(StringArray::from(vec!["e"])), ], ) @@ -1725,7 +1733,6 @@ mod tests { /// Note this overhead will likely always be greater than zero to /// account for encoding overhead such as IPC headers and padding. /// - /// async fn verify_encoded_split(batch: RecordBatch, allowed_overage: usize) { let num_rows = batch.num_rows(); @@ -1735,28 +1742,39 @@ mod tests { for max_flight_data_size in [1024, 2021, 5000] { println!("Encoding {num_rows} with a maximum size of {max_flight_data_size}"); - let mut stream = FlightDataEncoderBuilder::new() + let stream = FlightDataEncoderBuilder::new() .with_max_flight_data_size(max_flight_data_size) // use 8-byte alignment - default alignment is 64 which produces bigger ipc data .with_options(IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap()) .build(futures::stream::iter([Ok(batch.clone())])); + let mut stream = FlightDataDecoder::new(stream); + let mut i = 0; while let Some(data) = stream.next().await.transpose().unwrap() { - let actual_data_size = flight_data_size(&data); + let actual_data_size = flight_data_size(&data.inner); let actual_overage = actual_data_size.saturating_sub(max_flight_data_size); - assert!( - actual_overage <= allowed_overage, - "encoded data[{i}]: actual size {actual_data_size}, \ - actual_overage: {actual_overage} \ - allowed_overage: {allowed_overage}" - ); + let is_1_row = + matches!(data.payload, DecodedPayload::RecordBatch(rb) if rb.num_rows() == 1); + + // If only 1 row was sent over via this recordBatch, there was no way to avoid + // going over the limit. 
There's currently no mechanism for splitting a single row
+                // of results over multiple messages, so we allow going over the limit if it's the
+                // bare minimum over (1 row)
+                if !is_1_row {
+                    assert!(
+                        actual_overage <= allowed_overage,
+                        "encoded data[{i}]: actual size {actual_data_size}, \
+                        actual_overage: {actual_overage} \
+                        allowed_overage: {allowed_overage}"
+                    );
+
+                    max_overage_seen = max_overage_seen.max(actual_overage);
+                }
 
                 i += 1;
-
-                max_overage_seen = max_overage_seen.max(actual_overage)
             }
         }
 
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index 3aafea427c2c..af188baf9014 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -1304,6 +1304,73 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize {
     }
 }
 
+/// Returns the offsets and values [`Buffer`]s for a ByteArray with offset type `O`
+///
+/// In particular, this handles re-encoding the offsets if they don't start at `0`,
+/// slicing the values buffer as appropriate. This helps reduce the encoded
+/// size of sliced arrays, as values that have been sliced away are not encoded
+///
+/// # Panics
+///
+/// Panics if self.buffers does not contain at least 2 buffers (this code expects that the
+/// first will contain the offsets for this variable-length array and the other will contain
+/// the values)
+pub fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, Buffer) {
+    if data.is_empty() {
+        return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into());
+    }
+
+    // get the buffer of offsets, now shifted so they are shifted to be accurate to the slice
+    // of values that we'll be taking (e.g. if they previously said [0, 3, 5, 7], but we slice
+    // to only get the last offset, they'll be shifted to be [0, 2], since that would be the
+    // offset pair for the last value in this shifted slice).
+    // also, in this example, original_start_offset would be 5 and len would be 2.
+    let (offsets, original_start_offset, len) = reencode_offsets::(data);
+    let values = data.buffers()[1].slice_with_length(original_start_offset, len);
+    (offsets, values)
+}
+
+/// Common functionality for re-encoding offsets. Returns the new offsets as well as
+/// original start offset and length for use in slicing child data.
+///
+/// # Panics
+///
+/// Will panic if you call this on an `ArrayData` that does not have a buffer of offsets as the
+/// very first buffer (i.e. expects this to be a valid variable-length array)
+pub fn reencode_offsets(data: &ArrayData) -> (Buffer, usize, usize) {
+    // first we want to see: what is the offset of this `ArrayData` into the buffer (which is a
+    // buffer of offsets into the buffer of data)
+    let orig_offset = data.offset();
+    // and also we need to get the buffer of offsets
+    let offsets_buf = &data.buffers()[0];
+
+    // then we have to turn it into a typed slice that we can read and manipulate below if
+    // needed, and slice it according to the size that we need to return.
+    // we need to do `data.len() + 1` instead of just `data.len()` because the offsets are
+    // encoded as overlapping pairs - e.g. an array of two items, starting at idx 0, and
+    // spanning two each, would be encoded as [0, 2, 4].
+    let offsets_slice = &offsets_buf.typed_data::()[orig_offset..][..data.len() + 1];
+
+    // and now we can see what the very first and the very last offsets are
+    let start_offset = offsets_slice.first().unwrap();
+    let end_offset = offsets_slice.last().unwrap().as_usize();
+
+    // if the start offset is just 0, i.e. it points to the very beginning of the values of
+    // this `ArrayData`, then we don't need to shift anything to be a 'correct' offset 'cause
+    // all the offsets in this buffer are already offset by 0.
+    // But if it's not 0, then we need to shift them all so that the offsets don't start at
+    // some weird value.
+    let (start_offset, offsets) = match start_offset.as_usize() {
+        0 => (0, offsets_slice.to_vec().into()),
+        start => (
+            start,
+            offsets_slice.iter().map(|x| *x - *start_offset).collect(),
+        ),
+    };
+
+    (offsets, start_offset, end_offset - start_offset)
+}
+
 /// Similar logic as [`get_byte_array_buffers()`] but slices the child array instead
 /// of a values buffer.
 fn get_list_array_buffers(data: &ArrayData) -> (Buffer, ArrayData) {
@@ -1314,7 +1381,7 @@ fn get_list_array_buffers(data: &ArrayData) -> (Buffer, Arra
         );
     }
 
-    let (offsets, original_start_offset, len) = data.reencode_offsets::();
+    let (offsets, original_start_offset, len) = reencode_offsets::(data);
     let child_data = data.child_data()[0].slice(original_start_offset, len);
     (offsets, child_data)
 }
@@ -1375,11 +1442,16 @@ fn encode_array_datas(
         iter.map(|arr| {
             let arr = arr.borrow();
             arr.get_slice_memory_size_with_alignment(Some(write_options.alignment))
-                .map(|size| if has_validity_bitmap(arr.data_type(), write_options) && arr.nulls().is_none() {
-                    let null_len = bit_util::ceil(arr.len(), 8);
-                    size + null_len + pad_to_alignment(write_options.alignment, null_len)
-                } else {
-                    size
+                .map(|size| {
+                    let didnt_count_nulls = arr.nulls().is_none();
+                    let will_write_nulls = has_validity_bitmap(arr.data_type(), write_options);
+
+                    if will_write_nulls && didnt_count_nulls {
+                        let null_len = bit_util::ceil(arr.len(), 8);
+                        size + null_len + pad_to_alignment(write_options.alignment, null_len)
+                    } else {
+                        size
+                    }
                 })
         })
         .sum()
@@ -1390,67 +1462,79 @@
     let total_size = get_arr_batch_size(arr_datas.iter(), write_options)?;
 
-    /*if total_size == 0 {
-        // If there's nothing of size to encode, then existing tests say we shouldn't even return
-        // an empty EncodedData - we just return no EncodedData
-        return Ok(vec![]);
-    }*/
-
     let n_batches = bit_util::ceil(total_size, max_msg_size);
     let mut out = Vec::with_capacity(n_batches);
 
-    println!("🐈 n_rows: {n_rows}, n_batches: {n_batches}, header_len: {header_len}, total_size: {total_size}, max_msg_size: {max_msg_size}");
-
    let mut offset = 0;
    while offset < n_rows.max(1) {
-        let length = (1..)
+        let rows_left = n_rows - offset;
+        let length = (1..=rows_left)
            .find(|len| {
-                println!("🛑 break");
                 // If we've exhausted the available length of the array datas, then just return -
                 // we've got it.
-                if arr_datas.iter().any(|arr| arr.len() < offset + len) {
+                if offset + len > n_rows {
                     return true;
                 }
 
-                let arr_iter = arr_datas.iter().map(|arr| arr.slice(offset, *len));
+                let arr_iter = arr_datas
+                    .iter()
+                    // Since we're only returning up above if *all* of the arrays have exhausted
+                    // their available length, it's theoretically possible (though I don't think
+                    // expected or allowed by the invariants that this crate tries to hold) that
+                    // some arrays have different lengths from the others, so we need to call
+                    // `.min(*len)` to ensure we don't get a failed assertion when trying to slice
+                    .map(|arr| arr.slice(offset, arr.len().min(*len)));
 
                 // we can unwrap this here b/c this only errors on malformed buffer-type/data-type
                 // combinations, and if any of these arrays had that, this function would've
                 // already short-circuited on an earlier call of this function
                 get_arr_batch_size(arr_iter, write_options).unwrap() > max_msg_size
             })
-            // Because this is an unbounded range, this will only panic if we reach usize::MAX and
-            // still haven't found anything. However, the inner part of this loop slices
-            // `ArrayData` with the number that we're iterating over, and that will panic if we
-            // reach usize::MAX because that will be bigger than it can handle to be sliced.
-            .unwrap() - 1;
+            // If no rows fit in the given max size, we want to try to get the data across anyways,
+            // so that just means doing a single row. Calling `max(2)` is how we ensure that - if
+            // the very first item would go over the max size, giving us a length of 0, we want to
+            // set this to `2` so that taking away 1 leaves us with one row to encode.
+            .map(|len| len.max(2) - 1)
+            // If all rows can comfortably fit in this given size, then just get them all
+            .unwrap_or(rows_left);
 
         let new_arrs = arr_datas
             .iter()
-            .map(|arr| arr.slice(offset, length))
+            // We could get into a situation where we were given all 0-row arrays to be sent over
+            // flight - we do need to send a flight message to show that there is no data, but we
+            // also can't have `length` be 0 at this point because it could also be that all rows
+            // are too large to send with the provided limits and so we just want to try to send
+            // one row anyways, so this is just how we cover our bases there.
+            .map(|arr| arr.slice(offset, arr.len().min(length)))
             .collect::>();
 
-        fbb.reset_for_real_run();
-        fbb.encode_array_datas(
-            &new_arrs,
-            length as i64,
-            &encode_root,
-            header_type,
-            write_options,
-        )?;
+        // If we've got at least one row to encode or if we have 0 rows to encode but we haven't
+        // encoded anything yet, then continue with encoding. We don't need to do encoding, though,
+        // if we've already encoded some rows and there are no rows left
+        if length != 0 || offset == 0 {
+            fbb.reset_for_real_run();
+            fbb.encode_array_datas(
+                &new_arrs,
+                length as i64,
+                &encode_root,
+                header_type,
+                write_options,
+            )?;
 
-        let finished_data = fbb.fbb.finished_data();
-        println!(
-            "🐈 arrow_data: {}, finished_data: {} (n_rows: {length})",
-            fbb.arrow_data.len(),
-            finished_data.len()
-        );
+            let finished_data = fbb.fbb.finished_data();
+
+            out.push(EncodedData {
+                ipc_message: finished_data.to_vec(),
+                arrow_data: fbb.arrow_data.clone(),
+            });
+        }
+
+        // If length == 0, that means they gave us ArrayData with no rows, so a single iteration is
+        // always sufficient.
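+        // To illustrate the row-count search above with hypothetical numbers (this is
+        // just a sketch, not code from this crate): if three rows encode to 10 bytes
+        // each and `max_msg_size` is 25, the search settles on two rows per message:
+        //
+        //     let sizes = [10usize, 10, 10];
+        //     let length = (1..=sizes.len())
+        //         .find(|len| sizes[..*len].iter().sum::<usize>() > 25)
+        //         .map(|len| len.max(2) - 1)
+        //         .unwrap_or(sizes.len());
+        //     assert_eq!(length, 2); // 20 bytes fit under the 25-byte cap, 30 don't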
+ if length == 0 { + break; + } - out.push(EncodedData { - ipc_message: finished_data.to_vec(), - arrow_data: fbb.arrow_data.clone(), - }); offset += length; } @@ -1517,10 +1601,6 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { let mut variadic_buffer_counts = Vec::::default(); let mut offset = 0; - // TODO: split these up here to only fit as many as can be fit - // - get `fbb.fbb.unfinished_data` after writing a fake piece of data for `buffers`. that - // determines the real overhead - // - split up the array datas based on how much space they take up alone for array in arr_datas { self.write_array_data( array, @@ -1567,7 +1647,6 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { builder.add_header_type(header_type); builder.add_bodyLength(arrow_len); - // `root` was already tracked when it was turned into a union builder.add_header(root); builder.finish() }; @@ -1589,22 +1668,22 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { .map(TryInto::try_into) .transpose()?; - if !matches!(array_data.data_type(), DataType::Null) { - self.nodes - .push(crate::FieldNode::new(num_rows as i64, null_count as i64)); - } else { - // NullArray's null_count equals to len, but the `null_count` passed in is from ArrayData - // where null_count is always 0. - self.nodes - .push(crate::FieldNode::new(num_rows as i64, num_rows as i64)); - } + // NullArray's null_count equals to len, but the `null_count` passed in is from ArrayData + // where null_count is always 0. + self.nodes.push(crate::FieldNode::new( + num_rows as i64, + match array_data.data_type() { + DataType::Null => num_rows, + _ => null_count, + } as i64, + )); if has_validity_bitmap(array_data.data_type(), write_options) { // write null buffer if exists let null_buffer = match array_data.nulls() { None => { - // create a buffer and fill it with valid bits let num_bytes = bit_util::ceil(num_rows, 8); + // create a buffer and fill it with valid bits MutableBuffer::new(num_bytes) .with_bitset(num_bytes, true) .into() @@ -1612,9 +1691,8 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { Some(buffer) => buffer.inner().sliced(), }; - println!("🐈 writing null buffer"); self.write_buffer( - null_buffer.as_slice(), + &null_buffer, offset, compression_codec, write_options.alignment, @@ -1622,21 +1700,18 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { } let mut write_byte_array_byffers = |(offsets, values): (Buffer, Buffer)| { - for (idx, buffer) in [offsets, values].into_iter().enumerate() { - println!("🐈 writing buf {idx} of tuple"); + for buffer in [offsets, values] { self.write_buffer(&buffer, offset, compression_codec, write_options.alignment)?; } Ok::<_, ArrowError>(()) }; - println!("🐈 writing array dt: {:?}", array_data.data_type()); - match array_data.data_type() { DataType::Binary | DataType::Utf8 => { - write_byte_array_byffers(array_data.get_byte_array_buffers::())? + write_byte_array_byffers(get_byte_array_buffers::(array_data))? } DataType::LargeBinary | DataType::LargeUtf8 => { - write_byte_array_byffers(array_data.get_byte_array_buffers::())? + write_byte_array_byffers(get_byte_array_buffers::(array_data))? 
} dt if DataType::is_numeric(dt) || DataType::is_temporal(dt) @@ -1693,12 +1768,7 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { DataType::LargeList(_) => get_list_array_buffers::(array_data), _ => unreachable!(), }; - self.write_buffer( - offsets.as_slice(), - offset, - compression_codec, - write_options.alignment, - )?; + self.write_buffer(&offsets, offset, compression_codec, write_options.alignment)?; self.write_array_data( &sliced_child_data, offset, @@ -1724,8 +1794,7 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { } let mut write_arr = |arr: &ArrayData| { - for (idx, data_ref) in arr.child_data().iter().enumerate() { - println!("🐈 writing child array {idx}"); + for data_ref in arr.child_data() { self.write_array_data( data_ref, offset, @@ -1766,8 +1835,6 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { compression_codec: Option, alignment: u8, ) -> Result<(), ArrowError> { - let start_len = self.arrow_data.len(); - let len: i64 = if self.dry_run { // Flatbuffers will essentially optimize this away if we say the len is 0 for all of // these, so to make sure the header size is the same in the dry run and in the real @@ -1795,8 +1862,6 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { let pad_len = pad_to_alignment(alignment, len as usize); self.arrow_data.extend_from_slice(&PADDING[..pad_len]); - println!("🐈 writing buffer of size {} - arrow_data went {} => {}", buffer.len(), start_len, self.arrow_data.len()); - *offset += len + (pad_len as i64); Ok(()) } From aa959b71e37201d7d29057b976bf5322ec0cdbef Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Mon, 4 Nov 2024 15:38:50 -0700 Subject: [PATCH 05/20] Update previously-commented tests and remove unused parameters --- arrow-flight/src/encode.rs | 133 +++++++++++-------------------------- parquet-testing | 2 +- 2 files changed, 41 insertions(+), 94 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 7b14c9d18d20..c9d7f9401ee8 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -664,6 +664,7 @@ fn hydrate_dictionary(array: &ArrayRef, data_type: &DataType) -> Result) -> Vec { + FlightDataDecoder::new( + futures::stream::iter( + std::iter::once(SchemaAsIpc::new(&schema, &IpcWriteOptions::default()).into()) + .chain(encoded.into_iter().map(FlightData::from)) + .map(Ok) + ) + ).collect::>>() + .await + .into_iter() + .map(|r| r.unwrap()) + .filter_map(|data| match data.payload { + DecodedPayload::RecordBatch(rb) => Some(rb), + _ => None + }) + .collect() + } + let max_flight_data_size = 1024; let write_opts = IpcWriteOptions::default(); let mut dict_tracker = DictionaryTracker::new(false); @@ -1497,7 +1516,7 @@ mod tests { .unwrap() .1; assert_eq!(split.len(), 1); - // assert_eq!(batch, split[0]); + assert_eq!(batch, get_decoded(batch.schema(), split).await[0]); // split once let n_rows = max_flight_data_size + 1; @@ -1510,63 +1529,16 @@ mod tests { .unwrap() .1; assert_eq!(split.len(), 2); - /*assert_eq!( - split.iter().map(|batch| batch.num_rows()).sum::(), + let batches = get_decoded(batch.schema(), split).await; + assert_eq!( + batches.iter().map(RecordBatch::num_rows).sum::(), n_rows ); - let a = pretty_format_batches(&split).unwrap().to_string(); + let a = pretty_format_batches(&batches).unwrap().to_string(); let b = pretty_format_batches(&[batch]).unwrap().to_string(); - assert_eq!(a, b);*/ - } - - // TESTS TO DO: - // - Make sure batches with Utf8View/BinaryView are properly accounted for and don't overflow - // too much - // - Modify `test_split_batch_for_grpc_response` to read the 
`EncodedData` back into - // `RecordBatches` and compare those - // - Ensure No record batches overflow their alloted size. - - /*#[test] - fn test_split_batch_for_grpc_response_sizes() { - // 2000 8 byte entries into 2k pieces: 8 chunks of 250 rows - verify_split(2000, 2 * 1024, vec![250, 250, 250, 250, 250, 250, 250, 250]); - - // 2000 8 byte entries into 4k pieces: 4 chunks of 500 rows - verify_split(2000, 4 * 1024, vec![500, 500, 500, 500]); - - // 2023 8 byte entries into 3k pieces does not divide evenly - verify_split(2023, 3 * 1024, vec![337, 337, 337, 337, 337, 337, 1]); - - // 10 8 byte entries into 1 byte pieces means each rows gets its own - verify_split(10, 1, vec![1, 1, 1, 1, 1, 1, 1, 1, 1, 1]); - - // 10 8 byte entries into 1k byte pieces means one piece - verify_split(10, 1024, vec![10]); + assert_eq!(a, b); } - /// Creates a UInt64Array of 8 byte integers with input_rows rows - /// `max_flight_data_size_bytes` pieces and verifies the row counts in - /// those pieces - fn verify_split( - num_input_rows: u64, - max_flight_data_size_bytes: usize, - expected_sizes: Vec, - ) { - let array: UInt64Array = (0..num_input_rows).collect(); - - let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(array) as ArrayRef)]) - .expect("cannot create record batch"); - - let input_rows = batch.num_rows(); - - let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size_bytes); - let sizes: Vec<_> = split.iter().map(RecordBatch::num_rows).collect(); - let output_rows: usize = sizes.iter().sum(); - - assert_eq!(sizes, expected_sizes, "mismatch for {batch:?}"); - assert_eq!(input_rows, output_rows, "mismatch for {batch:?}"); - }*/ - // test sending record batches // test sending record batches with multiple different dictionaries @@ -1585,7 +1557,7 @@ mod tests { ]) .unwrap(); - verify_encoded_split(batch, 0).await; + verify_encoded_split_no_overhead(batch).await; } #[tokio::test] @@ -1594,9 +1566,7 @@ mod tests { let array = StringArray::from_iter_values((0..1024).map(|i| "*".repeat(i))); let batch = RecordBatch::try_from_iter([("data", Arc::new(array) as _)]).unwrap(); - // overage is much higher than ideal - // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 0).await; + verify_encoded_split_no_overhead(batch).await; } #[tokio::test] @@ -1629,10 +1599,7 @@ mod tests { ]) .unwrap(); - // 5k over limit (which is 2x larger than limit of 5k) - // overage is much higher than ideal - // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 0).await; + verify_encoded_split_no_overhead(batch).await; } #[tokio::test] @@ -1648,7 +1615,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split(batch, 0).await; + verify_encoded_split_no_overhead(batch).await; } #[tokio::test] @@ -1660,9 +1627,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - // overage is much higher than ideal - // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 0).await; + verify_encoded_split_no_overhead(batch).await; } #[tokio::test] @@ -1674,9 +1639,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - // overage is much higher than ideal - // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 0).await; + verify_encoded_split_no_overhead(batch).await; } #[tokio::test] @@ -1699,9 +1662,7 @@ mod tests { ]) .unwrap(); - // overage is much 
higher than ideal - // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 0).await; + verify_encoded_split_no_overhead(batch).await; } /// Return size, in memory of flight data @@ -1733,12 +1694,9 @@ mod tests { /// Note this overhead will likely always be greater than zero to /// account for encoding overhead such as IPC headers and padding. /// - async fn verify_encoded_split(batch: RecordBatch, allowed_overage: usize) { + async fn verify_encoded_split_no_overhead(batch: RecordBatch) { let num_rows = batch.num_rows(); - // Track the overall required maximum overage - let mut max_overage_seen = 0; - for max_flight_data_size in [1024, 2021, 5000] { println!("Encoding {num_rows} with a maximum size of {max_flight_data_size}"); @@ -1764,26 +1722,15 @@ mod tests { // of results over multiple messages, so we allow going over the limit if it's the // bare minimum over (1 row) if !is_1_row { - assert!( - actual_overage <= allowed_overage, - "encoded data[{i}]: actual size {actual_data_size}, \ - actual_overage: {actual_overage} \ - allowed_overage: {allowed_overage}" + assert_eq!( + actual_overage, + 0, + "encoded data[{i}]: actual size {actual_data_size}, actual_overage: {actual_overage}" ); - - max_overage_seen = max_overage_seen.max(actual_overage); } i += 1; } } - - // ensure that the specified overage is exactly the maxmium so - // that when the splitting logic improves, the tests must be - // updated to reflect the better logic - assert_eq!( - allowed_overage, max_overage_seen, - "Specified overage was too high" - ); } } diff --git a/parquet-testing b/parquet-testing index 550368ca77b9..50af3d8ce206 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 550368ca77b97231efead39251a96bd6f8f08c6e +Subproject commit 50af3d8ce206990d81014b1862e5ce7380dc3e08 From 86772b38c2630c0a6129c856cdd395a04aa0ab17 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Mon, 4 Nov 2024 16:23:34 -0700 Subject: [PATCH 06/20] Improve documentation with new, and more correct, comments --- arrow-data/src/data.rs | 38 ++++++++++++++++---------------------- arrow-ipc/src/writer.rs | 15 +++++++++++---- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 2537378a9336..e6f49660d6b2 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -423,14 +423,26 @@ impl ArrayData { size } + /// Returns the total number of the bytes of memory occupied by the buffers by this slice of + /// [`ArrayData`] (See also diagram on [`ArrayData`]). + /// + /// This is approximately the number of bytes if a new [`ArrayData`] was formed by creating new + /// [`Buffer`]s with exactly the data needed. + /// + /// For example, a [`DataType::Int64`] with `100` elements, [`Self::get_slice_memory_size`] + /// would return `100 * 8 = 800`. If the [`ArrayData`] was then [`Self::slice`]d to refer to + /// its first `20` elements, then [`Self::get_slice_memory_size`] on the sliced [`ArrayData`] + /// would return `20 * 8 = 160`. 
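+    ///
+    /// That example as a quick sketch (illustrative only, not a doc-test from this
+    /// crate; it uses the `None`-alignment wrapper described below):
+    ///
+    /// ```ignore
+    /// let array: Int64Array = (0..100).collect();
+    /// let sliced = array.to_data().slice(0, 20);
+    /// assert_eq!(sliced.get_slice_memory_size().unwrap(), 20 * 8);
+    /// ```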
+ /// + /// The `alignment` parameter is used to add padding to each buffer being counted, to ensure + /// the size for each one is aligned to `alignment` bytes (if it is `Some`) pub fn get_slice_memory_size_with_alignment( &self, alignment: Option, ) -> Result { let layout = layout(&self.data_type); - // TODO: I pulled this from arrow-ipc, we should extract it to something where they can - // share code + // Just pulled from arrow-ipc #[inline] fn pad_to_alignment(alignment: u8, len: usize) -> usize { let a = usize::from(alignment - 1); @@ -468,14 +480,6 @@ impl ArrayData { self.typed_offsets::() .map(|off| (off[self.len] - off[0]) as usize) } - // `FlatBufferSizeTracker::write_array_data` just writes these just as raw - // buffer data as opposed to using offsets or anything - DataType::BinaryView | DataType::Utf8View => Ok( - self.buffers() - .iter() - .map(|buf| buf.len()) - .sum::() - ), dt => Err(ArrowError::NotYetImplemented(format!( "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {dt}", ))), @@ -500,18 +504,8 @@ impl ArrayData { Ok(result) } - /// Returns the total number of the bytes of memory occupied by - /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]). - /// - /// This is approximately the number of bytes if a new - /// [`ArrayData`] was formed by creating new [`Buffer`]s with - /// exactly the data needed. - /// - /// For example, a [`DataType::Int64`] with `100` elements, - /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If - /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its - /// first `20` elements, then [`Self::get_slice_memory_size`] on the - /// sliced [`ArrayData`] would return `20 * 8 = 160`. + /// Equivalent to calling [`Self::get_slice_memory_size_with_alignment()`] with `None` for the + /// alignment pub fn get_slice_memory_size(&self) -> Result { self.get_slice_memory_size_with_alignment(None) } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index af188baf9014..0f188eba4c85 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -454,9 +454,6 @@ impl IpcDataGenerator { Ok(()) } - /// Encodes a batch to a number of [EncodedData] items (dictionary batches + the record batch). - /// The [DictionaryTracker] keeps track of dictionaries with new `dict_id`s (so they are only sent once) - /// Make sure the [DictionaryTracker] is initialized at the start of the stream. pub fn encoded_batch( &self, batch: &RecordBatch, @@ -469,12 +466,22 @@ impl IpcDataGenerator { assert_eq!( encoded_messages.len(), 1, - "encoded_batch with max size of usize::MAX should not be able to return more than 1 batch" + "encoded_batch with max size of usize::MAX should not be able to return more or less than 1 batch" ); Ok((encoded_dictionaries, encoded_messages.pop().unwrap())) } + /// Encodes a batch to a number of [EncodedData] items (dictionary batches + the record batch). + /// The [DictionaryTracker] keeps track of dictionaries with new `dict_id`s (so they are only sent once) + /// Make sure the [DictionaryTracker] is initialized at the start of the stream. + /// The `max_flight_data_size` is used to control how much space each encoded [`RecordBatch`] is + /// allowed to take up. + /// + /// Each [`EncodedData`] in the second element of the returned tuple will be smaller than + /// `max_flight_data_size` bytes, if possible at all. 
However, this API has no support for + /// splitting rows into multiple [`EncodedData`]s, so if a row is larger, by itself, than + /// `max_flight_data_size`, it will be encoded to a message which is larger than the limit. pub fn encoded_batch_with_size( &self, batch: &RecordBatch, From 9673c7010f814584d5dd99353b271a347dcd9f02 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Mon, 4 Nov 2024 16:27:41 -0700 Subject: [PATCH 07/20] Add more documentation and format --- arrow-flight/src/encode.rs | 32 +++++++++++++++----------------- arrow-ipc/src/writer.rs | 4 ++++ 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index c9d7f9401ee8..3f94a882c0d3 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -664,9 +664,8 @@ fn hydrate_dictionary(array: &ArrayRef, data_type: &DataType) -> Result) -> Vec { - FlightDataDecoder::new( - futures::stream::iter( - std::iter::once(SchemaAsIpc::new(&schema, &IpcWriteOptions::default()).into()) - .chain(encoded.into_iter().map(FlightData::from)) - .map(Ok) - ) - ).collect::>>() - .await - .into_iter() - .map(|r| r.unwrap()) - .filter_map(|data| match data.payload { - DecodedPayload::RecordBatch(rb) => Some(rb), - _ => None - }) - .collect() + FlightDataDecoder::new(futures::stream::iter( + std::iter::once(SchemaAsIpc::new(&schema, &IpcWriteOptions::default()).into()) + .chain(encoded.into_iter().map(FlightData::from)) + .map(Ok), + )) + .collect::>>() + .await + .into_iter() + .map(|r| r.unwrap()) + .filter_map(|data| match data.payload { + DecodedPayload::RecordBatch(rb) => Some(rb), + _ => None, + }) + .collect() } let max_flight_data_size = 1024; diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 0f188eba4c85..c0db27c6acc3 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -454,6 +454,10 @@ impl IpcDataGenerator { Ok(()) } + /// Calls [`Self::encoded_batch_with_size`] with no limit, returning the first (and only) + /// [`EncodedData`] that is produced. 
This method should be used over + /// [`Self::encoded_batch_with_size`] if the consumer has no concerns about encoded message + /// size limits pub fn encoded_batch( &self, batch: &RecordBatch, From cf5c129b825f70fd2163f0c454d227f5fbcee6ef Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Mon, 4 Nov 2024 18:28:49 -0700 Subject: [PATCH 08/20] Remove old over-cautious 'min' check --- arrow-data/src/data.rs | 2 +- arrow-ipc/src/writer.rs | 36 +++++++++++++++++------------------- parquet-testing | 2 +- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index e6f49660d6b2..3a3f20c00b72 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -559,7 +559,7 @@ impl ArrayData { child_data: self .child_data() .iter() - .map(|data| data.slice(offset, length.min(data.len()))) + .map(|data| data.slice(offset, length)) .collect(), nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)), } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index c0db27c6acc3..af2e3c5539c4 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1478,6 +1478,16 @@ fn encode_array_datas( let mut offset = 0; while offset < n_rows.max(1) { + let slice_arrays = |len: usize| { + arr_datas.iter().map(move |arr| { + if len >= arr.len() { + arr.clone() + } else { + arr.slice(offset, len) + } + }) + }; + let rows_left = n_rows - offset; let length = (1..=rows_left) .find(|len| { @@ -1487,19 +1497,10 @@ fn encode_array_datas( return true; } - let arr_iter = arr_datas - .iter() - // Since we're only returning up above if *all* of the arrays have exhausted - // their available length, it's theoretically possible (though I don't think - // expected or allowed by the invariants that this crate tries to hold) that - // some arrays have different lengths from the others, so we need to call - // `.min(*len)` to ensure we don't get a failed assertion when trying to slice - .map(|arr| arr.slice(offset, arr.len().min(*len))); - // we can unwrap this here b/c this only errors on malformed buffer-type/data-type // combinations, and if any of these arrays had that, this function would've // already short-circuited on an earlier call of this function - get_arr_batch_size(arr_iter, write_options).unwrap() > max_msg_size + get_arr_batch_size(slice_arrays(*len), write_options).unwrap() > max_msg_size }) // If no rows fit in the given max size, we want to try to get the data across anyways, // so that just means doing a single row. Calling `max(2)` is how we ensure that - if @@ -1509,15 +1510,12 @@ fn encode_array_datas( // If all rows can comfortably fit in this given size, then just get them all .unwrap_or(rows_left); - let new_arrs = arr_datas - .iter() - // We could get into a situtation where we were given all 0-row arrays to be sent over - // flight - we do need to send a flight message to show that there is no data, but we - // also can't have `length` be 0 at this point because it could also be that all rows - // are too large to send with the provided limits and so we just want to try to send - // one row anyways, so this is just how we cover our bases there. 
- .map(|arr| arr.slice(offset, arr.len().min(length))) - .collect::>(); + // We could get into a situtation where we were given all 0-row arrays to be sent over + // flight - we do need to send a flight message to show that there is no data, but we also + // can't have `length` be 0 at this point because it could also be that all rows are too + // large to send with the provided limits and so we just want to try to send one now + // anyways, so the checks in this fn are just how we cover our bases there. + let new_arrs = slice_arrays(length).collect::>(); // If we've got more than one row to encode or if we have 0 rows to encode but we haven't // encoded anything yet, then continue with encoding. We don't need to do encoding, though, diff --git a/parquet-testing b/parquet-testing index 50af3d8ce206..550368ca77b9 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 50af3d8ce206990d81014b1862e5ce7380dc3e08 +Subproject commit 550368ca77b97231efead39251a96bd6f8f08c6e From 114b31f66c84c6973c375747de21598df6069559 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Mon, 4 Nov 2024 18:46:44 -0700 Subject: [PATCH 09/20] Small comment additions, fn name cleanups, and panic message clarifications --- arrow-data/src/data.rs | 1 - arrow-flight/src/encode.rs | 16 ++++++++-------- arrow-flight/src/utils.rs | 2 +- arrow-ipc/src/writer.rs | 19 ++++++++++++++++--- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 3a3f20c00b72..5f6778dc4946 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -899,7 +899,6 @@ impl ArrayData { let required_len = (len + self.offset) * mem::size_of::(); if buffer.len() < required_len { - // panic!("just get it over with"); return Err(ArrowError::InvalidArgumentError(format!( "Buffer {} of {} isn't large enough. 
Expected {} bytes got {}", idx, diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 3f94a882c0d3..731a95edf1f7 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -1555,7 +1555,7 @@ mod tests { ]) .unwrap(); - verify_encoded_split_no_overhead(batch).await; + verify_encoded_split_no_overage(batch).await; } #[tokio::test] @@ -1564,7 +1564,7 @@ mod tests { let array = StringArray::from_iter_values((0..1024).map(|i| "*".repeat(i))); let batch = RecordBatch::try_from_iter([("data", Arc::new(array) as _)]).unwrap(); - verify_encoded_split_no_overhead(batch).await; + verify_encoded_split_no_overage(batch).await; } #[tokio::test] @@ -1597,7 +1597,7 @@ mod tests { ]) .unwrap(); - verify_encoded_split_no_overhead(batch).await; + verify_encoded_split_no_overage(batch).await; } #[tokio::test] @@ -1613,7 +1613,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split_no_overhead(batch).await; + verify_encoded_split_no_overage(batch).await; } #[tokio::test] @@ -1625,7 +1625,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split_no_overhead(batch).await; + verify_encoded_split_no_overage(batch).await; } #[tokio::test] @@ -1637,7 +1637,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split_no_overhead(batch).await; + verify_encoded_split_no_overage(batch).await; } #[tokio::test] @@ -1660,7 +1660,7 @@ mod tests { ]) .unwrap(); - verify_encoded_split_no_overhead(batch).await; + verify_encoded_split_no_overage(batch).await; } /// Return size, in memory of flight data @@ -1692,7 +1692,7 @@ mod tests { /// Note this overhead will likely always be greater than zero to /// account for encoding overhead such as IPC headers and padding. /// - async fn verify_encoded_split_no_overhead(batch: RecordBatch) { + async fn verify_encoded_split_no_overage(batch: RecordBatch) { let num_rows = batch.num_rows(); for max_flight_data_size in [1024, 2021, 5000] { diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index e2ffeecaae93..9642e75fd0e1 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -43,7 +43,7 @@ pub fn flight_data_from_arrow_batch( assert_eq!( flight_batches.len(), 1, - "encoded_batch with a max size of usize::MAX should never return more than 1 batch" + "encoded_batch with a max size of usize::MAX should not be able to return more or less than 1 batch" ); let flight_batch = flight_batches.pop().unwrap(); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index af2e3c5539c4..c302ff67212e 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -422,7 +422,7 @@ impl IpcDataGenerator { dict_id_seq, )?; - // It's importnat to only take the dict_id at this point, because the dict ID + // It's important to only take the dict_id at this point, because the dict ID // sequence is assigned depth-first, so we need to first encode children and have // them take their assigned dict IDs before we take the dict ID for this field. let dict_id = dict_id_seq @@ -524,7 +524,6 @@ impl IpcDataGenerator { write_options: &IpcWriteOptions, ) -> Result { let mut encoded_datas = encode_array_datas( - // TODO: We can abstract this clone away, right? 
&[array_data.clone()], array_data.len(), |fbb, offset| { @@ -542,7 +541,7 @@ impl IpcDataGenerator { assert_eq!( encoded_datas.len(), 1, - "encode_array_datas with usize::MAX as the max size should never return more than a single item" + "encode_array_datas with a max size of usize::MAX should not be able to return more or less than 1 batch" ); Ok(encoded_datas.pop().unwrap()) @@ -1550,6 +1549,16 @@ fn encode_array_datas( Ok(out) } +/// A struct to help ensure that the size of encoded flight messages never goes over a provided +/// limit (except in ridiculous cases like a limit of 1 byte). The way it does so is by first +/// running through a provided slice of [`ArrayData`], producing an IPC header message for that +/// slice, and then subtracting the size of that generated header from the message limit it has +/// been given. Because IPC header message sizes don't change due to a different amount of rows, +/// this header size will stay consistent throughout the entire time that we have to transmit a +/// chunk of rows, so we can just subtract it from the overall limit and use that to check +/// different slices of `ArrayData` against to know how many to transmit each time. +/// +/// This whole process is done in [`encode_array_datas()`] above #[derive(Default)] struct FlatBufferSizeTracker<'fbb> { // the builder and backing flatbuffer that we use to write the arrow data into. @@ -1564,6 +1573,8 @@ struct FlatBufferSizeTracker<'fbb> { } impl<'fbb> FlatBufferSizeTracker<'fbb> { + /// Preferred initializer, as this should always be used with a dry-run before a real run to + /// figure out the size of the IPC header. #[must_use] fn for_dry_run(capacity: usize) -> Self { Self { @@ -1574,6 +1585,8 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { } } + /// Should be called in-between calls to `encode_array_datas` to ensure we don't accidentally + /// keep & encode old data each time. fn reset_for_real_run(&mut self) { self.fbb.reset(); self.buffers.clear(); From 1acc9027f5fc6f6723d9a4193fdc5569fcc1dd94 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 5 Nov 2024 10:29:05 -0700 Subject: [PATCH 10/20] Add more notes for self --- arrow-flight/src/encode.rs | 4 ++++ arrow-ipc/src/writer.rs | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 731a95edf1f7..940d561074e7 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -681,6 +681,10 @@ mod tests { use super::*; + // TESTS TO ADD: + // - Ensure ArrayData::get_slice_memory_size_with_alignment returns the same data size as IPC + // encoding it + #[test] // flight_data_from_arrow_batch is deprecated but does exactly what we need. Would probably be // good to just move it to this test mod when it's due to be removed. diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index c302ff67212e..2d7681f7921d 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1488,6 +1488,10 @@ fn encode_array_datas( }; let rows_left = n_rows - offset; + // TODO? maybe this could be more efficient by continually approximating the maximum number + // of rows based on (size / n_rows) of the current ArrayData slice until we've found the + // maximum that can fit? e.g. 
'oh, it's 200 bytes and 10 rows, so each row is probably 20 + // bytes - let's do (max_size / 20) rows and see if that fits' let length = (1..=rows_left) .find(|len| { // If we've exhausted the available length of the array datas, then just return - From ffad55685d627a59f9ce6747083364c6c661949a Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 5 Nov 2024 14:17:27 -0700 Subject: [PATCH 11/20] Fix for 1.77 msrv --- arrow-flight/src/encode.rs | 2 +- .../src/flight_client_scenarios/integration_test.rs | 2 +- .../src/flight_server_scenarios/integration_test.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 940d561074e7..f4a39f866509 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -688,7 +688,7 @@ mod tests { #[test] // flight_data_from_arrow_batch is deprecated but does exactly what we need. Would probably be // good to just move it to this test mod when it's due to be removed. - #[expect(deprecated)] + #[allow(deprecated)] /// ensure only the batch's used data (not the allocated data) is sent /// fn test_encode_flight_data() { diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index 3b0850ed883d..7882e6539c46 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -125,7 +125,7 @@ async fn send_batch( batch: &RecordBatch, options: &writer::IpcWriteOptions, ) -> Result { - #[expect(deprecated)] + #[allow(deprecated)] let (dictionary_flight_data, mut batch_flight_data) = arrow_flight::utils::flight_data_from_arrow_batch(batch, options); diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index cbb8de6f0f2e..524339ecc389 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -127,7 +127,7 @@ impl FlightService for FlightServiceImpl { .iter() .enumerate() .flat_map(|(counter, batch)| { - #[expect(deprecated)] + #[allow(deprecated)] let (dictionary_flight_data, mut batch_flight_data) = arrow_flight::utils::flight_data_from_arrow_batch(batch, &options); From e090daed970ad302dda4d93a618315dfe700cced Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 5 Nov 2024 14:37:46 -0700 Subject: [PATCH 12/20] Add test (that currently fails) to make sure everything is as expected --- arrow-ipc/src/writer.rs | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 2d7681f7921d..1a813734a02e 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -2935,4 +2935,47 @@ mod tests { assert_eq!(stream_bytes_written_on_flush, expected_stream_flushed_bytes); assert_eq!(file_bytes_written_on_flush, expected_file_flushed_bytes); } + + #[test] + fn encoded_arr_data_same_size_as_compute_api() { + fn encode_test(arr: T) { + println!("Checking arr {arr:?}"); + + let arr_data = arr.to_data(); + + let write_options = IpcWriteOptions { + batch_compression_type: None, + ..IpcWriteOptions::default() + }; + + let compute_size = arr_data + .get_slice_memory_size_with_alignment(Some(write_options.alignment)) + .unwrap(); + let num_rows = arr_data.len(); + + let encoded = 
encode_array_datas( + &[arr_data], + num_rows, + |_, root| root.as_union_value(), + MessageHeader::RecordBatch, + usize::MAX, + &write_options, + ) + .unwrap() + .pop() + .unwrap(); + + assert_eq!(compute_size, encoded.arrow_data.len()); + } + + let str_arr = [Some("foo"), Some("bar"), Some("baz")] + .into_iter() + .collect::(); + let int_arr = [None, Some(200), None, Some(2), Some(-4)] + .into_iter() + .collect::(); + encode_test(str_arr.clone()); + encode_test(int_arr.clone()); + encode_test(DictionaryArray::new(int_arr, Arc::new(str_arr))); + } } From 910a69e2f54c96e34b5b0f44dd73312a2c983b10 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 5 Nov 2024 17:11:37 -0700 Subject: [PATCH 13/20] Make test pass and add comment about why one previous case doesn't pass --- .../src/array/fixed_size_list_array.rs | 3 +- arrow-data/src/data.rs | 4 + arrow-ipc/src/writer.rs | 77 +++++++++++-------- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 37446e234522..a972eb32fe80 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -159,8 +159,7 @@ impl FixedSizeListArray { if let Some(n) = nulls.as_ref() { if n.len() != len { return Err(ArrowError::InvalidArgumentError(format!( - "Incorrect length of null buffer for FixedSizeListArray, expected {} got {}", - len, + "Incorrect length of null buffer for FixedSizeListArray, expected {len} got {}", n.len(), ))); } diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 5f6778dc4946..32547c8750fa 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -440,6 +440,10 @@ impl ArrayData { &self, alignment: Option, ) -> Result { + // Note: This accounts for data used by the Dictionary DataType that isn't actually encoded + // as a part of `write_array_data` in arrow-ipc - specifically, the `values` part of + // each Dictionary are encoded in the `child_data` of the `ArrayData` it produces, but (for + // some reason that I don't fully understand) it doesn't encode those values. hmm. 
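+        //       For reference (an illustrative sketch of the layout, not code from this
+        //       crate): for a dictionary array, `self.buffers()[0]` holds only the keys,
+        //       while the values array lives in `self.child_data()[0]`; the IPC writer
+        //       ships those values separately as a dictionary batch, so they shouldn't
+        //       be counted against the record batch's own size here.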
let layout = layout(&self.data_type); // Just pulled from arrow-ipc diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 1a813734a02e..d89315c8ada4 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1425,6 +1425,29 @@ fn chunked_encoded_batch_bytes( ) } +fn get_encoded_arr_batch_size>( + iter: impl IntoIterator, + write_options: &IpcWriteOptions, +) -> Result { + iter.into_iter() + .map(|arr| { + let arr = arr.borrow(); + arr.get_slice_memory_size_with_alignment(Some(write_options.alignment)) + .map(|size| { + let didnt_count_nulls = arr.nulls().is_none(); + let will_write_nulls = has_validity_bitmap(arr.data_type(), write_options); + + if will_write_nulls && didnt_count_nulls { + let null_len = bit_util::ceil(arr.len(), 8); + size + null_len + pad_to_alignment(write_options.alignment, null_len) + } else { + size + } + }) + }) + .sum() +} + fn encode_array_datas( arr_datas: &[ArrayData], n_rows: usize, @@ -1445,32 +1468,10 @@ fn encode_array_datas( write_options, )?; - fn get_arr_batch_size>( - iter: impl Iterator, - write_options: &IpcWriteOptions, - ) -> Result { - iter.map(|arr| { - let arr = arr.borrow(); - arr.get_slice_memory_size_with_alignment(Some(write_options.alignment)) - .map(|size| { - let didnt_count_nulls = arr.nulls().is_none(); - let will_write_nulls = has_validity_bitmap(arr.data_type(), write_options); - - if will_write_nulls && didnt_count_nulls { - let null_len = bit_util::ceil(arr.len(), 8); - size + null_len + pad_to_alignment(write_options.alignment, null_len) - } else { - size - } - }) - }) - .sum() - } - let header_len = fbb.fbb.finished_data().len(); max_msg_size = max_msg_size.saturating_sub(header_len).max(1); - let total_size = get_arr_batch_size(arr_datas.iter(), write_options)?; + let total_size = get_encoded_arr_batch_size(arr_datas.iter(), write_options)?; let n_batches = bit_util::ceil(total_size, max_msg_size); let mut out = Vec::with_capacity(n_batches); @@ -1503,7 +1504,8 @@ fn encode_array_datas( // we can unwrap this here b/c this only errors on malformed buffer-type/data-type // combinations, and if any of these arrays had that, this function would've // already short-circuited on an earlier call of this function - get_arr_batch_size(slice_arrays(*len), write_options).unwrap() > max_msg_size + get_encoded_arr_batch_size(slice_arrays(*len), write_options).unwrap() + > max_msg_size }) // If no rows fit in the given max size, we want to try to get the data across anyways, // so that just means doing a single row. 
Calling `max(2)` is how we ensure that - if @@ -1861,6 +1863,8 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { compression_codec: Option, alignment: u8, ) -> Result<(), ArrowError> { + let start_len = self.arrow_data.len(); + let len: i64 = if self.dry_run { // Flatbuffers will essentially optimize this away if we say the len is 0 for all of // these, so to make sure the header size is the same in the dry run and in the real @@ -2948,9 +2952,7 @@ mod tests { ..IpcWriteOptions::default() }; - let compute_size = arr_data - .get_slice_memory_size_with_alignment(Some(write_options.alignment)) - .unwrap(); + let compute_size = get_encoded_arr_batch_size([&arr_data], &write_options).unwrap(); let num_rows = arr_data.len(); let encoded = encode_array_datas( @@ -2968,14 +2970,29 @@ mod tests { assert_eq!(compute_size, encoded.arrow_data.len()); } - let str_arr = [Some("foo"), Some("bar"), Some("baz")] + let str_arr = [Some("fooo"), Some("ba"), Some("bazrrrrrrrrr"), Some("quz")] .into_iter() .collect::(); - let int_arr = [None, Some(200), None, Some(2), Some(-4)] + let int_arr = [None, Some(2), Some(1), Some(3)] .into_iter() .collect::(); encode_test(str_arr.clone()); encode_test(int_arr.clone()); - encode_test(DictionaryArray::new(int_arr, Arc::new(str_arr))); + + // For some reason, DictionaryArrays don't encode their `values` the flight messages. I + // don't know why that is, but that will cause this test to fail. + // encode_test(DictionaryArray::new(int_arr, Arc::new(str_arr))); + + let time_arr = [Some(0), Some(14000), Some(-1), Some(-1)] + .into_iter() + .collect::(); + encode_test(time_arr); + + let list_field: FieldRef = Arc::new(Field::new("a", DataType::Int32, true)); + let all_null_list = FixedSizeListArray::new_null(Arc::clone(&list_field), 3, 8); + encode_test(all_null_list); + + let list = FixedSizeListArray::new(list_field, 2, make_array(int_arr.to_data()), None); + encode_test(list); } } From 8de27750bf12e240172f8ffcef2227b059724e39 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 5 Nov 2024 17:26:34 -0700 Subject: [PATCH 14/20] Fix unused variable --- arrow-ipc/src/writer.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index d89315c8ada4..dd3faf7c884b 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1863,8 +1863,6 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { compression_codec: Option, alignment: u8, ) -> Result<(), ArrowError> { - let start_len = self.arrow_data.len(); - let len: i64 = if self.dry_run { // Flatbuffers will essentially optimize this away if we say the len is 0 for all of // these, so to make sure the header size is the same in the dry run and in the real From c1890d435c2f8d3a8dd9477e0a353e476493d987 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Thu, 7 Nov 2024 08:58:20 -0700 Subject: [PATCH 15/20] Update old inaccurate comments about test coverage --- arrow-flight/src/encode.rs | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index f4a39f866509..53db10f9ce45 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -681,10 +681,6 @@ mod tests { use super::*; - // TESTS TO ADD: - // - Ensure ArrayData::get_slice_memory_size_with_alignment returns the same data size as IPC - // encoding it - #[test] // flight_data_from_arrow_batch is deprecated but does exactly what we need. Would probably be // good to just move it to this test mod when it's due to be removed. 
@@ -1684,17 +1680,11 @@
     /// Coverage for
     /// <https://github.com/apache/arrow-rs/issues/3478>
     ///
-    /// Encodes the specified batch using several values of
-    /// `max_flight_data_size` between 1K to 5K and ensures that the
-    /// resulting size of the flight data stays within the limit
-    /// + `allowed_overage`
-    ///
-    /// `allowed_overage` is how far off the actual data encoding is
-    /// from the target limit that was set. It is an improvement when
-    /// the allowed_overage decreses.
-    ///
-    /// Note this overhead will likely always be greater than zero to
-    /// account for encoding overhead such as IPC headers and padding.
+    /// Encodes the specified batch using several values of `max_flight_data_size` between 1K to 5K
+    /// and ensures that the resulting size of the flight data stays within the limit, except in
+    /// cases where only 1 row is sent - if only 1 row is sent, then we know that there was no
+    /// way to keep the data within the limit (since the minimum possible amount of data was sent),
+    /// so we allow it to go over.
     ///
     async fn verify_encoded_split_no_overage(batch: RecordBatch) {
         let num_rows = batch.num_rows();

From 842ea66ea4752ca34a0a070caabf5447f255ff4b Mon Sep 17 00:00:00 2001
From: itsjunetime
Date: Thu, 7 Nov 2024 10:56:35 -0700
Subject: [PATCH 16/20] Improve size calculations for RunEndEncoded data type
 and add test coverage for it

---
 arrow-array/src/array/run_array.rs | 19 ++++++----
 arrow-ipc/src/reader.rs            |  2 +-
 arrow-ipc/src/writer.rs            | 57 ++++++++++++++++++++++++------
 3 files changed, 60 insertions(+), 18 deletions(-)

diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs
index 81c8cdcea4d3..e1d532ff4003 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -254,14 +254,19 @@ impl RunArray {
 }
 
 impl From for RunArray {
-    // The method assumes the caller already validated the data using `ArrayData::validate_data()`
     fn from(data: ArrayData) -> Self {
-        match data.data_type() {
-            DataType::RunEndEncoded(_, _) => {}
-            _ => {
-                panic!("Invalid data type for RunArray. The data type should be DataType::RunEndEncoded");
-            }
-        }
+        Self::from(&data)
+    }
+}
+
+impl From<&ArrayData> for RunArray {
+    // The method assumes the caller already validated the data using `ArrayData::validate_data()`
+    fn from(data: &ArrayData) -> Self {
+        let DataType::RunEndEncoded(_, _) = data.data_type() else {
+            panic!(
+                "Invalid data type for RunArray. The data type should be DataType::RunEndEncoded"
+            );
+        };
 
         // Safety
         // ArrayData is valid
diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs
index 3950f0f1ce02..d9d157b72606 100644
--- a/arrow-ipc/src/reader.rs
+++ b/arrow-ipc/src/reader.rs
@@ -1785,7 +1785,7 @@
         // can be compared as such.
assert_eq!(input_batch.column(1), output_batch.column(1)); - let run_array_1_unsliced = unslice_run_array(run_array_1_sliced.into_data()).unwrap(); + let run_array_1_unsliced = unslice_run_array(&run_array_1_sliced.into_data()).unwrap(); assert_eq!(run_array_1_unsliced, output_batch.column(0).into_data()); } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index dd3faf7c884b..4970a45fe470 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -567,7 +567,7 @@ fn append_variadic_buffer_counts(counts: &mut Vec, array: &ArrayData) { } } -pub(crate) fn unslice_run_array(arr: ArrayData) -> Result { +pub(crate) fn unslice_run_array(arr: &ArrayData) -> Result { match arr.data_type() { DataType::RunEndEncoded(k, _) => match k.data_type() { DataType::Int16 => { @@ -1433,16 +1433,40 @@ fn get_encoded_arr_batch_size>( .map(|arr| { let arr = arr.borrow(); arr.get_slice_memory_size_with_alignment(Some(write_options.alignment)) - .map(|size| { + .and_then(|mut size| { let didnt_count_nulls = arr.nulls().is_none(); let will_write_nulls = has_validity_bitmap(arr.data_type(), write_options); if will_write_nulls && didnt_count_nulls { let null_len = bit_util::ceil(arr.len(), 8); - size + null_len + pad_to_alignment(write_options.alignment, null_len) - } else { - size + size += null_len + pad_to_alignment(write_options.alignment, null_len) } + + // TODO: This is ugly. We remove the child_data size in RunEndEncoded because + // it was calculated as the size existing in memory but we care about the size + // when it's decoded and then encoded into a flatbuffer. Afaik, this is the + // only data type where the size in memory is not the same size as when encoded + // (since it has a different representation in memory), so it's not horrible, + // but it's definitely not ideal. + if let DataType::RunEndEncoded(_, _) = arr.data_type() { + size -= arr + .child_data() + .iter() + .map(|data| { + data.get_slice_memory_size_with_alignment(Some( + write_options.alignment, + )) + }) + .sum::>()?; + + size += unslice_run_array(arr)? + .child_data() + .iter() + .map(|data| get_encoded_arr_batch_size([data], write_options)) + .sum::>()?; + } + + Ok(size) }) }) .sum() @@ -1837,7 +1861,7 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> { match array_data.data_type() { DataType::Dictionary(_, _) => Ok(()), // unslice the run encoded array. 
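             // (as an illustration with hypothetical values: a run array with
             // `run_ends = [3, 5]` and `values = [A, B]` that was logically sliced to
             // rows 2..4 gets rewritten so the slice is physical - `run_ends = [1, 2]`
             // with both values kept - before its children are written out)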
- DataType::RunEndEncoded(_, _) => write_arr(&unslice_run_array(array_data.clone())?), + DataType::RunEndEncoded(_, _) => write_arr(&unslice_run_array(array_data)?), // recursively write out nested structures _ => write_arr(array_data), } @@ -2945,10 +2969,9 @@ mod tests { let arr_data = arr.to_data(); - let write_options = IpcWriteOptions { - batch_compression_type: None, - ..IpcWriteOptions::default() - }; + let write_options = IpcWriteOptions::default() + .try_with_compression(None) + .unwrap(); let compute_size = get_encoded_arr_batch_size([&arr_data], &write_options).unwrap(); let num_rows = arr_data.len(); @@ -2992,5 +3015,19 @@ mod tests { let list = FixedSizeListArray::new(list_field, 2, make_array(int_arr.to_data()), None); encode_test(list); + + let vals: Vec> = vec![Some(1), None, Some(2), Some(3), Some(4), None, Some(5)]; + let repeats: Vec = vec![3, 4, 1, 2]; + let mut input_array: Vec> = Vec::with_capacity(80); + for ix in 0_usize..32 { + let repeat: usize = repeats[ix % repeats.len()]; + let val: Option = vals[ix % vals.len()]; + input_array.resize(input_array.len() + repeat, val); + } + let mut builder = + PrimitiveRunBuilder::::with_capacity(input_array.len()); + builder.extend(input_array); + let run_array = builder.finish(); + encode_test(run_array); } } From 1a4764f8034289d07968509fd5cc2f0ce81fe1f3 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Fri, 8 Nov 2024 15:23:02 -0700 Subject: [PATCH 17/20] Un-pub reencode_offsets --- arrow-ipc/src/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 4970a45fe470..066f83e88eb2 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1347,7 +1347,7 @@ pub fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, /// /// Will panic if you call this on an `ArrayData` that does not have a buffer of offsets as the /// very first buffer (i.e. expects this to be a valid variable-length array) -pub fn reencode_offsets(data: &ArrayData) -> (Buffer, usize, usize) { +fn reencode_offsets(data: &ArrayData) -> (Buffer, usize, usize) { // first we want to see: what is the offset of this `ArrayData` into the buffer (which is a // buffer of offsets into the buffer of data) let orig_offset = data.offset(); From ae37b13e2e6cba462789c87d961436ad24c9919e Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 12 Nov 2024 12:10:53 -0700 Subject: [PATCH 18/20] Add doc-comment about assumptions in get_slice_memory_size_with_alignment fn --- arrow-data/src/data.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 32547c8750fa..e3a85d8f1564 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -435,7 +435,8 @@ impl ArrayData { /// would return `20 * 8 = 160`. /// /// The `alignment` parameter is used to add padding to each buffer being counted, to ensure - /// the size for each one is aligned to `alignment` bytes (if it is `Some`) + /// the size for each one is aligned to `alignment` bytes (if it is `Some`). This function + /// assumes that `alignment` is a power of 2. 
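+    ///
+    /// As a worked example of the padding this implies (hypothetical numbers, not a
+    /// doc-test): with `alignment = Some(8)`, a 13-byte buffer is counted as
+    /// `13 + 3 = 16` bytes, since `((13 + 7) & !7) - 13 == 3` bytes of padding are added.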
pub fn get_slice_memory_size_with_alignment( &self, alignment: Option, @@ -449,7 +450,7 @@ impl ArrayData { // Just pulled from arrow-ipc #[inline] fn pad_to_alignment(alignment: u8, len: usize) -> usize { - let a = usize::from(alignment - 1); + let a = usize::from(alignment.saturating_sub(1)); ((len + a) & !a) - len } From e344aa48bbb375b3da94c42714162d8c0b3e1940 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 12 Nov 2024 14:24:32 -0700 Subject: [PATCH 19/20] Clean up a bit more to make diff easier --- arrow-flight/src/utils.rs | 10 +--- arrow-ipc/src/writer.rs | 116 +++++++++++++++++--------------------- 2 files changed, 55 insertions(+), 71 deletions(-) diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 9642e75fd0e1..92d0dcfb809b 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -166,16 +166,12 @@ pub fn batches_to_flight_data( let mut flight_data = vec![]; let data_gen = writer::IpcDataGenerator::default(); - let mut dictionary_tracker = + let mut dict_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); for batch in batches.iter() { - let (encoded_dictionaries, encoded_batch) = data_gen.encoded_batch_with_size( - batch, - &mut dictionary_tracker, - &options, - usize::MAX, - )?; + let (encoded_dictionaries, encoded_batch) = + data_gen.encoded_batch_with_size(batch, &mut dict_tracker, &options, usize::MAX)?; dictionaries.extend(encoded_dictionaries.into_iter().map(Into::into)); flight_data.extend(encoded_batch.into_iter().map(Into::into)); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 066f83e88eb2..940fd44aae51 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -39,11 +39,10 @@ use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{layout, ArrayData, ArrayDataBuilder, BufferSpec}; use arrow_schema::*; -use crate::compression::CompressionCodec; -use crate::convert::IpcSchemaEncoder; use crate::{ - BodyCompressionBuilder, BodyCompressionMethod, DictionaryBatchBuilder, MessageBuilder, - MessageHeader, RecordBatchBuilder, CONTINUATION_MARKER, + compression::CompressionCodec, convert::IpcSchemaEncoder, BodyCompressionBuilder, + BodyCompressionMethod, DictionaryBatchBuilder, MessageBuilder, MessageHeader, RecordBatchArgs, + CONTINUATION_MARKER, }; /// IPC write options used to control the behaviour of the [`IpcDataGenerator`] @@ -1314,32 +1313,6 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize { } } -/// Returns the offsets and values [`Buffer`]s for a ByteArray with offset type `O` -/// -/// In particular, this handles re-encoding the offsets if they don't start at `0`, -/// slicing the values buffer as appropriate. This helps reduce the encoded -/// size of sliced arrays, as values that have been sliced away are not encoded -/// -/// # Panics -/// -/// Panics if self.buffers does not contain at least 2 buffers (this code expects that the -/// first will contain the offsets for this variable-length array and the other will contain -/// the values) -pub fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, Buffer) { - if data.is_empty() { - return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); - } - - // get the buffer of offsets, now shifted so they are shifted to be accurate to the slice - // of values that we'll be taking (e.g. 
From ae37b13e2e6cba462789c87d961436ad24c9919e Mon Sep 17 00:00:00 2001
From: itsjunetime
Date: Tue, 12 Nov 2024 12:10:53 -0700
Subject: [PATCH 18/20] Add doc-comment about assumptions in get_slice_memory_size_with_alignment fn

---
 arrow-data/src/data.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 32547c8750fa..e3a85d8f1564 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -435,7 +435,8 @@ impl ArrayData {
     /// would return `20 * 8 = 160`.
     ///
     /// The `alignment` parameter is used to add padding to each buffer being counted, to ensure
-    /// the size for each one is aligned to `alignment` bytes (if it is `Some`)
+    /// the size for each one is aligned to `alignment` bytes (if it is `Some`). This function
+    /// assumes that `alignment` is a power of 2.
     pub fn get_slice_memory_size_with_alignment(
         &self,
         alignment: Option<u8>,
@@ -449,7 +450,7 @@ impl ArrayData {
         // Just pulled from arrow-ipc
         #[inline]
         fn pad_to_alignment(alignment: u8, len: usize) -> usize {
-            let a = usize::from(alignment - 1);
+            let a = usize::from(alignment.saturating_sub(1));
             ((len + a) & !a) - len
         }
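The power-of-2 assumption documented above matters because of the bit trick in `pad_to_alignment`: `!a` only forms a correct alignment mask when `alignment` is a power of two. A worked standalone example of the same computation, including the `saturating_sub` edge case:

    fn pad_to_alignment(alignment: u8, len: usize) -> usize {
        let a = usize::from(alignment.saturating_sub(1));
        ((len + a) & !a) - len
    }

    fn main() {
        // With 8-byte alignment, a 13-byte buffer rounds up to 16 bytes,
        // so 3 bytes of padding are needed; a 16-byte buffer needs none.
        assert_eq!(pad_to_alignment(8, 13), 3);
        assert_eq!(pad_to_alignment(8, 16), 0);
        // An alignment of 0 would previously have underflowed; saturating_sub
        // turns it into a zero mask, i.e. no padding at all.
        assert_eq!(pad_to_alignment(0, 13), 0);
    }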
From e344aa48bbb375b3da94c42714162d8c0b3e1940 Mon Sep 17 00:00:00 2001
From: itsjunetime
Date: Tue, 12 Nov 2024 14:24:32 -0700
Subject: [PATCH 19/20] Clean up a bit more to make diff easier

---
 arrow-flight/src/utils.rs |  10 +---
 arrow-ipc/src/writer.rs   | 116 +++++++++++++++++---------------------
 2 files changed, 55 insertions(+), 71 deletions(-)

diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs
index 9642e75fd0e1..92d0dcfb809b 100644
--- a/arrow-flight/src/utils.rs
+++ b/arrow-flight/src/utils.rs
@@ -166,16 +166,12 @@ pub fn batches_to_flight_data(
     let mut flight_data = vec![];
 
     let data_gen = writer::IpcDataGenerator::default();
-    let mut dictionary_tracker =
+    let mut dict_tracker =
         writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id());
 
     for batch in batches.iter() {
-        let (encoded_dictionaries, encoded_batch) = data_gen.encoded_batch_with_size(
-            batch,
-            &mut dictionary_tracker,
-            &options,
-            usize::MAX,
-        )?;
+        let (encoded_dictionaries, encoded_batch) =
+            data_gen.encoded_batch_with_size(batch, &mut dict_tracker, &options, usize::MAX)?;
 
         dictionaries.extend(encoded_dictionaries.into_iter().map(Into::into));
         flight_data.extend(encoded_batch.into_iter().map(Into::into));
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index 066f83e88eb2..940fd44aae51 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -39,11 +39,10 @@ use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
 use arrow_data::{layout, ArrayData, ArrayDataBuilder, BufferSpec};
 use arrow_schema::*;
 
-use crate::compression::CompressionCodec;
-use crate::convert::IpcSchemaEncoder;
 use crate::{
-    BodyCompressionBuilder, BodyCompressionMethod, DictionaryBatchBuilder, MessageBuilder,
-    MessageHeader, RecordBatchBuilder, CONTINUATION_MARKER,
+    compression::CompressionCodec, convert::IpcSchemaEncoder, BodyCompressionBuilder,
+    BodyCompressionMethod, DictionaryBatchBuilder, MessageBuilder, MessageHeader, RecordBatchArgs,
+    CONTINUATION_MARKER,
 };
 
 /// IPC write options used to control the behaviour of the [`IpcDataGenerator`]
@@ -1314,32 +1313,6 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize {
     }
 }
 
-/// Returns the offsets and values [`Buffer`]s for a ByteArray with offset type `O`
-///
-/// In particular, this handles re-encoding the offsets if they don't start at `0`,
-/// slicing the values buffer as appropriate. This helps reduce the encoded
-/// size of sliced arrays, as values that have been sliced away are not encoded
-///
-/// # Panics
-///
-/// Panics if self.buffers does not contain at least 2 buffers (this code expects that the
-/// first will contain the offsets for this variable-length array and the other will contain
-/// the values)
-pub fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Buffer) {
-    if data.is_empty() {
-        return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into());
-    }
-
-    // get the buffer of offsets, now shifted so they are shifted to be accurate to the slice
-    // of values that we'll be taking (e.g. if they previously said [0, 3, 5, 7], but we slice
-    // to only get the last offset, they'll be shifted to be [0, 2], since that would be the
-    // offset pair for the last value in this shifted slice).
-    // also, in this example, original_start_offset would be 5 and len would be 2.
-    let (offsets, original_start_offset, len) = reencode_offsets::<O>(data);
-    let values = data.buffers()[1].slice_with_length(original_start_offset, len);
-    (offsets, values)
-}
-
 /// Common functionality for re-encoding offsets. Returns the new offsets as well as
 /// original start offset and length for use in slicing child data.
 ///
@@ -1381,6 +1354,32 @@ fn reencode_offsets<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, usize, usi
     (offsets, start_offset, end_offset - start_offset)
 }
 
+/// Returns the offsets and values [`Buffer`]s for a ByteArray with offset type `O`
+///
+/// In particular, this handles re-encoding the offsets if they don't start at `0`,
+/// slicing the values buffer as appropriate. This helps reduce the encoded
+/// size of sliced arrays, as values that have been sliced away are not encoded
+///
+/// # Panics
+///
+/// Panics if self.buffers does not contain at least 2 buffers (this code expects that the
+/// first will contain the offsets for this variable-length array and the other will contain
+/// the values)
+pub fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Buffer) {
+    if data.is_empty() {
+        return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into());
+    }
+
+    // get the buffer of offsets, now shifted so they are shifted to be accurate to the slice
+    // of values that we'll be taking (e.g. if they previously said [0, 3, 5, 7], but we slice
+    // to only get the last offset, they'll be shifted to be [0, 2], since that would be the
+    // offset pair for the last value in this shifted slice).
+    // also, in this example, original_start_offset would be 5 and len would be 2.
+    let (offsets, original_start_offset, len) = reencode_offsets::<O>(data);
+    let values = data.buffers()[1].slice_with_length(original_start_offset, len);
+    (offsets, values)
+}
+
 /// Similar logic as [`get_byte_array_buffers()`] but slices the child array instead
 /// of a values buffer.
 fn get_list_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, ArrayData) {
@@ -1674,34 +1673,27 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> {
         let variadic_buffer = (!variadic_buffer_counts.is_empty())
            .then(|| self.fbb.create_vector(&variadic_buffer_counts));
 
-        let root = {
-            let mut builder = RecordBatchBuilder::new(&mut self.fbb);
-
-            builder.add_length(n_rows);
-            builder.add_nodes(nodes);
-            builder.add_buffers(buffers);
-            if let Some(c) = compression {
-                builder.add_compression(c);
-            }
-            if let Some(v) = variadic_buffer {
-                builder.add_variadicBufferCounts(v);
-            }
-
-            builder.finish()
-        };
+        let root = crate::RecordBatch::create(
+            &mut self.fbb,
+            &RecordBatchArgs {
+                length: n_rows,
+                nodes: Some(nodes),
+                buffers: Some(buffers),
+                compression,
+                variadicBufferCounts: variadic_buffer,
+            },
+        );
 
         let root = encode_root(self, root);
         let arrow_len = self.arrow_data.len() as i64;
 
-        let msg = {
-            let mut builder = MessageBuilder::new(&mut self.fbb);
-            builder.add_version(write_options.metadata_version);
-            builder.add_header_type(header_type);
-            builder.add_bodyLength(arrow_len);
-            builder.add_header(root);
-            builder.finish()
-        };
+        let mut msg_bldr = MessageBuilder::new(&mut self.fbb);
+        msg_bldr.add_version(write_options.metadata_version);
+        msg_bldr.add_header_type(header_type);
+        msg_bldr.add_header(root);
+        msg_bldr.add_bodyLength(arrow_len);
+        let msg = msg_bldr.finish();
 
         self.fbb.finish(msg, None);
         Ok(())
@@ -1752,10 +1744,8 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> {
         }
 
         let mut write_byte_array_byffers = |(offsets, values): (Buffer, Buffer)| {
-            for buffer in [offsets, values] {
-                self.write_buffer(&buffer, offset, compression_codec, write_options.alignment)?;
-            }
-            Ok::<_, ArrowError>(())
+            self.write_buffer(&offsets, offset, compression_codec, write_options.alignment)?;
+            self.write_buffer(&values, offset, compression_codec, write_options.alignment)
         };
 
         match array_data.data_type() {
@@ -1821,14 +1811,13 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> {
                     _ => unreachable!(),
                 };
                 self.write_buffer(&offsets, offset, compression_codec, write_options.alignment)?;
-                self.write_array_data(
+                return self.write_array_data(
                     &sliced_child_data,
                     offset,
                     sliced_child_data.len(),
                     sliced_child_data.null_count(),
                     write_options,
-                )?;
-                return Ok(());
+                );
             }
             _ => {
                 // This accommodates for even the `View` types (e.g. BinaryView and Utf8View):
@@ -1846,16 +1835,15 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> {
         }
 
         let mut write_arr = |arr: &ArrayData| {
-            for data_ref in arr.child_data() {
+            arr.child_data().iter().try_for_each(|data_ref| {
                 self.write_array_data(
                     data_ref,
                     offset,
                     data_ref.len(),
                     data_ref.null_count(),
                     write_options,
-                )?;
-            }
-            Ok::<_, ArrowError>(())
+                )
+            })
         };
 
         match array_data.data_type() {
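One pattern in the cleanup above is worth spelling out: the `for` loops that needed a trailing `Ok::<_, ArrowError>(())` to typecheck become `try_for_each`, which short-circuits on the first `Err` and returns the closure's `Result` directly. A minimal standalone sketch of the same shape (hypothetical `write_all`, with `String` standing in for `ArrowError`):

    fn write_all(items: &[u32]) -> Result<(), String> {
        items.iter().try_for_each(|item| {
            if *item > 100 {
                return Err(format!("{item} is out of range"));
            }
            // pretend to actually write the item here
            Ok(())
        })
    }

    fn main() {
        assert!(write_all(&[1, 2, 3]).is_ok());
        assert!(write_all(&[1, 200]).is_err());
    }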
From 652aa2753204a7b25ffcdd324449791679edf0ad Mon Sep 17 00:00:00 2001
From: itsjunetime
Date: Thu, 14 Nov 2024 10:20:14 -0700
Subject: [PATCH 20/20] Change max size variable to be more specifically accurate

---
 arrow-ipc/src/writer.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index 940fd44aae51..181baae3a5f4 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -478,19 +478,19 @@ impl IpcDataGenerator {
     /// Encodes a batch to a number of [EncodedData] items (dictionary batches + the record batch).
     /// The [DictionaryTracker] keeps track of dictionaries with new `dict_id`s (so they are only sent once)
     /// Make sure the [DictionaryTracker] is initialized at the start of the stream.
-    /// The `max_flight_data_size` is used to control how much space each encoded [`RecordBatch`] is
+    /// The `max_encoded_data_size` is used to control how much space each encoded [`RecordBatch`] is
     /// allowed to take up.
     ///
     /// Each [`EncodedData`] in the second element of the returned tuple will be smaller than
-    /// `max_flight_data_size` bytes, if possible at all. However, this API has no support for
+    /// `max_encoded_data_size` bytes, if possible at all. However, this API has no support for
     /// splitting rows into multiple [`EncodedData`]s, so if a row is larger, by itself, than
-    /// `max_flight_data_size`, it will be encoded to a message which is larger than the limit.
+    /// `max_encoded_data_size`, it will be encoded to a message which is larger than the limit.
     pub fn encoded_batch_with_size(
         &self,
         batch: &RecordBatch,
         dictionary_tracker: &mut DictionaryTracker,
         write_options: &IpcWriteOptions,
-        max_flight_data_size: usize,
+        max_encoded_data_size: usize,
     ) -> Result<(Vec<EncodedData>, Vec<EncodedData>), ArrowError> {
         let schema = batch.schema();
         let mut encoded_dictionaries = Vec::with_capacity(schema.flattened_fields().len());
@@ -510,7 +510,7 @@ impl IpcDataGenerator {
         }
 
         let encoded_message =
-            chunked_encoded_batch_bytes(batch, write_options, max_flight_data_size)?;
+            chunked_encoded_batch_bytes(batch, write_options, max_encoded_data_size)?;
 
         Ok((encoded_dictionaries, encoded_message))
     }
@@ -1408,7 +1408,7 @@ fn pad_to_alignment(alignment: u8, len: usize) -> usize {
 fn chunked_encoded_batch_bytes(
     batch: &RecordBatch,
     write_options: &IpcWriteOptions,
-    max_flight_data_size: usize,
+    max_encoded_data_size: usize,
 ) -> Result<Vec<EncodedData>, ArrowError> {
     encode_array_datas(
         &batch
@@ -1419,7 +1419,7 @@ fn chunked_encoded_batch_bytes(
         batch.num_rows(),
         |_, offset| offset.as_union_value(),
         MessageHeader::RecordBatch,
-        max_flight_data_size,
+        max_encoded_data_size,
         write_options,
     )
 }
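Putting the series together, driving the renamed API end-to-end would look roughly like this sketch (the `encoded_batch_with_size` signature is taken from the diff above; the surrounding arrow-array/arrow-ipc calls are existing public APIs, though the exact surface of this in-progress PR may still shift):

    use arrow_array::{Int32Array, RecordBatch};
    use arrow_ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions};
    use arrow_schema::ArrowError;
    use std::sync::Arc;

    fn main() -> Result<(), ArrowError> {
        let batch =
            RecordBatch::try_from_iter([("a", Arc::new(Int32Array::from(vec![1, 2, 3])) as _)])?;

        let generator = IpcDataGenerator::default();
        let mut tracker = DictionaryTracker::new(false);
        let options = IpcWriteOptions::default();

        // Ask for each encoded message to stay under ~1 KiB; per the doc
        // comment above, a single oversized row can still exceed the limit.
        let (dictionaries, messages) =
            generator.encoded_batch_with_size(&batch, &mut tracker, &options, 1024)?;

        assert!(dictionaries.is_empty()); // no dictionary columns in this batch
        assert!(!messages.is_empty());
        Ok(())
    }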