diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 8d0be5f9f8c4..a03dd25d5fab 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -555,7 +555,12 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
     /// of the current memory usage and not the final anticipated encoded size.
     #[cfg(feature = "arrow")]
     pub(crate) fn memory_size(&self) -> usize {
-        self.column_metrics.total_bytes_written as usize + self.encoder.estimated_memory_size()
+        self.data_pages
+            .iter()
+            .map(|page| page.data().len())
+            .sum::<usize>()
+            + self.column_metrics.total_bytes_written as usize
+            + self.encoder.estimated_memory_size()
     }
 
     /// Returns total number of bytes written by this column writer so far.
@@ -3422,6 +3427,25 @@ mod tests {
         assert!(stats.max_bytes_opt().is_none());
     }
 
+    #[test]
+    fn test_column_writer_memory_size() {
+        let page_writer = get_test_page_writer();
+        let props = Default::default();
+        let mut writer = get_test_column_writer::<Int32Type>(page_writer, 0, 0, props);
+        assert_eq!(writer.memory_size(), 256);
+
+        writer.write_batch(&[1, 2, 3, 4], None, None).unwrap();
+        writer.add_data_page().unwrap();
+        let size_with_one_page = writer.memory_size();
+        assert_eq!(size_with_one_page, 256 + 20);
+
+        writer.write_batch(&[5, 6, 7, 8], None, None).unwrap();
+        writer.add_data_page().unwrap();
+        let size_with_two_pages = writer.memory_size();
+        // different pages have different compressed lengths
+        assert_eq!(size_with_two_pages, 256 + 20 + 21);
+    }
+
     fn write_multiple_pages<T: DataType>(
         column_descr: &Arc<ColumnDescriptor>,
         pages: &[&[Option<T::T>]],