From 69b17ad40294003b9dfd25d85f2523abd1a0034e Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Thu, 15 Aug 2024 12:00:55 -0500 Subject: [PATCH] parquet Statistics - deprecate `has_*` APIs and add `_opt` functions that return `Option` (#6216) * update public api Statistics::min to return an option. I first re-named the existing method to `min_unchecked` and made it internal to the crate. I then added a `pub min(&self) -> Opiton<&T>` method. I figure we can first change the public API before deciding what to do about internal usage. Ref: https://github.com/apache/arrow-rs/issues/6093 * update public api Statistics::max to return an option. I first re-named the existing method to `max_unchecked` and made it internal to the crate. I then added a `pub max(&self) -> Opiton<&T>` method. I figure we can first change the public API before deciding what to do about internal usage. Ref: https://github.com/apache/arrow-rs/issues/6093 * cargo fmt * remove Statistics::has_min_max_set from the public api Ref: https://github.com/apache/arrow-rs/issues/6093 * update impl HeapSize for ValueStatistics to use new min and max api * migrate all tests to new Statistics min and max api * make Statistics::null_count return Option This removes ambiguity around whether the between all values are non-null or just that the null count stat is missing Ref: https://github.com/apache/arrow-rs/issues/6215 * update expected metadata memory size tests Changing null_count from u64 to Option increases the memory size and layout of the metadata. I included these tests as a separate commit to call extra attention to it. * add TODO question on is_min_max_backwards_compatible * Apply suggestions from code review Co-authored-by: Andrew Lamb * update ValueStatistics::max docs * rename new optional ValueStatistics::max to max_opt Per PR review, we will deprecate the old API instead of introducing a brekaing change. Ref: https://github.com/apache/arrow-rs/pull/6216#pullrequestreview-2236537291 * rename new optional ValueStatistics::min to min_opt * add Statistics:{min,max}_bytes_opt This adds the API and migrates all of the test usage. The old APIs will be deprecated next. * update make_stats_iterator macro to use *_opt methods * deprecate non *_opt Statistics and ValueStatistics methods * remove stale TODO comments * remove has_min_max_set check from make_decimal_stats_iterator The check is unnecessary now that the stats funcs return Option when unset. * deprecate has_min_max_set An internal version was also created because it is used so extensively in testing. * switch to null_count_opt and reintroduce deprecated null_count and has_nulls * remove redundant test assertions of stats._internal_has_min_max_set This removes the assertion from any test that subsequently unwraps both min_opt and max_opt. * replace negated test assertions of stats._internal_has_mix_max_set with assertions on min_opt and max_opt This removes all use of Statistics::_internal_has_min_max_set from the code base, and so it is also removed. * Revert changes to parquet writing, update comments --------- Co-authored-by: Andrew Lamb --- parquet/src/arrow/arrow_reader/statistics.rs | 109 ++++-- parquet/src/arrow/arrow_writer/mod.rs | 32 +- parquet/src/column/page.rs | 10 +- parquet/src/column/writer/mod.rs | 322 +++++++++--------- parquet/src/file/metadata/memory.rs | 10 +- parquet/src/file/metadata/mod.rs | 14 +- parquet/src/file/serialized_reader.rs | 5 +- parquet/src/file/statistics.rs | 332 ++++++++++++++----- parquet/src/file/writer.rs | 6 +- 9 files changed, 521 insertions(+), 319 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index d487967c23b3..602a9ad5e506 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -116,7 +116,7 @@ macro_rules! make_stats_iterator { let next = self.iter.next(); next.map(|x| { x.and_then(|stats| match stats { - $parquet_statistics_type(s) if stats.has_min_max_set() => Some(s.$func()), + $parquet_statistics_type(s) => s.$func(), _ => None, }) }) @@ -131,45 +131,85 @@ macro_rules! make_stats_iterator { make_stats_iterator!( MinBooleanStatsIterator, - min, + min_opt, ParquetStatistics::Boolean, bool ); make_stats_iterator!( MaxBooleanStatsIterator, - max, + max_opt, ParquetStatistics::Boolean, bool ); -make_stats_iterator!(MinInt32StatsIterator, min, ParquetStatistics::Int32, i32); -make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, i32); -make_stats_iterator!(MinInt64StatsIterator, min, ParquetStatistics::Int64, i64); -make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, i64); -make_stats_iterator!(MinFloatStatsIterator, min, ParquetStatistics::Float, f32); -make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, f32); -make_stats_iterator!(MinDoubleStatsIterator, min, ParquetStatistics::Double, f64); -make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, f64); +make_stats_iterator!( + MinInt32StatsIterator, + min_opt, + ParquetStatistics::Int32, + i32 +); +make_stats_iterator!( + MaxInt32StatsIterator, + max_opt, + ParquetStatistics::Int32, + i32 +); +make_stats_iterator!( + MinInt64StatsIterator, + min_opt, + ParquetStatistics::Int64, + i64 +); +make_stats_iterator!( + MaxInt64StatsIterator, + max_opt, + ParquetStatistics::Int64, + i64 +); +make_stats_iterator!( + MinFloatStatsIterator, + min_opt, + ParquetStatistics::Float, + f32 +); +make_stats_iterator!( + MaxFloatStatsIterator, + max_opt, + ParquetStatistics::Float, + f32 +); +make_stats_iterator!( + MinDoubleStatsIterator, + min_opt, + ParquetStatistics::Double, + f64 +); +make_stats_iterator!( + MaxDoubleStatsIterator, + max_opt, + ParquetStatistics::Double, + f64 +); make_stats_iterator!( MinByteArrayStatsIterator, - min_bytes, + min_bytes_opt, ParquetStatistics::ByteArray, [u8] ); make_stats_iterator!( MaxByteArrayStatsIterator, - max_bytes, + max_bytes_opt, ParquetStatistics::ByteArray, [u8] ); make_stats_iterator!( MinFixedLenByteArrayStatsIterator, - min_bytes, + min_bytes_opt, ParquetStatistics::FixedLenByteArray, [u8] ); make_stats_iterator!( MaxFixedLenByteArrayStatsIterator, - max_bytes, + max_bytes_opt, ParquetStatistics::FixedLenByteArray, [u8] ); @@ -218,19 +258,18 @@ macro_rules! make_decimal_stats_iterator { fn next(&mut self) -> Option { let next = self.iter.next(); next.map(|x| { - x.and_then(|stats| { - if !stats.has_min_max_set() { - return None; + x.and_then(|stats| match stats { + ParquetStatistics::Int32(s) => { + s.$func().map(|x| $stat_value_type::from(*x)) } - match stats { - ParquetStatistics::Int32(s) => Some($stat_value_type::from(*s.$func())), - ParquetStatistics::Int64(s) => Some($stat_value_type::from(*s.$func())), - ParquetStatistics::ByteArray(s) => Some($convert_func(s.$bytes_func())), - ParquetStatistics::FixedLenByteArray(s) => { - Some($convert_func(s.$bytes_func())) - } - _ => None, + ParquetStatistics::Int64(s) => { + s.$func().map(|x| $stat_value_type::from(*x)) } + ParquetStatistics::ByteArray(s) => s.$bytes_func().map($convert_func), + ParquetStatistics::FixedLenByteArray(s) => { + s.$bytes_func().map($convert_func) + } + _ => None, }) }) } @@ -244,29 +283,29 @@ macro_rules! make_decimal_stats_iterator { make_decimal_stats_iterator!( MinDecimal128StatsIterator, - min, - min_bytes, + min_opt, + min_bytes_opt, i128, from_bytes_to_i128 ); make_decimal_stats_iterator!( MaxDecimal128StatsIterator, - max, - max_bytes, + max_opt, + max_bytes_opt, i128, from_bytes_to_i128 ); make_decimal_stats_iterator!( MinDecimal256StatsIterator, - min, - min_bytes, + min_opt, + min_bytes_opt, i256, from_bytes_to_i256 ); make_decimal_stats_iterator!( MaxDecimal256StatsIterator, - max, - max_bytes, + max_opt, + max_bytes_opt, i256, from_bytes_to_i256 ); @@ -1347,7 +1386,7 @@ impl<'a> StatisticsConverter<'a> { let null_counts = metadatas .into_iter() .map(|x| x.column(parquet_index).statistics()) - .map(|s| s.map(|s| s.null_count())); + .map(|s| s.and_then(|s| s.null_count_opt())); Ok(UInt64Array::from_iter(null_counts)) } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 0c07f541bd8a..5c5f28ac0f72 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2541,10 +2541,15 @@ mod tests { row_offset += column.num_values() as usize; let stats = column.statistics().unwrap(); - assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { - assert_eq!(*stats.min() as u32, *src_slice.iter().min().unwrap()); - assert_eq!(*stats.max() as u32, *src_slice.iter().max().unwrap()); + assert_eq!( + *stats.min_opt().unwrap() as u32, + *src_slice.iter().min().unwrap() + ); + assert_eq!( + *stats.max_opt().unwrap() as u32, + *src_slice.iter().max().unwrap() + ); } else { panic!("Statistics::Int32 missing") } @@ -2582,10 +2587,15 @@ mod tests { row_offset += column.num_values() as usize; let stats = column.statistics().unwrap(); - assert!(stats.has_min_max_set()); if let Statistics::Int64(stats) = stats { - assert_eq!(*stats.min() as u64, *src_slice.iter().min().unwrap()); - assert_eq!(*stats.max() as u64, *src_slice.iter().max().unwrap()); + assert_eq!( + *stats.min_opt().unwrap() as u64, + *src_slice.iter().min().unwrap() + ); + assert_eq!( + *stats.max_opt().unwrap() as u64, + *src_slice.iter().max().unwrap() + ); } else { panic!("Statistics::Int64 missing") } @@ -2608,7 +2618,7 @@ mod tests { assert_eq!(row_group.num_columns(), 1); let column = row_group.column(0); let stats = column.statistics().unwrap(); - assert_eq!(stats.null_count(), 2); + assert_eq!(stats.null_count_opt(), Some(2)); } } @@ -3069,8 +3079,8 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { - let min = byte_array_stats.min(); - let max = byte_array_stats.max(); + let min = byte_array_stats.min_opt().unwrap(); + let max = byte_array_stats.max_opt().unwrap(); assert_eq!(min.as_bytes(), &[b'a']); assert_eq!(max.as_bytes(), &[b'd']); @@ -3141,8 +3151,8 @@ mod tests { // Column chunk of column "a" should have chunk level statistics if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { - let min = byte_array_stats.min(); - let max = byte_array_stats.max(); + let min = byte_array_stats.min_opt().unwrap(); + let max = byte_array_stats.max_opt().unwrap(); assert_eq!(min.as_bytes(), &[b'a']); assert_eq!(max.as_bytes(), &[b'd']); diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 585d1951c14a..e3931dfe9e2b 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -370,7 +370,7 @@ mod tests { encoding: Encoding::PLAIN, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, - statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)), }; assert_eq!(data_page.page_type(), PageType::DATA_PAGE); assert_eq!(data_page.buffer(), vec![0, 1, 2].as_slice()); @@ -378,7 +378,7 @@ mod tests { assert_eq!(data_page.encoding(), Encoding::PLAIN); assert_eq!( data_page.statistics(), - Some(&Statistics::int32(Some(1), Some(2), None, 1, true)) + Some(&Statistics::int32(Some(1), Some(2), None, Some(1), true)) ); let data_page_v2 = Page::DataPageV2 { @@ -390,7 +390,7 @@ mod tests { def_levels_byte_len: 30, rep_levels_byte_len: 40, is_compressed: false, - statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)), }; assert_eq!(data_page_v2.page_type(), PageType::DATA_PAGE_V2); assert_eq!(data_page_v2.buffer(), vec![0, 1, 2].as_slice()); @@ -398,7 +398,7 @@ mod tests { assert_eq!(data_page_v2.encoding(), Encoding::PLAIN); assert_eq!( data_page_v2.statistics(), - Some(&Statistics::int32(Some(1), Some(2), None, 1, true)) + Some(&Statistics::int32(Some(1), Some(2), None, Some(1), true)) ); let dict_page = Page::DictionaryPage { @@ -422,7 +422,7 @@ mod tests { encoding: Encoding::PLAIN, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, - statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)), }; let cpage = CompressedPage::new(data_page, 5); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 519a219f943d..8ea2878317e8 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -769,8 +769,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } Some(stat) => { // Check if min/max are still ascending/descending across pages - let new_min = stat.min(); - let new_max = stat.max(); + let new_min = stat.min_opt().unwrap(); + let new_max = stat.max_opt().unwrap(); if let Some((last_min, last_max)) = &self.last_non_null_data_page_min_max { if self.data_page_boundary_ascending { // If last min/max are greater than new min/max then not ascending anymore @@ -797,12 +797,12 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { null_page, self.truncate_min_value( self.props.column_index_truncate_length(), - stat.min_bytes(), + stat.min_bytes_opt().unwrap(), ) .0, self.truncate_max_value( self.props.column_index_truncate_length(), - stat.max_bytes(), + stat.max_bytes_opt().unwrap(), ) .0, self.page_metrics.num_page_nulls as i64, @@ -810,8 +810,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } else { self.column_index_builder.append( null_page, - stat.min_bytes().to_vec(), - stat.max_bytes().to_vec(), + stat.min_bytes_opt().unwrap().to_vec(), + stat.max_bytes_opt().unwrap().to_vec(), self.page_metrics.num_page_nulls as i64, ); } @@ -897,7 +897,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Some(min), Some(max), None, - self.page_metrics.num_page_nulls, + Some(self.page_metrics.num_page_nulls), false, ), ) @@ -1066,28 +1066,28 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.column_metrics.min_column_value.clone(), self.column_metrics.max_column_value.clone(), self.column_metrics.column_distinct_count, - self.column_metrics.num_column_nulls, + Some(self.column_metrics.num_column_nulls), false, ) .with_backwards_compatible_min_max(backwards_compatible_min_max) .into(); let statistics = match statistics { - Statistics::ByteArray(stats) if stats.has_min_max_set() => { + Statistics::ByteArray(stats) if stats._internal_has_min_max_set() => { let (min, did_truncate_min) = self.truncate_min_value( self.props.statistics_truncate_length(), - stats.min_bytes(), + stats.min_bytes_opt().unwrap(), ); let (max, did_truncate_max) = self.truncate_max_value( self.props.statistics_truncate_length(), - stats.max_bytes(), + stats.max_bytes_opt().unwrap(), ); Statistics::ByteArray( ValueStatistics::new( Some(min.into()), Some(max.into()), stats.distinct_count(), - stats.null_count(), + stats.null_count_opt(), backwards_compatible_min_max, ) .with_max_is_exact(!did_truncate_max) @@ -1095,22 +1095,22 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { ) } Statistics::FixedLenByteArray(stats) - if (stats.has_min_max_set() && self.can_truncate_value()) => + if (stats._internal_has_min_max_set() && self.can_truncate_value()) => { let (min, did_truncate_min) = self.truncate_min_value( self.props.statistics_truncate_length(), - stats.min_bytes(), + stats.min_bytes_opt().unwrap(), ); let (max, did_truncate_max) = self.truncate_max_value( self.props.statistics_truncate_length(), - stats.max_bytes(), + stats.max_bytes_opt().unwrap(), ); Statistics::FixedLenByteArray( ValueStatistics::new( Some(min.into()), Some(max.into()), stats.distinct_count(), - stats.null_count(), + stats.null_count_opt(), backwards_compatible_min_max, ) .with_max_is_exact(!did_truncate_max) @@ -1841,12 +1841,11 @@ mod tests { assert_eq!(metadata.data_page_offset(), 0); assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { - assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min(), &1); - assert_eq!(stats.max(), &4); + assert_eq!(stats.min_opt().unwrap(), &1); + assert_eq!(stats.max_opt().unwrap(), &4); } else { panic!("expecting Statistics::Int32"); } @@ -1886,17 +1885,16 @@ mod tests { .unwrap(); let metadata = writer.close().unwrap().metadata; if let Some(stats) = metadata.statistics() { - assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!( - stats.min(), + stats.min_opt().unwrap(), &ByteArray::from(vec![ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, 172u8, 19u8, 35u8, 231u8, 90u8, 0u8, 0u8, ]) ); assert_eq!( - stats.max(), + stats.max_opt().unwrap(), &ByteArray::from(vec![ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 41u8, 162u8, 36u8, 26u8, 246u8, 44u8, 0u8, 0u8, @@ -1923,10 +1921,9 @@ mod tests { writer.write_batch(&[0, 1, 2, 3, 4, 5], None, None).unwrap(); let metadata = writer.close().unwrap().metadata; if let Some(stats) = metadata.statistics() { - assert!(stats.has_min_max_set()); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min(), &0,); - assert_eq!(stats.max(), &5,); + assert_eq!(stats.min_opt().unwrap(), &0,); + assert_eq!(stats.max_opt().unwrap(), &5,); } else { panic!("expecting Statistics::Int32"); } @@ -1970,12 +1967,11 @@ mod tests { assert_eq!(metadata.data_page_offset(), 0); assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { - assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min(), &-17); - assert_eq!(stats.max(), &9000); + assert_eq!(stats.min_opt().unwrap(), &-17); + assert_eq!(stats.max_opt().unwrap(), &9000); } else { panic!("expecting Statistics::Int32"); } @@ -2000,9 +1996,9 @@ mod tests { let r = writer.close().unwrap(); let stats = r.metadata.statistics().unwrap(); - assert_eq!(stats.min_bytes(), 1_i32.to_le_bytes()); - assert_eq!(stats.max_bytes(), 7_i32.to_le_bytes()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.min_bytes_opt().unwrap(), 1_i32.to_le_bytes()); + assert_eq!(stats.max_bytes_opt().unwrap(), 7_i32.to_le_bytes()); + assert_eq!(stats.null_count_opt(), Some(0)); assert!(stats.distinct_count().is_none()); drop(write); @@ -2026,9 +2022,15 @@ mod tests { assert_eq!(pages[1].page_type(), PageType::DATA_PAGE); let page_statistics = pages[1].statistics().unwrap(); - assert_eq!(page_statistics.min_bytes(), 1_i32.to_le_bytes()); - assert_eq!(page_statistics.max_bytes(), 7_i32.to_le_bytes()); - assert_eq!(page_statistics.null_count(), 0); + assert_eq!( + page_statistics.min_bytes_opt().unwrap(), + 1_i32.to_le_bytes() + ); + assert_eq!( + page_statistics.max_bytes_opt().unwrap(), + 7_i32.to_le_bytes() + ); + assert_eq!(page_statistics.null_count_opt(), Some(0)); assert!(page_statistics.distinct_count().is_none()); } @@ -2217,13 +2219,12 @@ mod tests { #[test] fn test_bool_statistics() { let stats = statistics_roundtrip::(&[true, false, false, true]); - assert!(stats.has_min_max_set()); // Booleans have an unsigned sort order and so are not compatible // with the deprecated `min` and `max` statistics assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Boolean(stats) = stats { - assert_eq!(stats.min(), &false); - assert_eq!(stats.max(), &true); + assert_eq!(stats.min_opt().unwrap(), &false); + assert_eq!(stats.max_opt().unwrap(), &true); } else { panic!("expecting Statistics::Boolean, got {stats:?}"); } @@ -2232,11 +2233,10 @@ mod tests { #[test] fn test_int32_statistics() { let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min(), &-2); - assert_eq!(stats.max(), &3); + assert_eq!(stats.min_opt().unwrap(), &-2); + assert_eq!(stats.max_opt().unwrap(), &3); } else { panic!("expecting Statistics::Int32, got {stats:?}"); } @@ -2245,11 +2245,10 @@ mod tests { #[test] fn test_int64_statistics() { let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int64(stats) = stats { - assert_eq!(stats.min(), &-2); - assert_eq!(stats.max(), &3); + assert_eq!(stats.min_opt().unwrap(), &-2); + assert_eq!(stats.max_opt().unwrap(), &3); } else { panic!("expecting Statistics::Int64, got {stats:?}"); } @@ -2267,11 +2266,10 @@ mod tests { .collect::>(); let stats = statistics_roundtrip::(&input); - assert!(stats.has_min_max_set()); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { - assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30])); - assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10])); + assert_eq!(stats.min_opt().unwrap(), &Int96::from(vec![0, 20, 30])); + assert_eq!(stats.max_opt().unwrap(), &Int96::from(vec![3, 20, 10])); } else { panic!("expecting Statistics::Int96, got {stats:?}"); } @@ -2280,11 +2278,10 @@ mod tests { #[test] fn test_float_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-2.0); - assert_eq!(stats.max(), &3.0); + assert_eq!(stats.min_opt().unwrap(), &-2.0); + assert_eq!(stats.max_opt().unwrap(), &3.0); } else { panic!("expecting Statistics::Float, got {stats:?}"); } @@ -2293,11 +2290,10 @@ mod tests { #[test] fn test_double_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-2.0); - assert_eq!(stats.max(), &3.0); + assert_eq!(stats.min_opt().unwrap(), &-2.0); + assert_eq!(stats.max_opt().unwrap(), &3.0); } else { panic!("expecting Statistics::Double, got {stats:?}"); } @@ -2312,10 +2308,9 @@ mod tests { let stats = statistics_roundtrip::(&input); assert!(!stats.is_min_max_backwards_compatible()); - assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { - assert_eq!(stats.min(), &ByteArray::from("aaw")); - assert_eq!(stats.max(), &ByteArray::from("zz")); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from("aaw")); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from("zz")); } else { panic!("expecting Statistics::ByteArray, got {stats:?}"); } @@ -2329,13 +2324,12 @@ mod tests { .collect::>(); let stats = statistics_roundtrip::(&input); - assert!(stats.has_min_max_set()); assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::FixedLenByteArray(stats) = stats { let expected_min: FixedLenByteArray = ByteArray::from("aaw ").into(); - assert_eq!(stats.min(), &expected_min); + assert_eq!(stats.min_opt().unwrap(), &expected_min); let expected_max: FixedLenByteArray = ByteArray::from("zz ").into(); - assert_eq!(stats.max(), &expected_max); + assert_eq!(stats.max_opt().unwrap(), &expected_max); } else { panic!("expecting Statistics::FixedLenByteArray, got {stats:?}"); } @@ -2354,10 +2348,15 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(-f16::from_f32(2.0))); - assert_eq!(stats.max(), &ByteArray::from(f16::from_f32(3.0))); + assert_eq!( + stats.min_opt().unwrap(), + &ByteArray::from(-f16::from_f32(2.0)) + ); + assert_eq!( + stats.max_opt().unwrap(), + &ByteArray::from(f16::from_f32(3.0)) + ); } #[test] @@ -2368,10 +2367,12 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); + assert_eq!( + stats.max_opt().unwrap(), + &ByteArray::from(f16::ONE + f16::ONE) + ); } #[test] @@ -2382,10 +2383,12 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); + assert_eq!( + stats.max_opt().unwrap(), + &ByteArray::from(f16::ONE + f16::ONE) + ); } #[test] @@ -2396,10 +2399,12 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::ONE)); - assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::ONE)); + assert_eq!( + stats.max_opt().unwrap(), + &ByteArray::from(f16::ONE + f16::ONE) + ); } #[test] @@ -2410,7 +2415,8 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(!stats.has_min_max_set()); + assert!(stats.min_bytes_opt().is_none()); + assert!(stats.max_bytes_opt().is_none()); assert!(stats.is_min_max_backwards_compatible()); } @@ -2422,10 +2428,9 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2436,10 +2441,9 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } #[test] @@ -2450,10 +2454,9 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(f16::NEG_ZERO)); - assert_eq!(stats.max(), &ByteArray::from(f16::PI)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::PI)); } #[test] @@ -2464,20 +2467,18 @@ mod tests { .collect::>(); let stats = float16_statistics_roundtrip(&input); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); - assert_eq!(stats.min(), &ByteArray::from(-f16::PI)); - assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); + assert_eq!(stats.min_opt().unwrap(), &ByteArray::from(-f16::PI)); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::ZERO)); } #[test] fn test_float_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &1.0); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.min_opt().unwrap(), &1.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2486,11 +2487,10 @@ mod tests { #[test] fn test_float_statistics_nan_start() { let stats = statistics_roundtrip::(&[f32::NAN, 1.0, 2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &1.0); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.min_opt().unwrap(), &1.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2499,7 +2499,8 @@ mod tests { #[test] fn test_float_statistics_nan_only() { let stats = statistics_roundtrip::(&[f32::NAN, f32::NAN]); - assert!(!stats.has_min_max_set()); + assert!(stats.min_bytes_opt().is_none()); + assert!(stats.max_bytes_opt().is_none()); assert!(stats.is_min_max_backwards_compatible()); assert!(matches!(stats, Statistics::Float(_))); } @@ -2507,13 +2508,12 @@ mod tests { #[test] fn test_float_statistics_zero_only() { let stats = statistics_roundtrip::(&[0.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2522,13 +2522,12 @@ mod tests { #[test] fn test_float_statistics_neg_zero_only() { let stats = statistics_roundtrip::(&[-0.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2537,12 +2536,11 @@ mod tests { #[test] fn test_float_statistics_zero_min() { let stats = statistics_roundtrip::(&[0.0, 1.0, f32::NAN, 2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } @@ -2551,12 +2549,11 @@ mod tests { #[test] fn test_float_statistics_neg_zero_max() { let stats = statistics_roundtrip::(&[-0.0, -1.0, f32::NAN, -2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { - assert_eq!(stats.min(), &-2.0); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.min_opt().unwrap(), &-2.0); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Float"); } @@ -2565,11 +2562,10 @@ mod tests { #[test] fn test_double_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f64::NAN, 2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &1.0); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.min_opt().unwrap(), &1.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2578,11 +2574,10 @@ mod tests { #[test] fn test_double_statistics_nan_start() { let stats = statistics_roundtrip::(&[f64::NAN, 1.0, 2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &1.0); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.min_opt().unwrap(), &1.0); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2591,7 +2586,8 @@ mod tests { #[test] fn test_double_statistics_nan_only() { let stats = statistics_roundtrip::(&[f64::NAN, f64::NAN]); - assert!(!stats.has_min_max_set()); + assert!(stats.min_bytes_opt().is_none()); + assert!(stats.max_bytes_opt().is_none()); assert!(matches!(stats, Statistics::Double(_))); assert!(stats.is_min_max_backwards_compatible()); } @@ -2599,13 +2595,12 @@ mod tests { #[test] fn test_double_statistics_zero_only() { let stats = statistics_roundtrip::(&[0.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2614,13 +2609,12 @@ mod tests { #[test] fn test_double_statistics_neg_zero_only() { let stats = statistics_roundtrip::(&[-0.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2629,12 +2623,11 @@ mod tests { #[test] fn test_double_statistics_zero_min() { let stats = statistics_roundtrip::(&[0.0, 1.0, f64::NAN, 2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-0.0); - assert!(stats.min().is_sign_negative()); - assert_eq!(stats.max(), &2.0); + assert_eq!(stats.min_opt().unwrap(), &-0.0); + assert!(stats.min_opt().unwrap().is_sign_negative()); + assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } @@ -2643,12 +2636,11 @@ mod tests { #[test] fn test_double_statistics_neg_zero_max() { let stats = statistics_roundtrip::(&[-0.0, -1.0, f64::NAN, -2.0]); - assert!(stats.has_min_max_set()); assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { - assert_eq!(stats.min(), &-2.0); - assert_eq!(stats.max(), &0.0); - assert!(stats.max().is_sign_positive()); + assert_eq!(stats.min_opt().unwrap(), &-2.0); + assert_eq!(stats.max_opt().unwrap(), &0.0); + assert!(stats.max_opt().unwrap().is_sign_positive()); } else { panic!("expecting Statistics::Double"); } @@ -2705,15 +2697,20 @@ mod tests { } if let Some(stats) = r.metadata.statistics() { - assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::Int32(stats) = stats { // first page is [1,2,3,4] // second page is [-5,2,4,8] // note that we don't increment here, as this is a non BinaryArray type. - assert_eq!(stats.min_bytes(), column_index.min_values[1].as_slice()); - assert_eq!(stats.max_bytes(), column_index.max_values.get(1).unwrap()); + assert_eq!( + stats.min_bytes_opt(), + Some(column_index.min_values[1].as_slice()) + ); + assert_eq!( + stats.max_bytes_opt(), + column_index.max_values.get(1).map(Vec::as_slice) + ); } else { panic!("expecting Statistics::Int32"); } @@ -2760,16 +2757,21 @@ mod tests { assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); if let Some(stats) = r.metadata.statistics() { - assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(stats) = stats { let column_index_min_value = &column_index.min_values[0]; let column_index_max_value = &column_index.max_values[0]; // Column index stats are truncated, while the column chunk's aren't. - assert_ne!(stats.min_bytes(), column_index_min_value.as_slice()); - assert_ne!(stats.max_bytes(), column_index_max_value.as_slice()); + assert_ne!( + stats.min_bytes_opt(), + Some(column_index_min_value.as_slice()) + ); + assert_ne!( + stats.max_bytes_opt(), + Some(column_index_max_value.as_slice()) + ); assert_eq!( column_index_min_value.len(), @@ -2827,8 +2829,7 @@ mod tests { assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); if let Some(stats) = r.metadata.statistics() { - assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { let column_index_min_value = &column_index.min_values[0]; @@ -2840,8 +2841,8 @@ mod tests { assert_eq!("B".as_bytes(), column_index_min_value.as_slice()); assert_eq!("C".as_bytes(), column_index_max_value.as_slice()); - assert_ne!(column_index_min_value, stats.min_bytes()); - assert_ne!(column_index_max_value, stats.max_bytes()); + assert_ne!(column_index_min_value, stats.min_bytes_opt().unwrap()); + assert_ne!(column_index_max_value, stats.max_bytes_opt().unwrap()); } else { panic!("expecting Statistics::FixedLenByteArray"); } @@ -2875,10 +2876,9 @@ mod tests { // ensure bytes weren't truncated for statistics let stats = r.metadata.statistics().unwrap(); - assert!(stats.has_min_max_set()); if let Statistics::FixedLenByteArray(stats) = stats { - let stats_min_bytes = stats.min_bytes(); - let stats_max_bytes = stats.max_bytes(); + let stats_min_bytes = stats.min_bytes_opt().unwrap(); + let stats_max_bytes = stats.max_bytes_opt().unwrap(); assert_eq!(expected_value, stats_min_bytes); assert_eq!(expected_value, stats_max_bytes); } else { @@ -2915,10 +2915,9 @@ mod tests { // ensure bytes weren't truncated for statistics let stats = r.metadata.statistics().unwrap(); - assert!(stats.has_min_max_set()); if let Statistics::FixedLenByteArray(stats) = stats { - let stats_min_bytes = stats.min_bytes(); - let stats_max_bytes = stats.max_bytes(); + let stats_min_bytes = stats.min_bytes_opt().unwrap(); + let stats_max_bytes = stats.max_bytes_opt().unwrap(); assert_eq!(expected_value, stats_min_bytes); assert_eq!(expected_value, stats_max_bytes); } else { @@ -2951,12 +2950,11 @@ mod tests { assert_eq!(1, r.rows_written); let stats = r.metadata.statistics().expect("statistics"); - assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::ByteArray(_stats) = stats { - let min_value = _stats.min(); - let max_value = _stats.max(); + let min_value = _stats.min_opt().unwrap(); + let max_value = _stats.max_opt().unwrap(); assert!(!_stats.min_is_exact()); assert!(!_stats.max_is_exact()); @@ -3004,12 +3002,11 @@ mod tests { assert_eq!(1, r.rows_written); let stats = r.metadata.statistics().expect("statistics"); - assert!(stats.has_min_max_set()); - assert_eq!(stats.null_count(), 0); + assert_eq!(stats.null_count_opt(), Some(0)); assert_eq!(stats.distinct_count(), None); if let Statistics::FixedLenByteArray(_stats) = stats { - let min_value = _stats.min(); - let max_value = _stats.max(); + let min_value = _stats.min_opt().unwrap(); + let max_value = _stats.max_opt().unwrap(); assert!(!_stats.min_is_exact()); assert!(!_stats.max_is_exact()); @@ -3326,7 +3323,8 @@ mod tests { } else { panic!("metadata missing statistics"); }; - assert!(!stats.has_min_max_set()); + assert!(stats.min_bytes_opt().is_none()); + assert!(stats.max_bytes_opt().is_none()); } fn write_multiple_pages( diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index bb822b4ccbe7..ad452267901a 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -183,14 +183,8 @@ impl HeapSize for PageIndex { impl HeapSize for ValueStatistics { fn heap_size(&self) -> usize { - if self.has_min_max_set() { - return self.min().heap_size() + self.max().heap_size(); - } else if self.min_is_exact() { - return self.min().heap_size(); - } else if self.max_is_exact() { - return self.max().heap_size(); - } - 0 + self.min_opt().map(T::heap_size).unwrap_or(0) + + self.max_opt().map(T::heap_size).unwrap_or(0) } } impl HeapSize for bool { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index a18128cfd5ef..f3c73ee81a0e 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1615,7 +1615,7 @@ mod tests { .iter() .map(|column_descr| { ColumnChunkMetaData::builder(column_descr.clone()) - .set_statistics(Statistics::new::(None, None, None, 0, false)) + .set_statistics(Statistics::new::(None, None, None, None, false)) .build() }) .collect::>>() @@ -1653,7 +1653,13 @@ mod tests { .iter() .map(|column_descr| { ColumnChunkMetaData::builder(column_descr.clone()) - .set_statistics(Statistics::new::(Some(0), Some(100), None, 0, false)) + .set_statistics(Statistics::new::( + Some(0), + Some(100), + None, + None, + false, + )) .build() }) .collect::>>() @@ -1667,7 +1673,7 @@ mod tests { let row_group_meta_with_stats = vec![row_group_meta_with_stats]; let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta_with_stats); - let base_expected_size = 2280; + let base_expected_size = 2312; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1695,7 +1701,7 @@ mod tests { ]]), ); - let bigger_expected_size = 2784; + let bigger_expected_size = 2816; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 07e80f7f6998..0a3e51931867 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1544,7 +1544,10 @@ mod tests { fn get_row_group_min_max_bytes(r: &RowGroupMetaData, col_num: usize) -> (&[u8], &[u8]) { let statistics = r.column(col_num).statistics().unwrap(); - (statistics.min_bytes(), statistics.max_bytes()) + ( + statistics.min_bytes_opt().unwrap_or_default(), + statistics.max_bytes_opt().unwrap_or_default(), + ) } #[test] diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index dded9c9c1192..4134685ffcfb 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -24,17 +24,16 @@ //! ```rust //! use parquet::file::statistics::Statistics; //! -//! let stats = Statistics::int32(Some(1), Some(10), None, 3, true); -//! assert_eq!(stats.null_count(), 3); -//! assert!(stats.has_min_max_set()); +//! let stats = Statistics::int32(Some(1), Some(10), None, Some(3), true); +//! assert_eq!(stats.null_count_opt(), Some(3)); //! assert!(stats.is_min_max_deprecated()); //! assert!(stats.min_is_exact()); //! assert!(stats.max_is_exact()); //! //! match stats { //! Statistics::Int32(ref typed) => { -//! assert_eq!(*typed.min(), 1); -//! assert_eq!(*typed.max(), 10); +//! assert_eq!(typed.min_opt(), Some(&1)); +//! assert_eq!(typed.max_opt(), Some(&10)); //! } //! _ => {} //! } @@ -89,7 +88,7 @@ macro_rules! statistics_new_func { min: $vtype, max: $vtype, distinct: Option, - nulls: u64, + nulls: Option, is_deprecated: bool, ) -> Self { Statistics::$stat(ValueStatistics::new( @@ -127,6 +126,8 @@ pub fn from_thrift( Ok(match thrift_stats { Some(stats) => { // Number of nulls recorded, when it is not available, we just mark it as 0. + // TODO this should be `None` if there is no information about NULLS. + // see https://github.com/apache/arrow-rs/pull/6216/files let null_count = stats.null_count.unwrap_or(0); if null_count < 0 { @@ -137,7 +138,7 @@ pub fn from_thrift( } // Generic null count. - let null_count = null_count as u64; + let null_count = Some(null_count as u64); // Generic distinct count (count of distinct values occurring) let distinct_count = stats.distinct_count.map(|value| value as u64); // Whether or not statistics use deprecated min/max fields. @@ -243,14 +244,19 @@ pub fn from_thrift( pub fn to_thrift(stats: Option<&Statistics>) -> Option { let stats = stats?; + // record null counts if greater than zero. + // + // TODO: This should be Some(0) if there are no nulls. + // see https://github.com/apache/arrow-rs/pull/6216/files + let null_count = stats + .null_count_opt() + .map(|value| value as i64) + .filter(|&x| x > 0); + let mut thrift_stats = TStatistics { max: None, min: None, - null_count: if stats.has_nulls() { - Some(stats.null_count() as i64) - } else { - None - }, + null_count, distinct_count: stats.distinct_count().map(|value| value as i64), max_value: None, min_value: None, @@ -259,17 +265,12 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { }; // Get min/max if set. - let (min, max, min_exact, max_exact) = if stats.has_min_max_set() { - ( - Some(stats.min_bytes().to_vec()), - Some(stats.max_bytes().to_vec()), - Some(stats.min_is_exact()), - Some(stats.max_is_exact()), - ) - } else { - (None, None, None, None) - }; - + let (min, max, min_exact, max_exact) = ( + stats.min_bytes_opt().map(|x| x.to_vec()), + stats.max_bytes_opt().map(|x| x.to_vec()), + Some(stats.min_is_exact()), + Some(stats.max_is_exact()), + ); if stats.is_min_max_backwards_compatible() { // Copy to deprecated min, max values for compatibility with older readers thrift_stats.min.clone_from(&min); @@ -321,7 +322,7 @@ impl Statistics { min: Option, max: Option, distinct_count: Option, - null_count: u64, + null_count: Option, is_deprecated: bool, ) -> Self { Self::from(ValueStatistics::new( @@ -385,19 +386,39 @@ impl Statistics { /// Returns number of null values for the column. /// Note that this includes all nulls when column is part of the complex type. + /// + /// Note this API returns 0 if the null count is not available. + #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")] pub fn null_count(&self) -> u64 { - statistics_enum_func![self, null_count] + // 0 to remain consistent behavior prior to `null_count_opt` + self.null_count_opt().unwrap_or(0) } /// Returns `true` if statistics collected any null values, `false` otherwise. + #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")] + #[allow(deprecated)] pub fn has_nulls(&self) -> bool { self.null_count() > 0 } - /// Returns `true` if min value and max value are set. + /// Returns number of null values for the column, if known. + /// Note that this includes all nulls when column is part of the complex type. + /// + /// Note this API returns Some(0) even if the null count was not present + /// in the statistics. + /// See + pub fn null_count_opt(&self) -> Option { + statistics_enum_func![self, null_count_opt] + } + + /// Whether or not min and max values are set. /// Normally both min/max values will be set to `Some(value)` or `None`. + #[deprecated( + since = "53.0.0", + note = "Use `min_bytes_opt` and `max_bytes_opt` methods instead" + )] pub fn has_min_max_set(&self) -> bool { - statistics_enum_func![self, has_min_max_set] + statistics_enum_func![self, _internal_has_min_max_set] } /// Returns `true` if the min value is set, and is an exact min value. @@ -410,16 +431,28 @@ impl Statistics { statistics_enum_func![self, max_is_exact] } + /// Returns slice of bytes that represent min value, if min value is known. + pub fn min_bytes_opt(&self) -> Option<&[u8]> { + statistics_enum_func![self, min_bytes_opt] + } + /// Returns slice of bytes that represent min value. /// Panics if min value is not set. + #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")] pub fn min_bytes(&self) -> &[u8] { - statistics_enum_func![self, min_bytes] + self.min_bytes_opt().unwrap() + } + + /// Returns slice of bytes that represent max value, if max value is known. + pub fn max_bytes_opt(&self) -> Option<&[u8]> { + statistics_enum_func![self, max_bytes_opt] } /// Returns slice of bytes that represent max value. /// Panics if max value is not set. + #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")] pub fn max_bytes(&self) -> &[u8] { - statistics_enum_func![self, max_bytes] + self.max_bytes_opt().unwrap() } /// Returns physical type associated with statistics. @@ -464,7 +497,7 @@ pub struct ValueStatistics { max: Option, // Distinct count could be omitted in some cases distinct_count: Option, - null_count: u64, + null_count: Option, // Whether or not the min or max values are exact, or truncated. is_max_value_exact: bool, @@ -485,7 +518,7 @@ impl ValueStatistics { min: Option, max: Option, distinct_count: Option, - null_count: u64, + null_count: Option, is_min_max_deprecated: bool, ) -> Self { Self { @@ -538,37 +571,68 @@ impl ValueStatistics { /// /// Panics if min value is not set, e.g. all values are `null`. /// Use `has_min_max_set` method to check that. + #[deprecated(since = "53.0.0", note = "Use `min_opt` instead")] pub fn min(&self) -> &T { self.min.as_ref().unwrap() } + /// Returns min value of the statistics, if known. + pub fn min_opt(&self) -> Option<&T> { + self.min.as_ref() + } + /// Returns max value of the statistics. /// /// Panics if max value is not set, e.g. all values are `null`. /// Use `has_min_max_set` method to check that. + #[deprecated(since = "53.0.0", note = "Use `max_opt` instead")] pub fn max(&self) -> &T { self.max.as_ref().unwrap() } + /// Returns max value of the statistics, if known. + pub fn max_opt(&self) -> Option<&T> { + self.max.as_ref() + } + + /// Returns min value as bytes of the statistics, if min value is known. + pub fn min_bytes_opt(&self) -> Option<&[u8]> { + self.min_opt().map(AsBytes::as_bytes) + } + /// Returns min value as bytes of the statistics. /// /// Panics if min value is not set, use `has_min_max_set` method to check /// if values are set. + #[deprecated(since = "53.0.0", note = "Use `min_bytes_opt` instead")] pub fn min_bytes(&self) -> &[u8] { - self.min().as_bytes() + self.min_bytes_opt().unwrap() + } + + /// Returns max value as bytes of the statistics, if max value is known. + pub fn max_bytes_opt(&self) -> Option<&[u8]> { + self.max_opt().map(AsBytes::as_bytes) } /// Returns max value as bytes of the statistics. /// /// Panics if max value is not set, use `has_min_max_set` method to check /// if values are set. + #[deprecated(since = "53.0.0", note = "Use `max_bytes_opt` instead")] pub fn max_bytes(&self) -> &[u8] { - self.max().as_bytes() + self.max_bytes_opt().unwrap() } /// Whether or not min and max values are set. /// Normally both min/max values will be set to `Some(value)` or `None`. + #[deprecated(since = "53.0.0", note = "Use `min_opt` and `max_opt` methods instead")] pub fn has_min_max_set(&self) -> bool { + self._internal_has_min_max_set() + } + + /// Whether or not min and max values are set. + /// Normally both min/max values will be set to `Some(value)` or `None`. + pub(crate) fn _internal_has_min_max_set(&self) -> bool { self.min.is_some() && self.max.is_some() } @@ -587,8 +651,16 @@ impl ValueStatistics { self.distinct_count } - /// Returns null count. + /// Returns number of null values for the column. + /// Note that this includes all nulls when column is part of the complex type. + #[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")] pub fn null_count(&self) -> u64 { + // 0 to remain consistent behavior prior to `null_count_opt` + self.null_count_opt().unwrap_or(0) + } + + /// Returns null count. + pub fn null_count_opt(&self) -> Option { self.null_count } @@ -630,7 +702,11 @@ impl fmt::Display for ValueStatistics { Some(value) => write!(f, "{value}")?, None => write!(f, "N/A")?, } - write!(f, ", null_count: {}", self.null_count)?; + write!(f, ", null_count: ")?; + match self.null_count { + Some(value) => write!(f, "{value}")?, + None => write!(f, "N/A")?, + } write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?; write!(f, ", max_value_exact: {}", self.is_max_value_exact)?; write!(f, ", min_value_exact: {}", self.is_min_value_exact)?; @@ -642,7 +718,7 @@ impl fmt::Debug for ValueStatistics { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \ + "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {:?}, \ min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}", self.min, self.max, @@ -662,21 +738,19 @@ mod tests { #[test] fn test_statistics_min_max_bytes() { - let stats = Statistics::int32(Some(-123), Some(234), None, 1, false); - assert!(stats.has_min_max_set()); - assert_eq!(stats.min_bytes(), (-123).as_bytes()); - assert_eq!(stats.max_bytes(), 234.as_bytes()); + let stats = Statistics::int32(Some(-123), Some(234), None, Some(1), false); + assert_eq!(stats.min_bytes_opt(), Some((-123).as_bytes())); + assert_eq!(stats.max_bytes_opt(), Some(234.as_bytes())); let stats = Statistics::byte_array( Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![3, 4, 5])), None, - 1, + Some(1), true, ); - assert!(stats.has_min_max_set()); - assert_eq!(stats.min_bytes(), &[1, 2, 3]); - assert_eq!(stats.max_bytes(), &[3, 4, 5]); + assert_eq!(stats.min_bytes_opt().unwrap(), &[1, 2, 3]); + assert_eq!(stats.max_bytes_opt().unwrap(), &[3, 4, 5]); } #[test] @@ -704,30 +778,30 @@ mod tests { #[test] fn test_statistics_debug() { - let stats = Statistics::int32(Some(1), Some(12), None, 12, true); + let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true); assert_eq!( format!("{stats:?}"), - "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \ + "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: Some(12), \ min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})" ); - let stats = Statistics::int32(None, None, None, 7, false); + let stats = Statistics::int32(None, None, None, Some(7), false); assert_eq!( format!("{stats:?}"), - "Int32({min: None, max: None, distinct_count: None, null_count: 7, \ + "Int32({min: None, max: None, distinct_count: None, null_count: Some(7), \ min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})" ) } #[test] fn test_statistics_display() { - let stats = Statistics::int32(Some(1), Some(12), None, 12, true); + let stats = Statistics::int32(Some(1), Some(12), None, Some(12), true); assert_eq!( format!("{stats}"), "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}" ); - let stats = Statistics::int64(None, None, None, 7, false); + let stats = Statistics::int64(None, None, None, Some(7), false); assert_eq!( format!("{stats}"), "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \ @@ -738,7 +812,7 @@ mod tests { Some(Int96::from(vec![1, 0, 0])), Some(Int96::from(vec![2, 3, 4])), None, - 3, + Some(3), true, ); assert_eq!( @@ -752,7 +826,7 @@ mod tests { Some(ByteArray::from(vec![1u8])), Some(ByteArray::from(vec![2u8])), Some(5), - 7, + Some(7), false, ) .with_max_is_exact(false) @@ -766,22 +840,22 @@ mod tests { #[test] fn test_statistics_partial_eq() { - let expected = Statistics::int32(Some(12), Some(45), None, 11, true); + let expected = Statistics::int32(Some(12), Some(45), None, Some(11), true); - assert!(Statistics::int32(Some(12), Some(45), None, 11, true) == expected); - assert!(Statistics::int32(Some(11), Some(45), None, 11, true) != expected); - assert!(Statistics::int32(Some(12), Some(44), None, 11, true) != expected); - assert!(Statistics::int32(Some(12), Some(45), None, 23, true) != expected); - assert!(Statistics::int32(Some(12), Some(45), None, 11, false) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, Some(11), true) == expected); + assert!(Statistics::int32(Some(11), Some(45), None, Some(11), true) != expected); + assert!(Statistics::int32(Some(12), Some(44), None, Some(11), true) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, Some(23), true) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, Some(11), false) != expected); assert!( - Statistics::int32(Some(12), Some(45), None, 11, false) - != Statistics::int64(Some(12), Some(45), None, 11, false) + Statistics::int32(Some(12), Some(45), None, Some(11), false) + != Statistics::int64(Some(12), Some(45), None, Some(11), false) ); assert!( - Statistics::boolean(Some(false), Some(true), None, 0, true) - != Statistics::double(Some(1.2), Some(4.5), None, 0, true) + Statistics::boolean(Some(false), Some(true), None, None, true) + != Statistics::double(Some(1.2), Some(4.5), None, None, true) ); assert!( @@ -789,13 +863,13 @@ mod tests { Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![1, 2, 3])), None, - 0, + None, true ) != Statistics::fixed_len_byte_array( Some(ByteArray::from(vec![1, 2, 3]).into()), Some(ByteArray::from(vec![1, 2, 3]).into()), None, - 0, + None, true, ) ); @@ -805,14 +879,14 @@ mod tests { Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![1, 2, 3])), None, - 0, + None, true, ) != Statistics::ByteArray( ValueStatistics::new( Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![1, 2, 3])), None, - 0, + None, true, ) .with_max_is_exact(false) @@ -824,14 +898,14 @@ mod tests { Some(FixedLenByteArray::from(vec![1, 2, 3])), Some(FixedLenByteArray::from(vec![1, 2, 3])), None, - 0, + None, true, ) != Statistics::FixedLenByteArray( ValueStatistics::new( Some(FixedLenByteArray::from(vec![1, 2, 3])), Some(FixedLenByteArray::from(vec![1, 2, 3])), None, - 0, + None, true, ) .with_min_is_exact(false) @@ -848,45 +922,123 @@ mod tests { assert_eq!(from_thrift(tpe, thrift_stats).unwrap(), Some(stats)); } - check_stats(Statistics::boolean(Some(false), Some(true), None, 7, true)); - check_stats(Statistics::boolean(Some(false), Some(true), None, 7, true)); - check_stats(Statistics::boolean(Some(false), Some(true), None, 0, false)); - check_stats(Statistics::boolean(Some(true), Some(true), None, 7, true)); - check_stats(Statistics::boolean(Some(false), Some(false), None, 7, true)); - check_stats(Statistics::boolean(None, None, None, 7, true)); + check_stats(Statistics::boolean( + Some(false), + Some(true), + None, + Some(7), + true, + )); + check_stats(Statistics::boolean( + Some(false), + Some(true), + None, + Some(7), + true, + )); + check_stats(Statistics::boolean( + Some(false), + Some(true), + None, + Some(0), + false, + )); + check_stats(Statistics::boolean( + Some(true), + Some(true), + None, + Some(7), + true, + )); + check_stats(Statistics::boolean( + Some(false), + Some(false), + None, + Some(7), + true, + )); + check_stats(Statistics::boolean(None, None, None, Some(7), true)); - check_stats(Statistics::int32(Some(-100), Some(500), None, 7, true)); - check_stats(Statistics::int32(Some(-100), Some(500), None, 0, false)); - check_stats(Statistics::int32(None, None, None, 7, true)); + check_stats(Statistics::int32( + Some(-100), + Some(500), + None, + Some(7), + true, + )); + check_stats(Statistics::int32( + Some(-100), + Some(500), + None, + Some(0), + false, + )); + check_stats(Statistics::int32(None, None, None, Some(7), true)); - check_stats(Statistics::int64(Some(-100), Some(200), None, 7, true)); - check_stats(Statistics::int64(Some(-100), Some(200), None, 0, false)); - check_stats(Statistics::int64(None, None, None, 7, true)); + check_stats(Statistics::int64( + Some(-100), + Some(200), + None, + Some(7), + true, + )); + check_stats(Statistics::int64( + Some(-100), + Some(200), + None, + Some(0), + false, + )); + check_stats(Statistics::int64(None, None, None, Some(7), true)); - check_stats(Statistics::float(Some(1.2), Some(3.4), None, 7, true)); - check_stats(Statistics::float(Some(1.2), Some(3.4), None, 0, false)); - check_stats(Statistics::float(None, None, None, 7, true)); + check_stats(Statistics::float(Some(1.2), Some(3.4), None, Some(7), true)); + check_stats(Statistics::float( + Some(1.2), + Some(3.4), + None, + Some(0), + false, + )); + check_stats(Statistics::float(None, None, None, Some(7), true)); - check_stats(Statistics::double(Some(1.2), Some(3.4), None, 7, true)); - check_stats(Statistics::double(Some(1.2), Some(3.4), None, 0, false)); - check_stats(Statistics::double(None, None, None, 7, true)); + check_stats(Statistics::double( + Some(1.2), + Some(3.4), + None, + Some(7), + true, + )); + check_stats(Statistics::double( + Some(1.2), + Some(3.4), + None, + Some(0), + false, + )); + check_stats(Statistics::double(None, None, None, Some(7), true)); check_stats(Statistics::byte_array( Some(ByteArray::from(vec![1, 2, 3])), Some(ByteArray::from(vec![3, 4, 5])), None, - 7, + Some(7), true, )); - check_stats(Statistics::byte_array(None, None, None, 7, true)); + check_stats(Statistics::byte_array(None, None, None, Some(7), true)); check_stats(Statistics::fixed_len_byte_array( Some(ByteArray::from(vec![1, 2, 3]).into()), Some(ByteArray::from(vec![3, 4, 5]).into()), None, - 7, + Some(7), + true, + )); + check_stats(Statistics::fixed_len_byte_array( + None, + None, + None, + Some(7), true, )); - check_stats(Statistics::fixed_len_byte_array(None, None, None, 7, true)); } } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 23ac3556bdf3..a23a2a3327e7 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1122,7 +1122,7 @@ mod tests { encoding: Encoding::DELTA_BINARY_PACKED, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, - statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + statistics: Some(Statistics::int32(Some(1), Some(3), None, Some(7), true)), }, Page::DataPageV2 { buf: Bytes::from(vec![4; 128]), @@ -1133,7 +1133,7 @@ mod tests { def_levels_byte_len: 24, rep_levels_byte_len: 32, is_compressed: false, - statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + statistics: Some(Statistics::int32(Some(1), Some(3), None, Some(7), true)), }, ]; @@ -1156,7 +1156,7 @@ mod tests { encoding: Encoding::DELTA_BINARY_PACKED, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, - statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + statistics: Some(Statistics::int32(Some(1), Some(3), None, Some(7), true)), }, Page::DataPageV2 { buf: Bytes::from(vec![4; 128]),