From e5d9816dfb7fc776732526acac3905fcb2764ef4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Aug 2024 11:38:06 -0400 Subject: [PATCH] Add `Statistics::distinct_count_opt` and deprecate `Statistics::distinct_count` (#6259) --- parquet/src/column/writer/mod.rs | 18 +++++++++--------- parquet/src/file/statistics.rs | 9 ++++++++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8ea2878317e8..606ad462d143 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1842,7 +1842,7 @@ mod tests { assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { assert_eq!(stats.null_count_opt(), Some(0)); - assert_eq!(stats.distinct_count(), None); + assert_eq!(stats.distinct_count_opt(), None); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &1); assert_eq!(stats.max_opt().unwrap(), &4); @@ -1968,7 +1968,7 @@ mod tests { assert_eq!(metadata.dictionary_page_offset(), Some(0)); if let Some(stats) = metadata.statistics() { assert_eq!(stats.null_count_opt(), Some(0)); - assert_eq!(stats.distinct_count().unwrap_or(0), 55); + assert_eq!(stats.distinct_count_opt().unwrap_or(0), 55); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min_opt().unwrap(), &-17); assert_eq!(stats.max_opt().unwrap(), &9000); @@ -1999,7 +1999,7 @@ mod tests { assert_eq!(stats.min_bytes_opt().unwrap(), 1_i32.to_le_bytes()); assert_eq!(stats.max_bytes_opt().unwrap(), 7_i32.to_le_bytes()); assert_eq!(stats.null_count_opt(), Some(0)); - assert!(stats.distinct_count().is_none()); + assert!(stats.distinct_count_opt().is_none()); drop(write); @@ -2031,7 +2031,7 @@ mod tests { 7_i32.to_le_bytes() ); assert_eq!(page_statistics.null_count_opt(), Some(0)); - assert!(page_statistics.distinct_count().is_none()); + assert!(page_statistics.distinct_count_opt().is_none()); } #[test] @@ -2698,7 +2698,7 @@ mod tests { if 
let Some(stats) = r.metadata.statistics() { assert_eq!(stats.null_count_opt(), Some(0)); - assert_eq!(stats.distinct_count(), None); + assert_eq!(stats.distinct_count_opt(), None); if let Statistics::Int32(stats) = stats { // first page is [1,2,3,4] // second page is [-5,2,4,8] @@ -2758,7 +2758,7 @@ mod tests { if let Some(stats) = r.metadata.statistics() { assert_eq!(stats.null_count_opt(), Some(0)); - assert_eq!(stats.distinct_count(), None); + assert_eq!(stats.distinct_count_opt(), None); if let Statistics::FixedLenByteArray(stats) = stats { let column_index_min_value = &column_index.min_values[0]; let column_index_max_value = &column_index.max_values[0]; @@ -2830,7 +2830,7 @@ mod tests { if let Some(stats) = r.metadata.statistics() { assert_eq!(stats.null_count_opt(), Some(0)); - assert_eq!(stats.distinct_count(), None); + assert_eq!(stats.distinct_count_opt(), None); if let Statistics::FixedLenByteArray(_stats) = stats { let column_index_min_value = &column_index.min_values[0]; let column_index_max_value = &column_index.max_values[0]; @@ -2951,7 +2951,7 @@ mod tests { let stats = r.metadata.statistics().expect("statistics"); assert_eq!(stats.null_count_opt(), Some(0)); - assert_eq!(stats.distinct_count(), None); + assert_eq!(stats.distinct_count_opt(), None); if let Statistics::ByteArray(_stats) = stats { let min_value = _stats.min_opt().unwrap(); let max_value = _stats.max_opt().unwrap(); @@ -3003,7 +3003,7 @@ mod tests { let stats = r.metadata.statistics().expect("statistics"); assert_eq!(stats.null_count_opt(), Some(0)); - assert_eq!(stats.distinct_count(), None); + assert_eq!(stats.distinct_count_opt(), None); if let Statistics::FixedLenByteArray(_stats) = stats { let min_value = _stats.min_opt().unwrap(); let max_value = _stats.max_opt().unwrap(); diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 4134685ffcfb..680c75d6b2e5 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -257,7 +257,7 @@ 
pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> { max: None, min: None, null_count, - distinct_count: stats.distinct_count().map(|value| value as i64), + distinct_count: stats.distinct_count_opt().map(|value| value as i64), max_value: None, min_value: None, is_max_value_exact: None, @@ -380,7 +380,14 @@ impl Statistics { /// Returns optional value of number of distinct values occurring. /// When it is `None`, the value should be ignored. + #[deprecated(since = "53.0.0", note = "Use `distinct_count_opt` method instead")] pub fn distinct_count(&self) -> Option<u64> { + self.distinct_count_opt() + } + + /// Returns optional value of number of distinct values occurring. + /// When it is `None`, the value should be ignored. + pub fn distinct_count_opt(&self) -> Option<u64> { statistics_enum_func![self, distinct_count] }