
remove unused max_statistics_size field
alamb committed Dec 17, 2024
1 parent 3d6002a commit fb080a2
Showing 9 changed files with 36 additions and 184 deletions.
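
This commit removes the unused max_statistics_size writer option end to end: the ParquetOptions and ParquetColumnOptions config fields, their conversion into parquet WriterProperties, the protobuf messages, and the generated serde code. For orientation, here is a minimal sketch (not DataFusion code; values are illustrative) of the statistics-related writer settings that remain after the change, expressed against the parquet crate's WriterProperties builder that these options are converted into:

    use parquet::basic::Compression;
    use parquet::file::properties::{EnabledStatistics, WriterProperties};
    use parquet::schema::types::ColumnPath;

    fn remaining_statistics_knobs() -> WriterProperties {
        WriterProperties::builder()
            // statistics granularity is still configurable: None, Chunk, or Page
            .set_statistics_enabled(EnabledStatistics::Page)
            // other writer options that survive the removal
            .set_max_row_group_size(1024 * 1024)
            .set_compression(Compression::SNAPPY)
            // per-column overrides remain for bloom filters
            .set_column_bloom_filter_ndv(ColumnPath::from("col1"), 100)
            .build()
        // set_max_statistics_size / set_column_max_statistics_size are no longer called
    }
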
8 changes: 0 additions & 8 deletions datafusion/common/src/config.rs
@@ -446,10 +446,6 @@ config_namespace! {
/// default parquet writer setting
pub statistics_enabled: Option<String>, default = Some("page".into())

/// (writing) Sets max statistics size for any column. If NULL, uses
/// default parquet writer setting
pub max_statistics_size: Option<usize>, default = Some(4096)

/// (writing) Target maximum number of rows in each row group (defaults to 1M
/// rows). Writing larger row groups requires more memory to write, but
/// can get better compression and be faster to read.
@@ -1621,10 +1617,6 @@ config_namespace_with_hashmap! {
/// Sets bloom filter number of distinct values. If NULL, uses
/// default parquet options
pub bloom_filter_ndv: Option<u64>, default = None

/// Sets max statistics size for the column path. If NULL, uses
/// default parquet options
pub max_statistics_size: Option<usize>, default = None
}
}

15 changes: 1 addition & 14 deletions datafusion/common/src/file_options/parquet_writer.rs
@@ -26,7 +26,7 @@ use parquet::{
basic::{BrotliLevel, GzipLevel, ZstdLevel},
file::properties::{
EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion,
DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED,
DEFAULT_STATISTICS_ENABLED,
},
format::KeyValue,
schema::types::ColumnPath,
@@ -129,11 +129,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
builder =
builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
}

if let Some(max_statistics_size) = options.max_statistics_size {
builder =
builder.set_column_max_statistics_size(path, max_statistics_size);
}
}

Ok(builder)
@@ -154,7 +149,6 @@ impl ParquetOptions {
dictionary_enabled,
dictionary_page_size_limit,
statistics_enabled,
max_statistics_size,
max_row_group_size,
created_by,
column_index_truncate_length,
@@ -190,9 +184,6 @@ impl ParquetOptions {
.and_then(|s| parse_statistics_string(s).ok())
.unwrap_or(DEFAULT_STATISTICS_ENABLED),
)
.set_max_statistics_size(
max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
)
.set_max_row_group_size(*max_row_group_size)
.set_created_by(created_by.clone())
.set_column_index_truncate_length(*column_index_truncate_length)
@@ -395,7 +386,6 @@ mod tests {
compression: Some("zstd(22)".into()),
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
statistics_enabled: Some("none".into()),
max_statistics_size: Some(72),
encoding: Some("RLE".into()),
bloom_filter_enabled: Some(true),
bloom_filter_fpp: Some(0.72),
@@ -419,7 +409,6 @@
dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)),
dictionary_page_size_limit: 42,
statistics_enabled: Some("chunk".into()),
max_statistics_size: Some(42),
max_row_group_size: 42,
created_by: "wordy".into(),
column_index_truncate_length: Some(42),
@@ -473,7 +462,6 @@
),
bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
max_statistics_size: Some(props.max_statistics_size(&col)),
}
}

@@ -523,7 +511,6 @@
compression: default_col_props.compression,
dictionary_enabled: default_col_props.dictionary_enabled,
statistics_enabled: default_col_props.statistics_enabled,
max_statistics_size: default_col_props.max_statistics_size,
bloom_filter_on_write: default_col_props
.bloom_filter_enabled
.unwrap_or_default(),
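
In the conversion above, per-column overrides are still applied through parquet's ColumnPath-keyed builder methods; only the set_column_max_statistics_size branch goes away. A simplified sketch of the remaining pattern, using hypothetical struct and field names rather than the actual DataFusion types:

    use parquet::file::properties::WriterPropertiesBuilder;
    use parquet::schema::types::ColumnPath;

    // Hypothetical per-column options; the real struct is ParquetColumnOptions.
    struct ColumnOptions {
        bloom_filter_fpp: Option<f64>,
        bloom_filter_ndv: Option<u64>,
    }

    fn apply_column_options(
        mut builder: WriterPropertiesBuilder,
        column: &str,
        options: &ColumnOptions,
    ) -> WriterPropertiesBuilder {
        let path = ColumnPath::from(column);
        if let Some(fpp) = options.bloom_filter_fpp {
            builder = builder.set_column_bloom_filter_fpp(path.clone(), fpp);
        }
        if let Some(ndv) = options.bloom_filter_ndv {
            builder = builder.set_column_bloom_filter_ndv(path, ndv);
        }
        // Before this commit there was a third branch here:
        //     builder = builder.set_column_max_statistics_size(path, max_statistics_size);
        builder
    }
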
8 changes: 0 additions & 8 deletions datafusion/proto-common/proto/datafusion_common.proto
@@ -472,10 +472,6 @@ message ParquetColumnOptions {
oneof bloom_filter_ndv_opt {
uint64 bloom_filter_ndv = 7;
}

oneof max_statistics_size_opt {
uint32 max_statistics_size = 8;
}
}

message ParquetOptions {
@@ -513,10 +509,6 @@ message ParquetOptions {
string statistics_enabled = 13;
}

oneof max_statistics_size_opt {
uint64 max_statistics_size = 14;
}

oneof column_index_truncate_length_opt {
uint64 column_index_truncate_length = 17;
}
12 changes: 0 additions & 12 deletions datafusion/proto-common/src/from_proto/mod.rs
@@ -924,12 +924,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v),
})
.unwrap_or(None),
max_statistics_size: value
.max_statistics_size_opt.as_ref()
.map(|opt| match opt {
protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize),
})
.unwrap_or(None),
max_row_group_size: value.max_row_group_size as usize,
created_by: value.created_by.clone(),
column_index_truncate_length: value
@@ -984,12 +978,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions {
protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v),
})
.unwrap_or(None),
max_statistics_size: value
.max_statistics_size_opt
.map(|opt| match opt {
protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize),
})
.unwrap_or(None),
encoding: value
.encoding_opt.clone()
.map(|opt| match opt {
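
The removed branches above all follow the same shape: the proto wraps each optional scalar in a single-variant oneof, prost surfaces that as an Option of a one-variant enum, and the from_proto conversion flattens it back to a plain Option. A self-contained illustration with hypothetical stand-in types (not the generated protobuf types), mirroring the deleted max_statistics_size branch:

    // Hypothetical stand-in types; not the prost-generated protobuf structs.
    #[derive(Clone, Copy)]
    enum MaxStatisticsSizeOpt {
        MaxStatisticsSize(u32),
    }

    struct ProtoColumnOptions {
        max_statistics_size_opt: Option<MaxStatisticsSizeOpt>,
    }

    // Mirrors the deleted from_proto branch: flatten Option<one-variant enum> to Option<usize>.
    fn max_statistics_size(value: &ProtoColumnOptions) -> Option<usize> {
        value
            .max_statistics_size_opt
            .map(|opt| match opt {
                MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize),
            })
            .unwrap_or(None)
    }
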
46 changes: 0 additions & 46 deletions datafusion/proto-common/src/generated/pbjson.rs
@@ -4448,9 +4448,6 @@ impl serde::Serialize for ParquetColumnOptions {
if self.bloom_filter_ndv_opt.is_some() {
len += 1;
}
if self.max_statistics_size_opt.is_some() {
len += 1;
}
let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?;
if let Some(v) = self.bloom_filter_enabled_opt.as_ref() {
match v {
@@ -4503,13 +4500,6 @@
}
}
}
if let Some(v) = self.max_statistics_size_opt.as_ref() {
match v {
parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => {
struct_ser.serialize_field("maxStatisticsSize", v)?;
}
}
}
struct_ser.end()
}
}
@@ -4532,8 +4522,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
"bloomFilterFpp",
"bloom_filter_ndv",
"bloomFilterNdv",
"max_statistics_size",
"maxStatisticsSize",
];

#[allow(clippy::enum_variant_names)]
@@ -4545,7 +4533,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
StatisticsEnabled,
BloomFilterFpp,
BloomFilterNdv,
MaxStatisticsSize,
}
impl<'de> serde::Deserialize<'de> for GeneratedField {
fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error>
@@ -4574,7 +4561,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
"statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled),
"bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp),
"bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv),
"maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize),
_ => Err(serde::de::Error::unknown_field(value, FIELDS)),
}
}
@@ -4601,7 +4587,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
let mut statistics_enabled_opt__ = None;
let mut bloom_filter_fpp_opt__ = None;
let mut bloom_filter_ndv_opt__ = None;
let mut max_statistics_size_opt__ = None;
while let Some(k) = map_.next_key()? {
match k {
GeneratedField::BloomFilterEnabled => {
@@ -4646,12 +4631,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
}
bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0));
}
GeneratedField::MaxStatisticsSize => {
if max_statistics_size_opt__.is_some() {
return Err(serde::de::Error::duplicate_field("maxStatisticsSize"));
}
max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0));
}
}
}
Ok(ParquetColumnOptions {
@@ -4662,7 +4641,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions {
statistics_enabled_opt: statistics_enabled_opt__,
bloom_filter_fpp_opt: bloom_filter_fpp_opt__,
bloom_filter_ndv_opt: bloom_filter_ndv_opt__,
max_statistics_size_opt: max_statistics_size_opt__,
})
}
}
@@ -4946,9 +4924,6 @@ impl serde::Serialize for ParquetOptions {
if self.statistics_enabled_opt.is_some() {
len += 1;
}
if self.max_statistics_size_opt.is_some() {
len += 1;
}
if self.column_index_truncate_length_opt.is_some() {
len += 1;
}
@@ -5063,15 +5038,6 @@
}
}
}
if let Some(v) = self.max_statistics_size_opt.as_ref() {
match v {
parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => {
#[allow(clippy::needless_borrow)]
#[allow(clippy::needless_borrows_for_generic_args)]
struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?;
}
}
}
if let Some(v) = self.column_index_truncate_length_opt.as_ref() {
match v {
parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => {
@@ -5158,8 +5124,6 @@
"dictionaryEnabled",
"statistics_enabled",
"statisticsEnabled",
"max_statistics_size",
"maxStatisticsSize",
"column_index_truncate_length",
"columnIndexTruncateLength",
"encoding",
@@ -5194,7 +5158,6 @@
Compression,
DictionaryEnabled,
StatisticsEnabled,
MaxStatisticsSize,
ColumnIndexTruncateLength,
Encoding,
BloomFilterFpp,
@@ -5243,7 +5206,6 @@
"compression" => Ok(GeneratedField::Compression),
"dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled),
"statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled),
"maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize),
"columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength),
"encoding" => Ok(GeneratedField::Encoding),
"bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp),
@@ -5290,7 +5252,6 @@
let mut compression_opt__ = None;
let mut dictionary_enabled_opt__ = None;
let mut statistics_enabled_opt__ = None;
let mut max_statistics_size_opt__ = None;
let mut column_index_truncate_length_opt__ = None;
let mut encoding_opt__ = None;
let mut bloom_filter_fpp_opt__ = None;
@@ -5449,12 +5410,6 @@
}
statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled);
}
GeneratedField::MaxStatisticsSize => {
if max_statistics_size_opt__.is_some() {
return Err(serde::de::Error::duplicate_field("maxStatisticsSize"));
}
max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0));
}
GeneratedField::ColumnIndexTruncateLength => {
if column_index_truncate_length_opt__.is_some() {
return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength"));
@@ -5505,7 +5460,6 @@
compression_opt: compression_opt__,
dictionary_enabled_opt: dictionary_enabled_opt__,
statistics_enabled_opt: statistics_enabled_opt__,
max_statistics_size_opt: max_statistics_size_opt__,
column_index_truncate_length_opt: column_index_truncate_length_opt__,
encoding_opt: encoding_opt__,
bloom_filter_fpp_opt: bloom_filter_fpp_opt__,
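
The pbjson-generated code edited above follows one fixed shape: Serialize first counts which optional fields are present so serialize_struct gets the right length, then writes each one under its camelCase JSON name; Deserialize mirrors this with a GeneratedField enum plus duplicate-field guards. A trimmed, hypothetical version of the serialize side (not the actual generated code):

    use serde::ser::{Serialize, SerializeStruct, Serializer};

    // Hypothetical options struct with a single optional field.
    struct Options {
        bloom_filter_ndv_opt: Option<u64>,
    }

    impl Serialize for Options {
        fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
            // Count populated fields first so serialize_struct gets the right length.
            let mut len = 0;
            if self.bloom_filter_ndv_opt.is_some() {
                len += 1;
            }
            let mut s = serializer.serialize_struct("Options", len)?;
            if let Some(v) = self.bloom_filter_ndv_opt.as_ref() {
                // JSON names are camelCase versions of the proto field names.
                s.serialize_field("bloomFilterNdv", v)?;
            }
            s.end()
        }
    }
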
(Diffs for the remaining four changed files did not load and are not shown here.)
