Skip to content

Commit

Permalink
Ensure statistic defaults in parquet writers are in sync (apache#11656)
Browse files Browse the repository at this point in the history
* test(11367): update tests to indicate that the different compression setting is expected

* chore(11367): update default settings for statistics_enabled

* test(11367): fix test cases, as we set the same variable for from_datafusion_defaults twice (in both the original DF options and in the builder too); should only set it once

* test(11367): fix bug in the test case, should have set the arrow-rs to true (default is false)

* test(11367): fix test for fpp and ndv as defaults, when bloom filter turned on

* test(11367): update readme and sqllogictests for updated default config
  • Loading branch information
wiedld authored Jul 27, 2024
1 parent 42732b5 commit a598739
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 91 deletions.
2 changes: 1 addition & 1 deletion datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ config_namespace! {
/// Valid values are: "none", "chunk", and "page"
/// These values are not case sensitive. If NULL, uses
/// default parquet writer setting
pub statistics_enabled: Option<String>, default = None
pub statistics_enabled: Option<String>, default = Some("page".into())

/// (writing) Sets max statistics size for any column. If NULL, uses
/// default parquet writer setting
Expand Down
118 changes: 31 additions & 87 deletions datafusion/common/src/file_options/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ mod tests {
ColumnOptions {
compression: Some("zstd(22)".into()),
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
statistics_enabled: Some("page".into()),
statistics_enabled: Some("none".into()),
max_statistics_size: Some(72),
encoding: Some("RLE".into()),
bloom_filter_enabled: Some(true),
Expand Down Expand Up @@ -614,23 +614,7 @@ mod tests {
"should indicate that table_parquet_opts defaults came from datafusion",
);

// Expected: the remaining should match
let same_created_by = default_table_writer_opts.global.created_by.clone();
let mut from_extern_parquet =
session_config_from_writer_props(&default_writer_props);
from_extern_parquet.global.created_by = same_created_by;
// TODO: the remaining defaults do not match!
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
default_table_writer_opts,
from_extern_parquet,
"the default writer_props should have the same configuration as the session's default TableParquetOptions",
);

// Below here itemizes how the defaults **should** match, but do not.

// TODO: compression defaults do not match
// refer to https://github.com/apache/datafusion/issues/11367
// Expected: the datafusion default compression is different from arrow-rs's parquet
assert_eq!(
default_writer_props.compression(&"default".into()),
Compression::UNCOMPRESSED,
Expand All @@ -644,35 +628,13 @@ mod tests {
"datafusion's default is zstd"
);

// datafusion's `None` for Option<String> => becomes parquet's EnabledStatistics::Page
// TODO: should this be changed?
// refer to https://github.com/apache/datafusion/issues/11367
assert_eq!(
default_writer_props.statistics_enabled(&"default".into()),
EnabledStatistics::Page,
"extern parquet's default is page"
);
assert_eq!(
default_table_writer_opts.global.statistics_enabled, None,
"datafusion's has no default"
);
assert_eq!(
from_datafusion_defaults.statistics_enabled(&"default".into()),
EnabledStatistics::Page,
"should see the extern parquet's default over-riding datafusion's None",
);

// Confirm all other settings are equal.
// First resolve the known discrepancies, (set as the same).
// TODO: once we fix the above mis-matches, we should be able to remove this.
// Expected: the remaining should match
let same_created_by = default_table_writer_opts.global.created_by.clone();
let mut from_extern_parquet =
session_config_from_writer_props(&default_writer_props);
from_extern_parquet.global.created_by = same_created_by;
from_extern_parquet.global.compression = Some("zstd(3)".into());
from_extern_parquet.global.statistics_enabled = None;

// Expected: the remaining should match
let same_created_by = default_table_writer_opts.global.created_by.clone(); // we expect these to be different
from_extern_parquet.global.created_by = same_created_by; // we expect these to be different
assert_eq!(
default_table_writer_opts,
from_extern_parquet,
Expand All @@ -685,31 +647,25 @@ mod tests {
// the TableParquetOptions::default, with only the bloom filter turned on
let mut default_table_writer_opts = TableParquetOptions::default();
default_table_writer_opts.global.bloom_filter_on_write = true;

// the WriterProperties::default, with only the bloom filter turned on
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.set_bloom_filter_enabled(true)
.build();

// TODO: should have same behavior in either.
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
// the WriterProperties::default, with only the bloom filter turned on
let default_writer_props = WriterProperties::builder()
.set_bloom_filter_enabled(true)
.build();

assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
"parquet and datafusion props, will not have the same bloom filter props",
"parquet and datafusion props, should have the same bloom filter props",
);
assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
None,
"extern parquet's default remains None"
);
assert_eq!(
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
Some(&BloomFilterProperties::default()),
"datafusion's has BloomFilterProperties::default",
"should use the default bloom filter props"
);
}

Expand All @@ -719,35 +675,29 @@ mod tests {
let mut default_table_writer_opts = TableParquetOptions::default();
default_table_writer_opts.global.bloom_filter_on_write = true;
default_table_writer_opts.global.bloom_filter_fpp = Some(0.42);

// the WriterProperties::default, with only fpp set
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.set_bloom_filter_enabled(true)
.set_bloom_filter_fpp(0.42)
.build();

// TODO: should have same behavior in either.
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
// the WriterProperties::default, with only fpp set
let default_writer_props = WriterProperties::builder()
.set_bloom_filter_enabled(true)
.set_bloom_filter_fpp(0.42)
.build();

assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
"parquet and datafusion props, will not have the same bloom filter props",
"parquet and datafusion props, should have the same bloom filter props",
);
assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
None,
"extern parquet's default remains None"
);
assert_eq!(
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
Some(&BloomFilterProperties {
fpp: 0.42,
ndv: DEFAULT_BLOOM_FILTER_NDV
}),
"datafusion's has BloomFilterProperties",
"should have only the fpp set, and the ndv at default",
);
}

Expand All @@ -757,35 +707,29 @@ mod tests {
let mut default_table_writer_opts = TableParquetOptions::default();
default_table_writer_opts.global.bloom_filter_on_write = true;
default_table_writer_opts.global.bloom_filter_ndv = Some(42);

// the WriterProperties::default, with only ndv set
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.set_bloom_filter_enabled(true)
.set_bloom_filter_ndv(42)
.build();

// TODO: should have same behavior in either.
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
// the WriterProperties::default, with only ndv set
let default_writer_props = WriterProperties::builder()
.set_bloom_filter_enabled(true)
.set_bloom_filter_ndv(42)
.build();

assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
"parquet and datafusion props, will not have the same bloom filter props",
"parquet and datafusion props, should have the same bloom filter props",
);
assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
None,
"extern parquet's default remains None"
);
assert_eq!(
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
Some(&BloomFilterProperties {
fpp: DEFAULT_BLOOM_FILTER_FPP,
ndv: 42
}),
"datafusion's has BloomFilterProperties",
"should have only the ndv set, and the fpp at default",
);
}
}
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/information_schema.slt
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ datafusion.execution.parquet.pruning true
datafusion.execution.parquet.pushdown_filters false
datafusion.execution.parquet.reorder_filters false
datafusion.execution.parquet.skip_metadata true
datafusion.execution.parquet.statistics_enabled NULL
datafusion.execution.parquet.statistics_enabled page
datafusion.execution.parquet.write_batch_size 1024
datafusion.execution.parquet.writer_version 1.0
datafusion.execution.planning_concurrency 13
Expand Down Expand Up @@ -288,7 +288,7 @@ datafusion.execution.parquet.pruning true (reading) If true, the parquet reader
datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
datafusion.execution.parquet.statistics_enabled NULL (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes
datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer version valid values are "1.0" and "2.0"
datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system
Expand Down
2 changes: 1 addition & 1 deletion docs/source/user-guide/configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. |
| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes |
| datafusion.execution.parquet.statistics_enabled | NULL | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. |
| datafusion.execution.parquet.created_by | datafusion version 40.0.0 | (writing) Sets "created by" property |
Expand Down

0 comments on commit a598739

Please sign in to comment.