diff --git a/Cargo.toml b/Cargo.toml index b7c8c09a8537..96111ce7bb66 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,21 +77,21 @@ version = "43.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -arrow = { version = "53.3.0", features = [ +arrow = { version = "54.0.0", features = [ "prettyprint", ] } -arrow-array = { version = "53.3.0", default-features = false, features = [ +arrow-array = { version = "54.0.0", default-features = false, features = [ "chrono-tz", ] } -arrow-buffer = { version = "53.3.0", default-features = false } -arrow-flight = { version = "53.3.0", features = [ +arrow-buffer = { version = "54.0.0", default-features = false } +arrow-flight = { version = "54.0.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "53.3.0", default-features = false, features = [ +arrow-ipc = { version = "54.0.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "53.3.0", default-features = false } -arrow-schema = { version = "53.3.0", default-features = false } +arrow-ord = { version = "54.0.0", default-features = false } +arrow-schema = { version = "54.0.0", default-features = false } async-trait = "0.1.73" bigdecimal = "0.4.7" bytes = "1.4" @@ -133,7 +133,7 @@ itertools = "0.13" log = "^0.4" object_store = { version = "0.11.0", default-features = false } parking_lot = "0.12" -parquet = { version = "53.3.0", default-features = false, features = [ +parquet = { version = "54.0.0", default-features = false, features = [ "arrow", "async", "object_store", @@ -177,3 +177,18 @@ large_futures = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] } unused_qualifications = "deny" + +# Temp patch to main of arrow-rs +[patch.crates-io] +arrow = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +parquet = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 9549cfeeb3b8..bcbaae0d2462 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -174,9 +174,8 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91839b07e474b3995035fd8ac33ee54f9c9ccbbb1ea33d9909c71bffdf1259d" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-arith", "arrow-array", @@ -195,24 +194,21 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "855c57c4efd26722b044dcd3e348252560e3e0333087fb9f6479dc0bf744054f" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", "num", ] [[package]] name = "arrow-array" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd03279cea46569acf9295f6224fbc370c5df184b4d2ecfe97ccb131d5615a7f" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "ahash", "arrow-buffer", @@ -227,9 +223,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e4a9b9b1d6d7117f6138e13bc4dd5daa7f94e671b70e8c9c4dc37b4f5ecfc16" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "bytes", "half", @@ -238,9 +233,8 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc70e39916e60c5b7af7a8e2719e3ae589326039e1e863675a008bee5ffe90fd" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", @@ -259,28 +253,23 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789b2af43c1049b03a8d088ff6b2257cdcea1756cd76b174b1f2600356771b97" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e75edf21ffd53744a9b8e3ed11101f610e7ceb1a29860432824f1834a1f623" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-buffer", "arrow-schema", @@ -290,13 +279,11 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d186a909dece9160bf8312f5124d797884f608ef5435a36d9d608e0b2a9bcbf8" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-schema", "flatbuffers", @@ -305,9 +292,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66ff2fedc1222942d0bd2fd391cb14a85baa3857be95c9373179bd616753b85" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", @@ -325,26 +311,21 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece7b5bc1180e6d82d1a60e1688c199829e8842e38497563c3ab6ea813e527fd" +version = "54.0.0" +source = 
"git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half", - "num", ] [[package]] name = "arrow-row" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ - "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -354,15 +335,13 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95513080e728e4cec37f1ff5af4f12c9688d47795d17cda80b6ec2cf74d4678" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" [[package]] name = "arrow-select" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e415279094ea70323c032c6e739c48ad8d80e78a09bef7117b8718ad5bf3722" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "ahash", "arrow-array", @@ -374,9 +353,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d956cae7002eb8d83a27dbd34daaea1cf5b75852f0b84deb4d93a276e92bbf" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", @@ -2885,9 +2863,8 @@ dependencies = [ [[package]] name = "parquet" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "ahash", "arrow-array", @@ -4737,3 +4714,8 @@ dependencies = [ "cc", "pkg-config", ] + +[[patch.unused]] +name = "arrow-flight" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 4cdc2120a029..0354a08cf343 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -30,7 +30,7 @@ rust-version = "1.80.1" readme = "README.md" [dependencies] -arrow = { version = "53.0.0" } +arrow = { version = "54.0.0" } async-trait = "0.1.73" aws-config = "1.5.5" aws-sdk-sso = "1.43.0" @@ -58,7 +58,7 @@ home = "=0.5.9" mimalloc = { version = "0.1", default-features = false } object_store = { version = "0.11.0", features = ["aws", "gcp", "http"] } parking_lot = { version = "0.12" } -parquet = { version = "53.0.0", default-features = false } +parquet = { version = "54.0.0", default-features = false } regex = "1.8" rustyline = "14.0" tokio = { version = "1.24", features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot", "signal"] } @@ -69,3 +69,18 @@ assert_cmd = "2.0" ctor = "0.2.0" predicates = "3.0" rstest = "0.22" + +# Temp patch to main of arrow-rs +[patch.crates-io] +arrow = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-buffer = { git = 
"https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } +parquet = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index a81ec724dd66..6f2d42a4496d 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -61,7 +61,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.22.0", optional = true } +pyo3 = { version = "0.23.3", optional = true } recursive = { workspace = true } sqlparser = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 6e64700bd2e0..ae8b9c8a1858 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -459,10 +459,6 @@ config_namespace! { /// default parquet writer setting pub statistics_enabled: Option, transform = str::to_lowercase, default = Some("page".into()) - /// (writing) Sets max statistics size for any column. If NULL, uses - /// default parquet writer setting - pub max_statistics_size: Option, default = Some(4096) - /// (writing) Target maximum number of rows in each row group (defaults to 1M /// rows). Writing larger row groups requires more memory to write, but /// can get better compression and be faster to read. @@ -1653,10 +1649,6 @@ config_namespace_with_hashmap! { /// Sets bloom filter number of distinct values. If NULL, uses /// default parquet options pub bloom_filter_ndv: Option, default = None - - /// Sets max statistics size for the column path. 
If NULL, uses - /// default parquet options - pub max_statistics_size: Option, default = None } } diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index dd9d67d6bb47..fab50750eb41 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -26,7 +26,7 @@ use parquet::{ basic::{BrotliLevel, GzipLevel, ZstdLevel}, file::properties::{ EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion, - DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED, + DEFAULT_STATISTICS_ENABLED, }, format::KeyValue, schema::types::ColumnPath, @@ -129,11 +129,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder { builder = builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv); } - - if let Some(max_statistics_size) = options.max_statistics_size { - builder = - builder.set_column_max_statistics_size(path, max_statistics_size); - } } Ok(builder) @@ -154,7 +149,6 @@ impl ParquetOptions { dictionary_enabled, dictionary_page_size_limit, statistics_enabled, - max_statistics_size, max_row_group_size, created_by, column_index_truncate_length, @@ -190,9 +184,6 @@ impl ParquetOptions { .and_then(|s| parse_statistics_string(s).ok()) .unwrap_or(DEFAULT_STATISTICS_ENABLED), ) - .set_max_statistics_size( - max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE), - ) .set_max_row_group_size(*max_row_group_size) .set_created_by(created_by.clone()) .set_column_index_truncate_length(*column_index_truncate_length) @@ -395,7 +386,6 @@ mod tests { compression: Some("zstd(22)".into()), dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v), statistics_enabled: Some("none".into()), - max_statistics_size: Some(72), encoding: Some("RLE".into()), bloom_filter_enabled: Some(true), bloom_filter_fpp: Some(0.72), @@ -419,7 +409,6 @@ mod tests { dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)), dictionary_page_size_limit: 42, statistics_enabled: Some("chunk".into()), - max_statistics_size: Some(42), max_row_group_size: 42, created_by: "wordy".into(), column_index_truncate_length: Some(42), @@ -473,7 +462,6 @@ mod tests { ), bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp), bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv), - max_statistics_size: Some(props.max_statistics_size(&col)), } } @@ -523,7 +511,6 @@ mod tests { compression: default_col_props.compression, dictionary_enabled: default_col_props.dictionary_enabled, statistics_enabled: default_col_props.statistics_enabled, - max_statistics_size: default_col_props.max_statistics_size, bloom_filter_on_write: default_col_props .bloom_filter_enabled .unwrap_or_default(), diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs index bdcf831c7884..60dde7861104 100644 --- a/datafusion/common/src/pyarrow.rs +++ b/datafusion/common/src/pyarrow.rs @@ -23,7 +23,7 @@ use arrow_array::Array; use pyo3::exceptions::PyException; use pyo3::prelude::PyErr; use pyo3::types::{PyAnyMethods, PyList}; -use pyo3::{Bound, FromPyObject, IntoPy, PyAny, PyObject, PyResult, Python}; +use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyObject, PyResult, Python}; use crate::{DataFusionError, ScalarValue}; @@ -40,8 +40,8 @@ impl FromPyArrow for ScalarValue { let val = value.call_method0("as_py")?; // construct pyarrow array from the python value and pyarrow type - let factory = py.import_bound("pyarrow")?.getattr("array")?; - let args = 
PyList::new_bound(py, [val]); + let factory = py.import("pyarrow")?.getattr("array")?; + let args = PyList::new(py, [val])?; let array = factory.call1((args, typ))?; // convert the pyarrow array to rust array using C data interface @@ -69,14 +69,25 @@ impl<'source> FromPyObject<'source> for ScalarValue { } } -impl IntoPy for ScalarValue { - fn into_py(self, py: Python) -> PyObject { - self.to_pyarrow(py).unwrap() +impl<'source> IntoPyObject<'source> for ScalarValue { + type Target = PyAny; + + type Output = Bound<'source, Self::Target>; + + type Error = PyErr; + + fn into_pyobject(self, py: Python<'source>) -> Result { + let array = self.to_array()?; + // convert to pyarrow array using C data interface + let pyarray = array.to_data().to_pyarrow(py)?; + let pyarray_bound = pyarray.bind(py); + pyarray_bound.call_method1("__getitem__", (0,)) } } #[cfg(test)] mod tests { + use pyo3::ffi::c_str; use pyo3::prepare_freethreaded_python; use pyo3::py_run; use pyo3::types::PyDict; @@ -86,10 +97,12 @@ mod tests { fn init_python() { prepare_freethreaded_python(); Python::with_gil(|py| { - if py.run_bound("import pyarrow", None, None).is_err() { - let locals = PyDict::new_bound(py); - py.run_bound( - "import sys; executable = sys.executable; python_path = sys.path", + if py.run(c_str!("import pyarrow"), None, None).is_err() { + let locals = PyDict::new(py); + py.run( + c_str!( + "import sys; executable = sys.executable; python_path = sys.path" + ), None, Some(&locals), ) @@ -135,17 +148,25 @@ mod tests { } #[test] - fn test_py_scalar() { + fn test_py_scalar() -> PyResult<()> { init_python(); - Python::with_gil(|py| { + Python::with_gil(|py| -> PyResult<()> { let scalar_float = ScalarValue::Float64(Some(12.34)); - let py_float = scalar_float.into_py(py).call_method0(py, "as_py").unwrap(); + let py_float = scalar_float + .into_pyobject(py)? + .call_method0("as_py") + .unwrap(); py_run!(py, py_float, "assert py_float == 12.34"); let scalar_string = ScalarValue::Utf8(Some("Hello!".to_string())); - let py_string = scalar_string.into_py(py).call_method0(py, "as_py").unwrap(); + let py_string = scalar_string + .into_pyobject(py)? 
+ .call_method0("as_py") + .unwrap(); py_run!(py, py_string, "assert py_string == 'Hello!'"); - }); + + Ok(()) + }) } } diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 383fd6575234..ae52aaaf482d 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -70,9 +70,7 @@ use parquet::arrow::arrow_writer::{ compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter, ArrowLeafColumn, }; -use parquet::arrow::{ - arrow_to_parquet_schema, parquet_to_arrow_schema, AsyncArrowWriter, -}; +use parquet::arrow::{parquet_to_arrow_schema, ArrowSchemaConverter, AsyncArrowWriter}; use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData}; use parquet::file::properties::WriterProperties; use parquet::file::writer::SerializedFileWriter; @@ -934,7 +932,7 @@ fn spawn_column_parallel_row_group_writer( max_buffer_size: usize, pool: &Arc, ) -> Result<(Vec, Vec)> { - let schema_desc = arrow_to_parquet_schema(&schema)?; + let schema_desc = ArrowSchemaConverter::new().convert(&schema)?; let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?; let num_columns = col_writers.len(); @@ -1137,7 +1135,7 @@ async fn concatenate_parallel_row_groups( let mut file_reservation = MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool); - let schema_desc = arrow_to_parquet_schema(schema.as_ref())?; + let schema_desc = ArrowSchemaConverter::new().convert(&schema)?; let mut parquet_writer = SerializedFileWriter::new( merged_buff.clone(), schema_desc.root_schema_ptr(), diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs index 810f74e8515b..b28128239408 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs @@ -456,8 +456,8 @@ mod tests { use datafusion_expr::{cast, col, lit, Expr}; use datafusion_physical_expr::planner::logical2physical; - use parquet::arrow::arrow_to_parquet_schema; use parquet::arrow::async_reader::ParquetObjectReader; + use parquet::arrow::ArrowSchemaConverter; use parquet::basic::LogicalType; use parquet::data_type::{ByteArray, FixedLenByteArray}; use parquet::file::metadata::ColumnChunkMetaData; @@ -744,7 +744,7 @@ mod tests { Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Boolean, false), ])); - let schema_descr = arrow_to_parquet_schema(&schema).unwrap(); + let schema_descr = ArrowSchemaConverter::new().convert(&schema).unwrap(); let expr = col("c1").gt(lit(15)).and(col("c2").is_null()); let expr = logical2physical(&expr, &schema); let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); @@ -773,7 +773,7 @@ mod tests { Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Boolean, false), ])); - let schema_descr = arrow_to_parquet_schema(&schema).unwrap(); + let schema_descr = ArrowSchemaConverter::new().convert(&schema).unwrap(); let expr = col("c1") .gt(lit(15)) .and(col("c2").eq(lit(ScalarValue::Boolean(None)))); diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 2e0103defd9f..0ff8e37b9504 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -2438,25 +2438,21 @@ mod tests { 
"labels".to_string(), DataType::Struct( vec![ - Field::new_dict( + Field::new( "a".to_string(), DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), ), true, - 0, - false, ), - Field::new_dict( + Field::new( "b".to_string(), DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), ), true, - 0, - false, ), ] .into(), @@ -2468,15 +2464,13 @@ mod tests { vec![ Arc::new(StructArray::from(vec![ ( - Arc::new(Field::new_dict( + Arc::new(Field::new( "a".to_string(), DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), ), true, - 0, - false, )), Arc::new( vec![Some("a"), None, Some("a")] @@ -2485,15 +2479,13 @@ mod tests { ) as ArrayRef, ), ( - Arc::new(Field::new_dict( + Arc::new(Field::new( "b".to_string(), DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), ), true, - 0, - false, )), Arc::new( vec![Some("b"), Some("c"), Some("b")] diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index 69626f97fd80..fbe40f6b575f 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -473,10 +473,6 @@ message ParquetColumnOptions { oneof bloom_filter_ndv_opt { uint64 bloom_filter_ndv = 7; } - - oneof max_statistics_size_opt { - uint32 max_statistics_size = 8; - } } message ParquetOptions { @@ -514,10 +510,6 @@ message ParquetOptions { string statistics_enabled = 13; } - oneof max_statistics_size_opt { - uint64 max_statistics_size = 14; - } - oneof column_index_truncate_length_opt { uint64 column_index_truncate_length = 17; } diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index eb6976aa0c06..6da341d86811 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -321,6 +321,8 @@ impl TryFrom<&protobuf::Field> for Field { fn try_from(field: &protobuf::Field) -> Result { let datatype = field.arrow_type.as_deref().required("arrow_type")?; let field = if field.dict_id != 0 { + // TODO file a ticket about handling deprecated dict_id attributes + #[allow(deprecated)] Self::new_dict( field.name.as_str(), datatype, @@ -365,6 +367,8 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { .as_ref() .ok_or_else(|| Error::required("value"))?; + // TODO file a ticket about handling deprecated dict_id attributes + #[allow(deprecated)] Ok(match value { Value::BoolValue(v) => Self::Boolean(Some(*v)), Value::Utf8Value(v) => Self::Utf8(Some(v.to_owned())), @@ -922,12 +926,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt.as_ref() - .map(|opt| match opt { - protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize), - }) - .unwrap_or(None), max_row_group_size: value.max_row_group_size as usize, created_by: value.created_by.clone(), column_index_truncate_length: value @@ -982,12 +980,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions { protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt - .map(|opt| match opt { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize), - }) - .unwrap_or(None), encoding: value 
.encoding_opt.clone() .map(|opt| match opt { diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index e88c1497af08..2b39065a7a65 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -4466,9 +4466,6 @@ impl serde::Serialize for ParquetColumnOptions { if self.bloom_filter_ndv_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?; if let Some(v) = self.bloom_filter_enabled_opt.as_ref() { match v { @@ -4521,13 +4518,6 @@ impl serde::Serialize for ParquetColumnOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - struct_ser.serialize_field("maxStatisticsSize", v)?; - } - } - } struct_ser.end() } } @@ -4550,8 +4540,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "bloomFilterFpp", "bloom_filter_ndv", "bloomFilterNdv", - "max_statistics_size", - "maxStatisticsSize", ]; #[allow(clippy::enum_variant_names)] @@ -4563,7 +4551,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { StatisticsEnabled, BloomFilterFpp, BloomFilterNdv, - MaxStatisticsSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -4592,7 +4579,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -4619,7 +4605,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { let mut statistics_enabled_opt__ = None; let mut bloom_filter_fpp_opt__ = None; let mut bloom_filter_ndv_opt__ = None; - let mut max_statistics_size_opt__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::BloomFilterEnabled => { @@ -4664,12 +4649,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { } bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0)); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } } } Ok(ParquetColumnOptions { @@ -4680,7 +4659,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { statistics_enabled_opt: statistics_enabled_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, bloom_filter_ndv_opt: bloom_filter_ndv_opt__, - max_statistics_size_opt: max_statistics_size_opt__, }) } } @@ -4964,9 +4942,6 @@ impl serde::Serialize for ParquetOptions { if self.statistics_enabled_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } if self.column_index_truncate_length_opt.is_some() { len += 1; } @@ -5081,15 +5056,6 @@ impl serde::Serialize for ParquetOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?; - } - } - } if let Some(v) = self.column_index_truncate_length_opt.as_ref() { match v { parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => { @@ -5176,8 +5142,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "dictionaryEnabled", "statistics_enabled", "statisticsEnabled", - "max_statistics_size", - "maxStatisticsSize", "column_index_truncate_length", "columnIndexTruncateLength", "encoding", @@ -5212,7 +5176,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { Compression, DictionaryEnabled, StatisticsEnabled, - MaxStatisticsSize, ColumnIndexTruncateLength, Encoding, BloomFilterFpp, @@ -5261,7 +5224,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "compression" => Ok(GeneratedField::Compression), "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), "columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength), "encoding" => Ok(GeneratedField::Encoding), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), @@ -5308,7 +5270,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut compression_opt__ = None; let mut dictionary_enabled_opt__ = None; let mut statistics_enabled_opt__ = None; - let mut max_statistics_size_opt__ = None; let mut column_index_truncate_length_opt__ = None; let mut encoding_opt__ = None; let mut bloom_filter_fpp_opt__ = None; @@ -5467,12 +5428,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return 
Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } GeneratedField::ColumnIndexTruncateLength => { if column_index_truncate_length_opt__.is_some() { return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength")); @@ -5523,7 +5478,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { compression_opt: compression_opt__, dictionary_enabled_opt: dictionary_enabled_opt__, statistics_enabled_opt: statistics_enabled_opt__, - max_statistics_size_opt: max_statistics_size_opt__, column_index_truncate_length_opt: column_index_truncate_length_opt__, encoding_opt: encoding_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index 6b8509775847..1d2c44ce96d6 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -664,10 +664,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -706,11 +702,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -785,10 +776,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -823,11 +810,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index a7cea607cb6d..bfa96696dfde 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -97,6 +97,8 @@ impl TryFrom<&Field> for protobuf::Field { nullable: field.is_nullable(), children: Vec::new(), metadata: field.metadata().clone(), + // TODO file a ticket about handling deprecated dict_id attributes + #[allow(deprecated)] dict_id: field.dict_id().unwrap_or(0), dict_ordered: field.dict_is_ordered().unwrap_or(false), }) @@ -818,7 +820,6 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled), 
dictionary_page_size_limit: value.dictionary_page_size_limit as u64, statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled), - max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)), max_row_group_size: value.max_row_group_size as u64, created_by: value.created_by.clone(), column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)), @@ -855,11 +856,6 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions { .statistics_enabled .clone() .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled), - max_statistics_size_opt: value.max_statistics_size.map(|v| { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize( - v as u32, - ) - }), encoding_opt: value .encoding .clone() diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index 6b8509775847..1d2c44ce96d6 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -664,10 +664,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -706,11 +702,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -785,10 +776,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -823,11 +810,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 62405b2fef21..5c02170e8f60 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -385,9 +385,6 @@ impl TableParquetOptionsProto { statistics_enabled_opt: global_options.global.statistics_enabled.map(|enabled| { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(enabled) }), - max_statistics_size_opt: global_options.global.max_statistics_size.map(|size| { - 
parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u64) - }), max_row_group_size: global_options.global.max_row_group_size as u64, created_by: global_options.global.created_by.clone(), column_index_truncate_length_opt: global_options.global.column_index_truncate_length.map(|length| { @@ -436,9 +433,6 @@ impl TableParquetOptionsProto { bloom_filter_ndv_opt: options.bloom_filter_ndv.map(|ndv| { parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(ndv) }), - max_statistics_size_opt: options.max_statistics_size.map(|size| { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u32) - }), }) } }).collect(), @@ -476,9 +470,6 @@ impl From<&ParquetOptionsProto> for ParquetOptions { statistics_enabled: proto.statistics_enabled_opt.as_ref().map(|opt| match opt { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(statistics) => statistics.clone(), }), - max_statistics_size: proto.max_statistics_size_opt.as_ref().map(|opt| match opt { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size) => *size as usize, - }), max_row_group_size: proto.max_row_group_size as usize, created_by: proto.created_by.clone(), column_index_truncate_length: proto.column_index_truncate_length_opt.as_ref().map(|opt| match opt { @@ -529,11 +520,6 @@ impl From for ParquetColumnOptions { bloom_filter_ndv: proto .bloom_filter_ndv_opt .map(|parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(v)| v), - max_statistics_size: proto.max_statistics_size_opt.map( - |parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v)| { - v as usize - }, - ), } } } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index c0885ece08bc..a2d1a28bb8e1 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -1777,6 +1777,8 @@ fn round_trip_datatype() { } } +// TODO file a ticket about handling deprecated dict_id attributes +#[allow(deprecated)] #[test] fn roundtrip_dict_id() -> Result<()> { let dict_id = 42; diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index caa708483a11..498903927e7a 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -320,7 +320,6 @@ OPTIONS ( 'format.dictionary_enabled' false, 'format.statistics_enabled' page, 'format.statistics_enabled::col2' none, -'format.max_statistics_size' 123, 'format.bloom_filter_fpp' 0.001, 'format.bloom_filter_ndv' 100, 'format.metadata::key' 'value' diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 7d70cd9db53e..a02975b2ef85 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -207,7 +207,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL datafusion.execution.parquet.max_row_group_size 1048576 -datafusion.execution.parquet.max_statistics_size 4096 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 datafusion.execution.parquet.metadata_size_hint NULL @@ -300,7 +299,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b datafusion.execution.parquet.enable_page_index true (reading) 
If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. -datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer
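
Reviewer note (not part of the patch) on the pyo3 0.22 -> 0.23 migration in datafusion/common/src/pyarrow.rs above: pyo3 0.23 drops the `*_bound` constructors (`py.import_bound`, `PyList::new_bound`, and `py.run_bound` become `py.import`, the now-fallible `PyList::new`, and `py.run`, which takes a `&CStr`, hence the `c_str!` macro) and replaces the `IntoPy<PyObject>` trait with `IntoPyObject`, which carries `Target`/`Output`/`Error` associated types. A minimal sketch of the new trait shape for a hypothetical wrapper type — `MyValue` is illustrative only, and the `PyFloat::new` call assumes pyo3 0.23, where it returns a `Bound` pointer directly:

use pyo3::prelude::*;
use pyo3::types::PyFloat;

// Hypothetical wrapper used only to illustrate the 0.23 trait shape.
struct MyValue(f64);

impl<'py> IntoPyObject<'py> for MyValue {
    // The Python type the conversion produces.
    type Target = PyFloat;
    // A GIL-bound smart pointer to that type.
    type Output = Bound<'py, Self::Target>;
    // The conversion here cannot fail, but PyErr keeps the signature
    // uniform with fallible conversions such as ScalarValue's above.
    type Error = PyErr;

    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
        // In pyo3 0.23, PyFloat::new returns Bound<'py, PyFloat> directly.
        Ok(PyFloat::new(py, self.0))
    }
}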
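
Reviewer note (not part of the patch) on the schema-conversion change in datafusion/core/src/datasource/file_format/parquet.rs and the row_group_filter tests above: parquet 54 replaces the free function `arrow_to_parquet_schema` with the builder-style `ArrowSchemaConverter`, which is why every call site changes to `ArrowSchemaConverter::new().convert(&schema)`. A minimal standalone sketch of the new call, assuming arrow/parquet 54:

use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::ArrowSchemaConverter;

fn main() -> parquet::errors::Result<()> {
    // A small Arrow schema to convert, mirroring the test fixtures above.
    let schema = Schema::new(vec![
        Field::new("c1", DataType::Int32, false),
        Field::new("c2", DataType::Boolean, false),
    ]);
    // parquet <= 53: let desc = arrow_to_parquet_schema(&schema)?;
    // parquet 54: the converter is a builder, so conversion options can be
    // configured before calling convert().
    let desc = ArrowSchemaConverter::new().convert(&schema)?;
    println!("{} leaf columns", desc.num_columns());
    Ok(())
}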
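
Reviewer note (not part of the patch) on the dict_id changes: arrow 54 deprecates `Field::new_dict` and the per-field `dict_id`, so the aggregate tests above switch to plain `Field::new`, while the proto round-trip code keeps the deprecated calls under `#[allow(deprecated)]` pending the follow-up ticket mentioned in the TODOs. A minimal sketch of the replacement, assuming arrow 54:

use arrow_schema::{DataType, Field};

fn main() {
    // arrow <= 53: Field::new_dict("a", dict_type, true, /* dict_id */ 0, /* ordered */ false)
    // arrow 54: a dictionary-typed field is constructed like any other field.
    let dict_type =
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
    let field = Field::new("a", dict_type, true);
    assert!(field.is_nullable());
}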