From 5554b92310056ba143105b93014c7cd0cb7f557c Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 5 Dec 2024 09:30:36 -0500
Subject: [PATCH 1/7] Temp patch to new arrow-rs release

---
 Cargo.toml                   | 31 +++++++++----
 datafusion-cli/Cargo.lock    | 88 ++++++++++++++----------------------
 datafusion-cli/Cargo.toml    | 19 +++++++-
 datafusion/common/Cargo.toml |  2 +-
 4 files changed, 76 insertions(+), 64 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index b7c8c09a8537..96111ce7bb66 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -77,21 +77,21 @@ version = "43.0.0"
 ahash = { version = "0.8", default-features = false, features = [
     "runtime-rng",
 ] }
-arrow = { version = "53.3.0", features = [
+arrow = { version = "54.0.0", features = [
     "prettyprint",
 ] }
-arrow-array = { version = "53.3.0", default-features = false, features = [
+arrow-array = { version = "54.0.0", default-features = false, features = [
     "chrono-tz",
 ] }
-arrow-buffer = { version = "53.3.0", default-features = false }
-arrow-flight = { version = "53.3.0", features = [
+arrow-buffer = { version = "54.0.0", default-features = false }
+arrow-flight = { version = "54.0.0", features = [
     "flight-sql-experimental",
 ] }
-arrow-ipc = { version = "53.3.0", default-features = false, features = [
+arrow-ipc = { version = "54.0.0", default-features = false, features = [
     "lz4",
 ] }
-arrow-ord = { version = "53.3.0", default-features = false }
-arrow-schema = { version = "53.3.0", default-features = false }
+arrow-ord = { version = "54.0.0", default-features = false }
+arrow-schema = { version = "54.0.0", default-features = false }
 async-trait = "0.1.73"
 bigdecimal = "0.4.7"
 bytes = "1.4"
@@ -133,7 +133,7 @@ itertools = "0.13"
 log = "^0.4"
 object_store = { version = "0.11.0", default-features = false }
 parking_lot = "0.12"
-parquet = { version = "53.3.0", default-features = false, features = [
+parquet = { version = "54.0.0", default-features = false, features = [
     "arrow",
     "async",
     "object_store",
@@ -177,3 +177,18 @@ large_futures = "warn"
 [workspace.lints.rust]
 unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] }
 unused_qualifications = "deny"
+
+# Temp patch to the arrow-rs 54.0.0 release tag
+[patch.crates-io]
+arrow = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-array = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-cast = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-data = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-schema = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-select = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-string = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-ord = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-flight = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+parquet = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index a435869dbece..49b529facc9d 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -174,9 +174,8 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
 [[package]]
 name = "arrow"
-version = "53.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c91839b07e474b3995035fd8ac33ee54f9c9ccbbb1ea33d9909c71bffdf1259d" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-arith", "arrow-array", @@ -195,24 +194,21 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "855c57c4efd26722b044dcd3e348252560e3e0333087fb9f6479dc0bf744054f" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", "num", ] [[package]] name = "arrow-array" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd03279cea46569acf9295f6224fbc370c5df184b4d2ecfe97ccb131d5615a7f" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "ahash", "arrow-buffer", @@ -227,9 +223,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e4a9b9b1d6d7117f6138e13bc4dd5daa7f94e671b70e8c9c4dc37b4f5ecfc16" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "bytes", "half", @@ -238,9 +233,8 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc70e39916e60c5b7af7a8e2719e3ae589326039e1e863675a008bee5ffe90fd" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", @@ -259,28 +253,23 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789b2af43c1049b03a8d088ff6b2257cdcea1756cd76b174b1f2600356771b97" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e75edf21ffd53744a9b8e3ed11101f610e7ceb1a29860432824f1834a1f623" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-buffer", "arrow-schema", @@ -290,13 +279,11 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d186a909dece9160bf8312f5124d797884f608ef5435a36d9d608e0b2a9bcbf8" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-schema", "flatbuffers", @@ -305,9 +292,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66ff2fedc1222942d0bd2fd391cb14a85baa3857be95c9373179bd616753b85" +version = "54.0.0" +source = 
"git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", @@ -325,26 +311,21 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece7b5bc1180e6d82d1a60e1688c199829e8842e38497563c3ab6ea813e527fd" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half", - "num", ] [[package]] name = "arrow-row" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ - "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -354,15 +335,13 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95513080e728e4cec37f1ff5af4f12c9688d47795d17cda80b6ec2cf74d4678" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" [[package]] name = "arrow-select" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e415279094ea70323c032c6e739c48ad8d80e78a09bef7117b8718ad5bf3722" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "ahash", "arrow-array", @@ -374,9 +353,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d956cae7002eb8d83a27dbd34daaea1cf5b75852f0b84deb4d93a276e92bbf" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "arrow-array", "arrow-buffer", @@ -2875,9 +2853,8 @@ dependencies = [ [[package]] name = "parquet" -version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" dependencies = [ "ahash", "arrow-array", @@ -4727,3 +4704,8 @@ dependencies = [ "cc", "pkg-config", ] + +[[patch.unused]] +name = "arrow-flight" +version = "54.0.0" +source = "git+https://github.com/apache/arrow-rs.git?tag=54.0.0#2887cc1030b2954ffcaba30f6a6d566b7017dc25" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 4cdc2120a029..0354a08cf343 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -30,7 +30,7 @@ rust-version = "1.80.1" readme = "README.md" [dependencies] -arrow = { version = "53.0.0" } +arrow = { version = "54.0.0" } async-trait = "0.1.73" aws-config = "1.5.5" aws-sdk-sso = "1.43.0" @@ -58,7 +58,7 @@ home = "=0.5.9" mimalloc = { version = "0.1", default-features = false } object_store = { version = "0.11.0", features = ["aws", "gcp", "http"] } parking_lot = { version = "0.12" } -parquet = { version = "53.0.0", default-features = false } +parquet = { version = "54.0.0", default-features = false } regex = "1.8" rustyline = "14.0" tokio = { version = "1.24", 
features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot", "signal"] }
@@ -69,3 +69,18 @@ assert_cmd = "2.0"
 ctor = "0.2.0"
 predicates = "3.0"
 rstest = "0.22"
+
+# Temp patch to the arrow-rs 54.0.0 release tag
+[patch.crates-io]
+arrow = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-array = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-cast = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-data = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-schema = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-select = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-string = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-ord = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+arrow-flight = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
+parquet = { git = "https://github.com/apache/arrow-rs.git", tag = "54.0.0" }
diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml
index a81ec724dd66..6f2d42a4496d 100644
--- a/datafusion/common/Cargo.toml
+++ b/datafusion/common/Cargo.toml
@@ -61,7 +61,7 @@ log = { workspace = true }
 object_store = { workspace = true, optional = true }
 parquet = { workspace = true, optional = true, default-features = true }
 paste = "1.0.15"
-pyo3 = { version = "0.22.0", optional = true }
+pyo3 = { version = "0.23.3", optional = true }
 recursive = { workspace = true }
 sqlparser = { workspace = true }
 tokio = { workspace = true }

From 4991cc4c1e564e7480265ecbe5cc60cf4ee3f9ec Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 5 Dec 2024 09:45:05 -0500
Subject: [PATCH 2/7] chore: Update for API changes

---
 datafusion/core/src/datasource/file_format/parquet.rs    | 8 +++-----
 .../datasource/physical_plan/parquet/row_group_filter.rs | 6 +++---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 383fd6575234..ae52aaaf482d 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -70,9 +70,7 @@ use parquet::arrow::arrow_writer::{
     compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter,
     ArrowLeafColumn,
 };
-use parquet::arrow::{
-    arrow_to_parquet_schema, parquet_to_arrow_schema, AsyncArrowWriter,
-};
+use parquet::arrow::{parquet_to_arrow_schema, ArrowSchemaConverter, AsyncArrowWriter};
 use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData};
 use parquet::file::properties::WriterProperties;
 use parquet::file::writer::SerializedFileWriter;
@@ -934,7 +932,7 @@ fn spawn_column_parallel_row_group_writer(
     max_buffer_size: usize,
     pool: &Arc<dyn MemoryPool>,
 ) -> Result<(Vec<ColumnWriterTask>, Vec<ColSender>)> {
-    let schema_desc = arrow_to_parquet_schema(&schema)?;
+    let schema_desc = ArrowSchemaConverter::new().convert(&schema)?;
     let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?;
     let num_columns = col_writers.len();
 
@@ -1137,7 +1135,7 @@ async fn concatenate_parallel_row_groups(
     let mut file_reservation =
         MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool);
 
-    let schema_desc = arrow_to_parquet_schema(schema.as_ref())?;
+    let schema_desc = ArrowSchemaConverter::new().convert(&schema)?;
     let mut parquet_writer = SerializedFileWriter::new(
         merged_buff.clone(),
         schema_desc.root_schema_ptr(),
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs
index 810f74e8515b..b28128239408 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs
@@ -456,8 +456,8 @@ mod tests {
     use datafusion_expr::{cast, col, lit, Expr};
     use datafusion_physical_expr::planner::logical2physical;
 
-    use parquet::arrow::arrow_to_parquet_schema;
     use parquet::arrow::async_reader::ParquetObjectReader;
+    use parquet::arrow::ArrowSchemaConverter;
     use parquet::basic::LogicalType;
     use parquet::data_type::{ByteArray, FixedLenByteArray};
     use parquet::file::metadata::ColumnChunkMetaData;
@@ -744,7 +744,7 @@ mod tests {
             Field::new("c1", DataType::Int32, false),
             Field::new("c2", DataType::Boolean, false),
         ]));
-        let schema_descr = arrow_to_parquet_schema(&schema).unwrap();
+        let schema_descr = ArrowSchemaConverter::new().convert(&schema).unwrap();
         let expr = col("c1").gt(lit(15)).and(col("c2").is_null());
         let expr = logical2physical(&expr, &schema);
         let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap();
@@ -773,7 +773,7 @@ mod tests {
             Field::new("c1", DataType::Int32, false),
             Field::new("c2", DataType::Boolean, false),
         ]));
-        let schema_descr = arrow_to_parquet_schema(&schema).unwrap();
+        let schema_descr = ArrowSchemaConverter::new().convert(&schema).unwrap();
         let expr = col("c1")
             .gt(lit(15))
             .and(col("c2").eq(lit(ScalarValue::Boolean(None))));

From f13b588a10eee5bb0dddb44b2a574137837bd9c4 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Sat, 7 Dec 2024 07:13:20 -0500
Subject: [PATCH 3/7] Update for new pyo3 API

---
 datafusion/common/src/pyarrow.rs | 51 ++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs
index bdcf831c7884..60dde7861104 100644
--- a/datafusion/common/src/pyarrow.rs
+++ b/datafusion/common/src/pyarrow.rs
@@ -23,7 +23,7 @@ use arrow_array::Array;
 use pyo3::exceptions::PyException;
 use pyo3::prelude::PyErr;
 use pyo3::types::{PyAnyMethods, PyList};
-use pyo3::{Bound, FromPyObject, IntoPy, PyAny, PyObject, PyResult, Python};
+use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyObject, PyResult, Python};
 
 use crate::{DataFusionError, ScalarValue};
 
@@ -40,8 +40,8 @@ impl FromPyArrow for ScalarValue {
         let val = value.call_method0("as_py")?;
 
         // construct pyarrow array from the python value and pyarrow type
-        let factory = py.import_bound("pyarrow")?.getattr("array")?;
-        let args = PyList::new_bound(py, [val]);
+        let factory = py.import("pyarrow")?.getattr("array")?;
+        let args = PyList::new(py, [val])?;
         let array = factory.call1((args, typ))?;
 
         // convert the pyarrow array to rust array using C data interface
@@ -69,14 +69,25 @@ impl<'source> FromPyObject<'source> for ScalarValue {
     }
 }
 
-impl IntoPy<PyObject> for ScalarValue {
-    fn into_py(self, py: Python) -> PyObject {
-        self.to_pyarrow(py).unwrap()
+impl<'source> IntoPyObject<'source> for ScalarValue {
+    type Target = PyAny;
+
+    type Output = Bound<'source, Self::Target>;
+
+    type Error = PyErr;
+
+    fn into_pyobject(self, py: Python<'source>) -> Result<Self::Output, Self::Error> {
+        let array = self.to_array()?;
+        // convert to pyarrow array using C data interface
+        let pyarray = array.to_data().to_pyarrow(py)?;
+        let pyarray_bound = pyarray.bind(py);
+        pyarray_bound.call_method1("__getitem__", (0,))
     }
 }
 
 #[cfg(test)]
 mod tests {
+    use pyo3::ffi::c_str;
     use pyo3::prepare_freethreaded_python;
     use pyo3::py_run;
     use pyo3::types::PyDict;
@@ -86,10 +97,12 @@ mod tests {
     fn init_python() {
         prepare_freethreaded_python();
         Python::with_gil(|py| {
-            if py.run_bound("import pyarrow", None, None).is_err() {
-                let locals = PyDict::new_bound(py);
-                py.run_bound(
-                    "import sys; executable = sys.executable; python_path = sys.path",
+            if py.run(c_str!("import pyarrow"), None, None).is_err() {
+                let locals = PyDict::new(py);
+                py.run(
+                    c_str!(
+                        "import sys; executable = sys.executable; python_path = sys.path"
+                    ),
                     None,
                     Some(&locals),
                 )
@@ -135,17 +148,25 @@ mod tests {
     }
 
     #[test]
-    fn test_py_scalar() {
+    fn test_py_scalar() -> PyResult<()> {
         init_python();
-        Python::with_gil(|py| {
+        Python::with_gil(|py| -> PyResult<()> {
             let scalar_float = ScalarValue::Float64(Some(12.34));
-            let py_float = scalar_float.into_py(py).call_method0(py, "as_py").unwrap();
+            let py_float = scalar_float
+                .into_pyobject(py)?
+                .call_method0("as_py")
+                .unwrap();
             py_run!(py, py_float, "assert py_float == 12.34");
 
             let scalar_string = ScalarValue::Utf8(Some("Hello!".to_string()));
-            let py_string = scalar_string.into_py(py).call_method0(py, "as_py").unwrap();
+            let py_string = scalar_string
+                .into_pyobject(py)?
+                .call_method0("as_py")
+                .unwrap();
             py_run!(py, py_string, "assert py_string == 'Hello!'");
-        });
+
+            Ok(())
+        })
     }
 }

From 00a66bdb1476f483ca96d17bc178c3d09b05ff3a Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 16 Dec 2024 14:06:42 -0500
Subject: [PATCH 4/7] Allow deprecated methods for dict_id

---
 datafusion/physical-plan/src/aggregates/mod.rs  | 16 ++++------------
 datafusion/proto-common/src/from_proto/mod.rs   |  4 ++++
 datafusion/proto-common/src/to_proto/mod.rs     |  2 ++
 .../proto/tests/cases/roundtrip_logical_plan.rs |  2 ++
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
index 2e0103defd9f..0ff8e37b9504 100644
--- a/datafusion/physical-plan/src/aggregates/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -2438,25 +2438,21 @@ mod tests {
             "labels".to_string(),
             DataType::Struct(
                 vec![
-                    Field::new_dict(
+                    Field::new(
                         "a".to_string(),
                         DataType::Dictionary(
                             Box::new(DataType::Int32),
                             Box::new(DataType::Utf8),
                         ),
                         true,
-                        0,
-                        false,
                     ),
-                    Field::new_dict(
+                    Field::new(
                         "b".to_string(),
                         DataType::Dictionary(
                             Box::new(DataType::Int32),
                             Box::new(DataType::Utf8),
                         ),
                         true,
-                        0,
-                        false,
                     ),
                 ]
                 .into(),
@@ -2468,15 +2464,13 @@ mod tests {
             vec![
                 Arc::new(StructArray::from(vec![
                     (
-                        Arc::new(Field::new_dict(
+                        Arc::new(Field::new(
                             "a".to_string(),
                             DataType::Dictionary(
                                 Box::new(DataType::Int32),
                                 Box::new(DataType::Utf8),
                             ),
                             true,
-                            0,
-                            false,
                         )),
                         Arc::new(
                             vec![Some("a"), None, Some("a")]
                                 .into_iter()
                                 .collect::<DictionaryArray<Int32Type>>(),
                         ) as ArrayRef,
                     ),
                     (
@@ -2485,15 +2479,13 @@ mod tests {
-                        Arc::new(Field::new_dict(
+                        Arc::new(Field::new(
                             "b".to_string(),
                             DataType::Dictionary(
                                 Box::new(DataType::Int32),
                                 Box::new(DataType::Utf8),
                             ),
                             true,
-                            0,
-                            false,
                         )),
                         Arc::new(
                             vec![Some("b"), Some("c"), Some("b")]
                                 .into_iter()
                                 .collect::<DictionaryArray<Int32Type>>(),
diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs
index eb6976aa0c06..787a750493bf 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -321,6 +321,8 @@ impl TryFrom<&protobuf::Field> for Field {
     fn try_from(field: &protobuf::Field) -> Result<Self, Self::Error> {
         let datatype = field.arrow_type.as_deref().required("arrow_type")?;
         let field = if field.dict_id != 0 {
+            // TODO file a ticket about handling deprecated dict_id attributes
+            #[allow(deprecated)]
             Self::new_dict(
                 field.name.as_str(),
                 datatype,
@@ -365,6 +367,8 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
             .as_ref()
             .ok_or_else(|| Error::required("value"))?;
 
+        // TODO file a ticket about handling deprecated dict_id attributes
+        #[allow(deprecated)]
         Ok(match value {
             Value::BoolValue(v) => Self::Boolean(Some(*v)),
             Value::Utf8Value(v) => Self::Utf8(Some(v.to_owned())),
diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs
index a7cea607cb6d..15b261f9cf30 100644
--- a/datafusion/proto-common/src/to_proto/mod.rs
+++ b/datafusion/proto-common/src/to_proto/mod.rs
@@ -97,6 +97,8 @@ impl TryFrom<&Field> for protobuf::Field {
             nullable: field.is_nullable(),
             children: Vec::new(),
             metadata: field.metadata().clone(),
+            // TODO file a ticket about handling deprecated dict_id attributes
+            #[allow(deprecated)]
             dict_id: field.dict_id().unwrap_or(0),
             dict_ordered: field.dict_is_ordered().unwrap_or(false),
         })
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index c0885ece08bc..a2d1a28bb8e1 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -1777,6 +1777,8 @@ fn round_trip_datatype() {
     }
 }
 
+// TODO file a ticket about handling deprecated dict_id attributes
+#[allow(deprecated)]
 #[test]
 fn roundtrip_dict_id() -> Result<()> {
     let dict_id = 42;

From 2fe785b7ad904e4f89142d4b545df98431a0b3fe Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Tue, 17 Dec 2024 14:03:29 -0500
Subject: [PATCH 5/7] remove unused max_statistics_size field

---
 datafusion/common/src/config.rs               |  8 ----
 .../common/src/file_options/parquet_writer.rs | 15 +-----
 .../proto/datafusion_common.proto             |  8 ----
 datafusion/proto-common/src/from_proto/mod.rs | 12 -----
 .../proto-common/src/generated/pbjson.rs      | 46 -------------------
 .../proto-common/src/generated/prost.rs       | 18 --------
 datafusion/proto-common/src/to_proto/mod.rs   |  6 ---
 .../src/generated/datafusion_proto_common.rs  | 18 --------
 .../proto/src/logical_plan/file_formats.rs    | 14 ------
 9 files changed, 1 insertion(+), 144 deletions(-)

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 6e64700bd2e0..ae8b9c8a1858 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -459,10 +459,6 @@ config_namespace! {
         /// default parquet writer setting
        pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())
 
-        /// (writing) Sets max statistics size for any column. If NULL, uses
-        /// default parquet writer setting
-        pub max_statistics_size: Option<usize>, default = Some(4096)
-
        /// (writing) Target maximum number of rows in each row group (defaults to 1M
        /// rows). Writing larger row groups requires more memory to write, but
        /// can get better compression and be faster to read.
@@ -1653,10 +1649,6 @@ config_namespace_with_hashmap! {
         /// Sets bloom filter number of distinct values. If NULL, uses
         /// default parquet options
         pub bloom_filter_ndv: Option<u64>, default = None
-
-        /// Sets max statistics size for the column path. If NULL, uses
-        /// default parquet options
-        pub max_statistics_size: Option<usize>, default = None
     }
 }

diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs
index dd9d67d6bb47..fab50750eb41 100644
--- a/datafusion/common/src/file_options/parquet_writer.rs
+++ b/datafusion/common/src/file_options/parquet_writer.rs
@@ -26,7 +26,7 @@ use parquet::{
     basic::{BrotliLevel, GzipLevel, ZstdLevel},
     file::properties::{
         EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion,
-        DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED,
+        DEFAULT_STATISTICS_ENABLED,
     },
     format::KeyValue,
     schema::types::ColumnPath,
@@ -129,11 +129,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
                 builder =
                     builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
             }
-
-            if let Some(max_statistics_size) = options.max_statistics_size {
-                builder =
-                    builder.set_column_max_statistics_size(path, max_statistics_size);
-            }
         }
 
         Ok(builder)
@@ -154,7 +149,6 @@ impl ParquetOptions {
             dictionary_enabled,
             dictionary_page_size_limit,
             statistics_enabled,
-            max_statistics_size,
             max_row_group_size,
             created_by,
             column_index_truncate_length,
@@ -190,9 +184,6 @@ impl ParquetOptions {
                     .and_then(|s| parse_statistics_string(s).ok())
                     .unwrap_or(DEFAULT_STATISTICS_ENABLED),
             )
-            .set_max_statistics_size(
-                max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
-            )
             .set_max_row_group_size(*max_row_group_size)
             .set_created_by(created_by.clone())
             .set_column_index_truncate_length(*column_index_truncate_length)
@@ -395,7 +386,6 @@ mod tests {
             compression: Some("zstd(22)".into()),
             dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
             statistics_enabled: Some("none".into()),
-            max_statistics_size: Some(72),
             encoding: Some("RLE".into()),
             bloom_filter_enabled: Some(true),
             bloom_filter_fpp: Some(0.72),
@@ -419,7 +409,6 @@ mod tests {
             dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)),
             dictionary_page_size_limit: 42,
             statistics_enabled: Some("chunk".into()),
-            max_statistics_size: Some(42),
             max_row_group_size: 42,
             created_by: "wordy".into(),
             column_index_truncate_length: Some(42),
@@ -473,7 +462,6 @@ mod tests {
             ),
             bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
             bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
-            max_statistics_size: Some(props.max_statistics_size(&col)),
         }
     }
@@ -523,7 +511,6 @@ mod tests {
             compression: default_col_props.compression,
             dictionary_enabled: default_col_props.dictionary_enabled,
             statistics_enabled: default_col_props.statistics_enabled,
-            max_statistics_size: default_col_props.max_statistics_size,
             bloom_filter_on_write: default_col_props
                 .bloom_filter_enabled
                 .unwrap_or_default(),
diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto
index 69626f97fd80..fbe40f6b575f 100644
--- a/datafusion/proto-common/proto/datafusion_common.proto
+++ b/datafusion/proto-common/proto/datafusion_common.proto
@@ -473,10 +473,6 @@ message ParquetColumnOptions {
   oneof bloom_filter_ndv_opt {
     uint64 bloom_filter_ndv = 7;
   }
-
-  oneof max_statistics_size_opt {
-    uint32 max_statistics_size = 8;
-  }
 }

 message ParquetOptions {
@@ -514,10 +510,6 @@ message ParquetOptions {
     string statistics_enabled = 13;
   }

-  oneof max_statistics_size_opt {
-    uint64 max_statistics_size = 14;
-  }
-
   oneof column_index_truncate_length_opt {
     uint64 column_index_truncate_length = 17;
   }
diff --git
a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index 787a750493bf..6da341d86811 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -926,12 +926,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt.as_ref() - .map(|opt| match opt { - protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize), - }) - .unwrap_or(None), max_row_group_size: value.max_row_group_size as usize, created_by: value.created_by.clone(), column_index_truncate_length: value @@ -986,12 +980,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions { protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt - .map(|opt| match opt { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize), - }) - .unwrap_or(None), encoding: value .encoding_opt.clone() .map(|opt| match opt { diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index e88c1497af08..2b39065a7a65 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -4466,9 +4466,6 @@ impl serde::Serialize for ParquetColumnOptions { if self.bloom_filter_ndv_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?; if let Some(v) = self.bloom_filter_enabled_opt.as_ref() { match v { @@ -4521,13 +4518,6 @@ impl serde::Serialize for ParquetColumnOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - struct_ser.serialize_field("maxStatisticsSize", v)?; - } - } - } struct_ser.end() } } @@ -4550,8 +4540,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "bloomFilterFpp", "bloom_filter_ndv", "bloomFilterNdv", - "max_statistics_size", - "maxStatisticsSize", ]; #[allow(clippy::enum_variant_names)] @@ -4563,7 +4551,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { StatisticsEnabled, BloomFilterFpp, BloomFilterNdv, - MaxStatisticsSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -4592,7 +4579,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -4619,7 +4605,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { let mut statistics_enabled_opt__ = None; let mut bloom_filter_fpp_opt__ = None; let mut bloom_filter_ndv_opt__ = None; - let mut max_statistics_size_opt__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::BloomFilterEnabled => { @@ -4664,12 +4649,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { } bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0)); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } } } Ok(ParquetColumnOptions { @@ -4680,7 +4659,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { statistics_enabled_opt: statistics_enabled_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, bloom_filter_ndv_opt: bloom_filter_ndv_opt__, - max_statistics_size_opt: max_statistics_size_opt__, }) } } @@ -4964,9 +4942,6 @@ impl serde::Serialize for ParquetOptions { if self.statistics_enabled_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } if self.column_index_truncate_length_opt.is_some() { len += 1; } @@ -5081,15 +5056,6 @@ impl serde::Serialize for ParquetOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?; - } - } - } if let Some(v) = self.column_index_truncate_length_opt.as_ref() { match v { parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => { @@ -5176,8 +5142,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "dictionaryEnabled", "statistics_enabled", "statisticsEnabled", - "max_statistics_size", - "maxStatisticsSize", "column_index_truncate_length", "columnIndexTruncateLength", "encoding", @@ -5212,7 +5176,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { Compression, DictionaryEnabled, StatisticsEnabled, - MaxStatisticsSize, ColumnIndexTruncateLength, Encoding, BloomFilterFpp, @@ -5261,7 +5224,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "compression" => Ok(GeneratedField::Compression), "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), "columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength), "encoding" => Ok(GeneratedField::Encoding), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), @@ -5308,7 +5270,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut compression_opt__ = None; let mut dictionary_enabled_opt__ = None; let mut statistics_enabled_opt__ = None; - let mut max_statistics_size_opt__ = None; let mut column_index_truncate_length_opt__ = None; let mut encoding_opt__ = None; let mut bloom_filter_fpp_opt__ = None; @@ -5467,12 +5428,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return 
Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } GeneratedField::ColumnIndexTruncateLength => { if column_index_truncate_length_opt__.is_some() { return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength")); @@ -5523,7 +5478,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { compression_opt: compression_opt__, dictionary_enabled_opt: dictionary_enabled_opt__, statistics_enabled_opt: statistics_enabled_opt__, - max_statistics_size_opt: max_statistics_size_opt__, column_index_truncate_length_opt: column_index_truncate_length_opt__, encoding_opt: encoding_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index 6b8509775847..1d2c44ce96d6 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -664,10 +664,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -706,11 +702,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -785,10 +776,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -823,11 +810,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 15b261f9cf30..bfa96696dfde 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -820,7 +820,6 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled), dictionary_page_size_limit: value.dictionary_page_size_limit as u64, statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled), - max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)), 
max_row_group_size: value.max_row_group_size as u64, created_by: value.created_by.clone(), column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)), @@ -857,11 +856,6 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions { .statistics_enabled .clone() .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled), - max_statistics_size_opt: value.max_statistics_size.map(|v| { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize( - v as u32, - ) - }), encoding_opt: value .encoding .clone() diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index 6b8509775847..1d2c44ce96d6 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -664,10 +664,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -706,11 +702,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -785,10 +776,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -823,11 +810,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 62405b2fef21..5c02170e8f60 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -385,9 +385,6 @@ impl TableParquetOptionsProto { statistics_enabled_opt: global_options.global.statistics_enabled.map(|enabled| { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(enabled) }), - max_statistics_size_opt: global_options.global.max_statistics_size.map(|size| { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u64) - }), max_row_group_size: global_options.global.max_row_group_size as u64, created_by: global_options.global.created_by.clone(), column_index_truncate_length_opt: global_options.global.column_index_truncate_length.map(|length| { @@ -436,9 +433,6 @@ impl TableParquetOptionsProto { 
                        bloom_filter_ndv_opt: options.bloom_filter_ndv.map(|ndv| {
                            parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(ndv)
                        }),
-                        max_statistics_size_opt: options.max_statistics_size.map(|size| {
-                            parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u32)
-                        }),
                    })
                }
            }).collect(),
@@ -476,9 +470,6 @@ impl From<&ParquetOptionsProto> for ParquetOptions {
            statistics_enabled: proto.statistics_enabled_opt.as_ref().map(|opt| match opt {
                parquet_options::StatisticsEnabledOpt::StatisticsEnabled(statistics) => statistics.clone(),
            }),
-            max_statistics_size: proto.max_statistics_size_opt.as_ref().map(|opt| match opt {
-                parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size) => *size as usize,
-            }),
            max_row_group_size: proto.max_row_group_size as usize,
            created_by: proto.created_by.clone(),
            column_index_truncate_length: proto.column_index_truncate_length_opt.as_ref().map(|opt| match opt {
@@ -529,11 +520,6 @@ impl From<ParquetColumnOptionsProto> for ParquetColumnOptions {
            bloom_filter_ndv: proto
                .bloom_filter_ndv_opt
                .map(|parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(v)| v),
-            max_statistics_size: proto.max_statistics_size_opt.map(
-                |parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v)| {
-                    v as usize
-                },
-            ),
        }
    }
}

From 24fd642d7a5d5a57de09671fb3a7af97f09dc231 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 20 Dec 2024 21:43:29 -0500
Subject: [PATCH 6/7] Update docs

---
 datafusion/sqllogictest/test_files/information_schema.slt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
index 476a933c72b7..b25d78cc39ce 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -207,7 +207,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576
 datafusion.execution.parquet.enable_page_index true
 datafusion.execution.parquet.encoding NULL
 datafusion.execution.parquet.max_row_group_size 1048576
-datafusion.execution.parquet.max_statistics_size 4096
 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1
 datafusion.execution.parquet.metadata_size_hint NULL
@@ -300,7 +299,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b
 datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.
 datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting
 datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.
-datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting
 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan.
You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer From 794eb3814e739b1447d5f06b127bc9f55ace8858 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 20 Dec 2024 23:24:18 -0500 Subject: [PATCH 7/7] chore: remove max_statistics_size --- datafusion/sqllogictest/test_files/copy.slt | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index caa708483a11..498903927e7a 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -320,7 +320,6 @@ OPTIONS ( 'format.dictionary_enabled' false, 'format.statistics_enabled' page, 'format.statistics_enabled::col2' none, -'format.max_statistics_size' 123, 'format.bloom_filter_fpp' 0.001, 'format.bloom_filter_ndv' 100, 'format.metadata::key' 'value'