From 5ba634aa4f6d3d4ed5eefbc15dba5448f4f30923 Mon Sep 17 00:00:00 2001 From: Jax Liu Date: Fri, 12 Jul 2024 14:43:49 +0800 Subject: [PATCH 01/18] Implement ScalarFunction `MAKE_MAP` and `MAP` (#11361) * tmp * opt * modify test * add another version * implement make_map function * implement make_map function * implement map function * format and modify the doc * add benchmark for map function * add empty end-line * fix cargo check * update lock * upate lock * fix clippy * fmt and clippy * support FixedSizeList and LargeList * check type and handle null array in coerce_types * make array value throw todo error * fix clippy * simpify the error tests --- datafusion-cli/Cargo.lock | 1 + datafusion/functions/Cargo.toml | 7 +- datafusion/functions/benches/map.rs | 101 +++++++ datafusion/functions/src/core/map.rs | 312 +++++++++++++++++++++ datafusion/functions/src/core/mod.rs | 13 + datafusion/sqllogictest/test_files/map.slt | 112 ++++++++ 6 files changed, 545 insertions(+), 1 deletion(-) create mode 100644 datafusion/functions/benches/map.rs create mode 100644 datafusion/functions/src/core/map.rs diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 8af42cb43932..7da9cc427c37 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1278,6 +1278,7 @@ name = "datafusion-functions" version = "40.0.0" dependencies = [ "arrow", + "arrow-buffer", "base64 0.22.1", "blake2", "blake3", diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 884a66724c91..b143080b1962 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -66,6 +66,7 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } +arrow-buffer = { workspace = true } base64 = { version = "0.22", optional = true } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } @@ -86,7 +87,6 @@ uuid = { version = "1.7", features = ["v4"], optional = true } [dev-dependencies] arrow = { workspace = true, features = ["test_utils"] } -arrow-buffer = { workspace = true } criterion = "0.5" rand = { workspace = true } rstest = { workspace = true } @@ -141,3 +141,8 @@ required-features = ["string_expressions"] harness = false name = "upper" required-features = ["string_expressions"] + +[[bench]] +harness = false +name = "map" +required-features = ["core_expressions"] diff --git a/datafusion/functions/benches/map.rs b/datafusion/functions/benches/map.rs new file mode 100644 index 000000000000..cd863d0e3311 --- /dev/null +++ b/datafusion/functions/benches/map.rs @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +extern crate criterion; + +use arrow::array::{Int32Array, ListArray, StringArray}; +use arrow::datatypes::{DataType, Field}; +use arrow_buffer::{OffsetBuffer, ScalarBuffer}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_common::ScalarValue; +use datafusion_expr::ColumnarValue; +use datafusion_functions::core::{make_map, map}; +use rand::prelude::ThreadRng; +use rand::Rng; +use std::sync::Arc; + +fn keys(rng: &mut ThreadRng) -> Vec { + let mut keys = vec![]; + for _ in 0..1000 { + keys.push(rng.gen_range(0..9999).to_string()); + } + keys +} + +fn values(rng: &mut ThreadRng) -> Vec { + let mut values = vec![]; + for _ in 0..1000 { + values.push(rng.gen_range(0..9999)); + } + values +} + +fn criterion_benchmark(c: &mut Criterion) { + c.bench_function("make_map_1000", |b| { + let mut rng = rand::thread_rng(); + let keys = keys(&mut rng); + let values = values(&mut rng); + let mut buffer = Vec::new(); + for i in 0..1000 { + buffer.push(ColumnarValue::Scalar(ScalarValue::Utf8(Some( + keys[i].clone(), + )))); + buffer.push(ColumnarValue::Scalar(ScalarValue::Int32(Some(values[i])))); + } + + b.iter(|| { + black_box( + make_map() + .invoke(&buffer) + .expect("map should work on valid values"), + ); + }); + }); + + c.bench_function("map_1000", |b| { + let mut rng = rand::thread_rng(); + let field = Arc::new(Field::new("item", DataType::Utf8, true)); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 1000])); + let key_list = ListArray::new( + field, + offsets, + Arc::new(StringArray::from(keys(&mut rng))), + None, + ); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 1000])); + let value_list = ListArray::new( + field, + offsets, + Arc::new(Int32Array::from(values(&mut rng))), + None, + ); + let keys = ColumnarValue::Scalar(ScalarValue::List(Arc::new(key_list))); + let values = ColumnarValue::Scalar(ScalarValue::List(Arc::new(value_list))); + + b.iter(|| { + black_box( + map() + .invoke(&[keys.clone(), values.clone()]) + .expect("map should work on valid values"), + ); + }); + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/core/map.rs b/datafusion/functions/src/core/map.rs new file mode 100644 index 000000000000..8a8a19d7af52 --- /dev/null +++ b/datafusion/functions/src/core/map.rs @@ -0,0 +1,312 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use arrow::array::{Array, ArrayData, ArrayRef, MapArray, StructArray}; +use arrow::compute::concat; +use arrow::datatypes::{DataType, Field, SchemaBuilder}; +use arrow_buffer::{Buffer, ToByteSlice}; + +use datafusion_common::{exec_err, internal_err, ScalarValue}; +use datafusion_common::{not_impl_err, Result}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +fn make_map(args: &[ColumnarValue]) -> Result { + let (key, value): (Vec<_>, Vec<_>) = args + .chunks_exact(2) + .map(|chunk| { + if let ColumnarValue::Array(_) = chunk[0] { + return not_impl_err!("make_map does not support array keys"); + } + if let ColumnarValue::Array(_) = chunk[1] { + return not_impl_err!("make_map does not support array values"); + } + Ok((chunk[0].clone(), chunk[1].clone())) + }) + .collect::>>()? + .into_iter() + .unzip(); + + let keys = ColumnarValue::values_to_arrays(&key)?; + let values = ColumnarValue::values_to_arrays(&value)?; + + let keys: Vec<_> = keys.iter().map(|k| k.as_ref()).collect(); + let values: Vec<_> = values.iter().map(|v| v.as_ref()).collect(); + + let key = match concat(&keys) { + Ok(key) => key, + Err(e) => return internal_err!("Error concatenating keys: {}", e), + }; + let value = match concat(&values) { + Ok(value) => value, + Err(e) => return internal_err!("Error concatenating values: {}", e), + }; + make_map_batch_internal(key, value) +} + +fn make_map_batch(args: &[ColumnarValue]) -> Result { + if args.len() != 2 { + return exec_err!( + "make_map requires exactly 2 arguments, got {} instead", + args.len() + ); + } + let key = get_first_array_ref(&args[0])?; + let value = get_first_array_ref(&args[1])?; + make_map_batch_internal(key, value) +} + +fn get_first_array_ref(columnar_value: &ColumnarValue) -> Result { + match columnar_value { + ColumnarValue::Scalar(value) => match value { + ScalarValue::List(array) => Ok(array.value(0).clone()), + ScalarValue::LargeList(array) => Ok(array.value(0).clone()), + ScalarValue::FixedSizeList(array) => Ok(array.value(0).clone()), + _ => exec_err!("Expected array, got {:?}", value), + }, + ColumnarValue::Array(array) => exec_err!("Expected scalar, got {:?}", array), + } +} + +fn make_map_batch_internal(keys: ArrayRef, values: ArrayRef) -> Result { + if keys.null_count() > 0 { + return exec_err!("map key cannot be null"); + } + + if keys.len() != values.len() { + return exec_err!("map requires key and value lists to have the same length"); + } + + let key_field = Arc::new(Field::new("key", keys.data_type().clone(), false)); + let value_field = Arc::new(Field::new("value", values.data_type().clone(), true)); + let mut entry_struct_buffer: VecDeque<(Arc, ArrayRef)> = VecDeque::new(); + let mut entry_offsets_buffer = VecDeque::new(); + entry_offsets_buffer.push_back(0); + + entry_struct_buffer.push_back((Arc::clone(&key_field), Arc::clone(&keys))); + entry_struct_buffer.push_back((Arc::clone(&value_field), Arc::clone(&values))); + entry_offsets_buffer.push_back(keys.len() as u32); + + let entry_struct: Vec<(Arc, ArrayRef)> = entry_struct_buffer.into(); + let entry_struct = StructArray::from(entry_struct); + + let map_data_type = DataType::Map( + Arc::new(Field::new( + "entries", + entry_struct.data_type().clone(), + false, + )), + false, + ); + + let entry_offsets: Vec = entry_offsets_buffer.into(); + let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice()); + + let map_data = ArrayData::builder(map_data_type) + 
.len(entry_offsets.len() - 1) + .add_buffer(entry_offsets_buffer) + .add_child_data(entry_struct.to_data()) + .build()?; + + Ok(ColumnarValue::Array(Arc::new(MapArray::from(map_data)))) +} + +#[derive(Debug)] +pub struct MakeMap { + signature: Signature, +} + +impl Default for MakeMap { + fn default() -> Self { + Self::new() + } +} + +impl MakeMap { + pub fn new() -> Self { + Self { + signature: Signature::user_defined(Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for MakeMap { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "make_map" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + if arg_types.is_empty() { + return exec_err!( + "make_map requires at least one pair of arguments, got 0 instead" + ); + } + if arg_types.len() % 2 != 0 { + return exec_err!( + "make_map requires an even number of arguments, got {} instead", + arg_types.len() + ); + } + + let key_type = &arg_types[0]; + let mut value_type = &arg_types[1]; + + for (i, chunk) in arg_types.chunks_exact(2).enumerate() { + if chunk[0].is_null() { + return exec_err!("make_map key cannot be null at position {}", i); + } + if &chunk[0] != key_type { + return exec_err!( + "make_map requires all keys to have the same type {}, got {} instead at position {}", + key_type, + chunk[0], + i + ); + } + + if !chunk[1].is_null() { + if value_type.is_null() { + value_type = &chunk[1]; + } else if &chunk[1] != value_type { + return exec_err!( + "map requires all values to have the same type {}, got {} instead at position {}", + value_type, + &chunk[1], + i + ); + } + } + } + + let mut result = Vec::new(); + for _ in 0..arg_types.len() / 2 { + result.push(key_type.clone()); + result.push(value_type.clone()); + } + + Ok(result) + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + let key_type = &arg_types[0]; + let mut value_type = &arg_types[1]; + + for chunk in arg_types.chunks_exact(2) { + if !chunk[1].is_null() && value_type.is_null() { + value_type = &chunk[1]; + } + } + + let mut builder = SchemaBuilder::new(); + builder.push(Field::new("key", key_type.clone(), false)); + builder.push(Field::new("value", value_type.clone(), true)); + let fields = builder.finish().fields; + Ok(DataType::Map( + Arc::new(Field::new("entries", DataType::Struct(fields), false)), + false, + )) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + make_map(args) + } +} + +#[derive(Debug)] +pub struct MapFunc { + signature: Signature, +} + +impl Default for MapFunc { + fn default() -> Self { + Self::new() + } +} + +impl MapFunc { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for MapFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "map" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.len() % 2 != 0 { + return exec_err!( + "map requires an even number of arguments, got {} instead", + arg_types.len() + ); + } + let mut builder = SchemaBuilder::new(); + builder.push(Field::new( + "key", + get_element_type(&arg_types[0])?.clone(), + false, + )); + builder.push(Field::new( + "value", + get_element_type(&arg_types[1])?.clone(), + true, + )); + let fields = builder.finish().fields; + Ok(DataType::Map( + Arc::new(Field::new("entries", DataType::Struct(fields), false)), + false, + )) + } + + fn invoke(&self, args: 
&[ColumnarValue]) -> Result { + make_map_batch(args) + } +} + +fn get_element_type(data_type: &DataType) -> Result<&DataType> { + match data_type { + DataType::List(element) => Ok(element.data_type()), + DataType::LargeList(element) => Ok(element.data_type()), + DataType::FixedSizeList(element, _) => Ok(element.data_type()), + _ => exec_err!( + "Expected list, large_list or fixed_size_list, got {:?}", + data_type + ), + } +} diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs index 062a4a104d54..31bce04beec1 100644 --- a/datafusion/functions/src/core/mod.rs +++ b/datafusion/functions/src/core/mod.rs @@ -25,6 +25,7 @@ pub mod arrowtypeof; pub mod coalesce; pub mod expr_ext; pub mod getfield; +pub mod map; pub mod named_struct; pub mod nullif; pub mod nvl; @@ -42,6 +43,8 @@ make_udf_function!(r#struct::StructFunc, STRUCT, r#struct); make_udf_function!(named_struct::NamedStructFunc, NAMED_STRUCT, named_struct); make_udf_function!(getfield::GetFieldFunc, GET_FIELD, get_field); make_udf_function!(coalesce::CoalesceFunc, COALESCE, coalesce); +make_udf_function!(map::MakeMap, MAKE_MAP, make_map); +make_udf_function!(map::MapFunc, MAP, map); pub mod expr_fn { use datafusion_expr::{Expr, Literal}; @@ -78,6 +81,14 @@ pub mod expr_fn { coalesce, "Returns `coalesce(args...)`, which evaluates to the value of the first expr which is not NULL", args, + ),( + make_map, + "Returns a map created from the given keys and values pairs. This function isn't efficient for large maps. Use the `map` function instead.", + args, + ),( + map, + "Returns a map created from a key list and a value list", + args, )); #[doc = "Returns the value of the field with the given name from the struct"] @@ -96,5 +107,7 @@ pub fn functions() -> Vec> { named_struct(), get_field(), coalesce(), + make_map(), + map(), ] } diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt index 417947dc6c89..abf5b2ebbf98 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -100,3 +100,115 @@ physical_plan statement ok drop table table_with_map; + +query ? +SELECT MAKE_MAP('POST', 41, 'HEAD', 33, 'PATCH', 30, 'OPTION', 29, 'GET', 27, 'PUT', 25, 'DELETE', 24) AS method_count; +---- +{POST: 41, HEAD: 33, PATCH: 30, OPTION: 29, GET: 27, PUT: 25, DELETE: 24} + +query I +SELECT MAKE_MAP('POST', 41, 'HEAD', 33)['POST']; +---- +41 + +query ? +SELECT MAKE_MAP('POST', 41, 'HEAD', 33, 'PATCH', null); +---- +{POST: 41, HEAD: 33, PATCH: } + +query ? +SELECT MAKE_MAP('POST', null, 'HEAD', 33, 'PATCH', null); +---- +{POST: , HEAD: 33, PATCH: } + +query ? +SELECT MAKE_MAP(1, null, 2, 33, 3, null); +---- +{1: , 2: 33, 3: } + +query ? +SELECT MAKE_MAP([1,2], ['a', 'b'], [3,4], ['b']); +---- +{[1, 2]: [a, b], [3, 4]: [b]} + +query error +SELECT MAKE_MAP('POST', 41, 'HEAD', 'ab', 'PATCH', 30); + +query error +SELECT MAKE_MAP('POST', 41, 'HEAD', 33, null, 30); + +query error +SELECT MAKE_MAP('POST', 41, 123, 33,'PATCH', 30); + +query error +SELECT MAKE_MAP() + +query error +SELECT MAKE_MAP('POST', 41, 'HEAD'); + +query ? +SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33, 30]); +---- +{POST: 41, HEAD: 33, PATCH: 30} + +query ? +SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33, null]); +---- +{POST: 41, HEAD: 33, PATCH: } + +query ? 
+SELECT MAP([[1,2], [3,4]], ['a', 'b']); +---- +{[1, 2]: a, [3, 4]: b} + +query error +SELECT MAP() + +query error DataFusion error: Execution error: map requires an even number of arguments, got 1 instead +SELECT MAP(['POST', 'HEAD']) + +query error DataFusion error: Execution error: Expected list, large_list or fixed_size_list, got Null +SELECT MAP(null, [41, 33, 30]); + +query error DataFusion error: Execution error: map requires key and value lists to have the same length +SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33]); + +query error DataFusion error: Execution error: map key cannot be null +SELECT MAP(['POST', 'HEAD', null], [41, 33, 30]); + +query ? +SELECT MAP(make_array('POST', 'HEAD', 'PATCH'), make_array(41, 33, 30)); +---- +{POST: 41, HEAD: 33, PATCH: 30} + +query ? +SELECT MAP(arrow_cast(make_array('POST', 'HEAD', 'PATCH'), 'FixedSizeList(3, Utf8)'), arrow_cast(make_array(41, 33, 30), 'FixedSizeList(3, Int64)')); +---- +{POST: 41, HEAD: 33, PATCH: 30} + +query ? +SELECT MAP(arrow_cast(make_array('POST', 'HEAD', 'PATCH'), 'LargeList(Utf8)'), arrow_cast(make_array(41, 33, 30), 'LargeList(Int64)')); +---- +{POST: 41, HEAD: 33, PATCH: 30} + +statement ok +create table t as values +('a', 1, 'k1', 10, ['k1', 'k2'], [1, 2]), +('b', 2, 'k3', 30, ['k3'], [3]), +('d', 4, 'k5', 50, ['k5'], [5]); + +query error +SELECT make_map(column1, column2, column3, column4) FROM t; +# TODO: support array value +# ---- +# {a: 1, k1: 10} +# {b: 2, k3: 30} +# {d: 4, k5: 50} + +query error +SELECT map(column5, column6) FROM t; +# TODO: support array value +# ---- +# {k1:1, k2:2} +# {k3: 3} +# {k5: 5} From d542cbda8f17ba004de18bb107ecf1c8ec3266f6 Mon Sep 17 00:00:00 2001 From: Peter Toth Date: Fri, 12 Jul 2024 12:53:05 +0200 Subject: [PATCH 02/18] Improve `CommonSubexprEliminate` rule with surely and conditionally evaluated stats (#11357) * Improve `CommonSubexprEliminate` rule with surely and conditionally evaluated stats * remove expression tree hashing as no longer needed * address review comments * add negative tests --- datafusion/expr/src/expr.rs | 39 ++- .../optimizer/src/common_subexpr_eliminate.rs | 256 +++++++++++------- .../optimizer/src/optimize_projections/mod.rs | 10 +- datafusion/sqllogictest/test_files/cse.slt | 88 +++++- datafusion/sqllogictest/test_files/select.slt | 20 +- .../sqllogictest/test_files/tpch/q14.slt.part | 33 +-- 6 files changed, 298 insertions(+), 148 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index ecece6dbfce7..a344e621ddb1 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -17,7 +17,7 @@ //! Logical Expressions: [`Expr`] -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt::{self, Display, Formatter, Write}; use std::hash::{Hash, Hasher}; use std::mem; @@ -1380,7 +1380,7 @@ impl Expr { /// // refs contains "a" and "b" /// assert_eq!(refs.len(), 2); /// assert!(refs.contains(&Column::new_unqualified("a"))); - /// assert!(refs.contains(&Column::new_unqualified("b"))); + /// assert!(refs.contains(&Column::new_unqualified("b"))); /// ``` pub fn column_refs(&self) -> HashSet<&Column> { let mut using_columns = HashSet::new(); @@ -1401,6 +1401,41 @@ impl Expr { .expect("traversal is infallable"); } + /// Return all references to columns and their occurrence counts in the expression. 
+ /// + /// # Example + /// ``` + /// # use std::collections::HashMap; + /// # use datafusion_common::Column; + /// # use datafusion_expr::col; + /// // For an expression `a + (b * a)` + /// let expr = col("a") + (col("b") * col("a")); + /// let mut refs = expr.column_refs_counts(); + /// // refs contains "a" and "b" + /// assert_eq!(refs.len(), 2); + /// assert_eq!(*refs.get(&Column::new_unqualified("a")).unwrap(), 2); + /// assert_eq!(*refs.get(&Column::new_unqualified("b")).unwrap(), 1); + /// ``` + pub fn column_refs_counts(&self) -> HashMap<&Column, usize> { + let mut map = HashMap::new(); + self.add_column_ref_counts(&mut map); + map + } + + /// Adds references to all columns and their occurrence counts in the expression to + /// the map. + /// + /// See [`Self::column_refs_counts`] for details + pub fn add_column_ref_counts<'a>(&'a self, map: &mut HashMap<&'a Column, usize>) { + self.apply(|expr| { + if let Expr::Column(col) = expr { + *map.entry(col).or_default() += 1; + } + Ok(TreeNodeRecursion::Continue) + }) + .expect("traversal is infallable"); + } + /// Returns true if there are any column references in this Expr pub fn any_column_refs(&self) -> bool { self.exists(|expr| Ok(matches!(expr, Expr::Column(_)))) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 721987b917d4..e4b36652974d 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -33,12 +33,12 @@ use datafusion_common::tree_node::{ use datafusion_common::{ internal_datafusion_err, qualified_name, Column, DFSchema, DFSchemaRef, Result, }; -use datafusion_expr::expr::Alias; +use datafusion_expr::expr::{Alias, ScalarFunction}; use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::{ Aggregate, Filter, LogicalPlan, Projection, Sort, Window, }; -use datafusion_expr::{col, Expr, ExprSchemable}; +use datafusion_expr::{col, BinaryExpr, Case, Expr, ExprSchemable, Operator}; use indexmap::IndexMap; const CSE_PREFIX: &str = "__common_expr"; @@ -56,13 +56,9 @@ struct Identifier<'n> { } impl<'n> Identifier<'n> { - fn new(expr: &'n Expr, is_tree: bool, random_state: &RandomState) -> Self { + fn new(expr: &'n Expr, random_state: &RandomState) -> Self { let mut hasher = random_state.build_hasher(); - if is_tree { - expr.hash(&mut hasher); - } else { - expr.hash_node(&mut hasher); - } + expr.hash_node(&mut hasher); let hash = hasher.finish(); Self { hash, expr } } @@ -110,8 +106,9 @@ impl Hash for Identifier<'_> { /// ``` type IdArray<'n> = Vec<(usize, Option>)>; -/// A map that contains the number of occurrences of expressions by their identifiers. -type ExprStats<'n> = HashMap, usize>; +/// A map that contains the number of normal and conditional occurrences of expressions by +/// their identifiers. +type ExprStats<'n> = HashMap, (usize, usize)>; /// A map that contains the common expressions and their alias extracted during the /// second, rewriting traversal. @@ -200,6 +197,7 @@ impl CommonSubexprEliminate { expr_mask, random_state: &self.random_state, found_common: false, + conditional: false, }; expr.visit(&mut visitor)?; @@ -901,15 +899,17 @@ struct ExprIdentifierVisitor<'a, 'n> { random_state: &'a RandomState, // a flag to indicate that common expression found found_common: bool, + // if we are in a conditional branch. 
A conditional branch means that the expression + // might not be executed depending on the runtime values of other expressions, and + // thus can not be extracted as a common expression. + conditional: bool, } /// Record item that used when traversing an expression tree. enum VisitRecord<'n> { /// Marks the beginning of expression. It contains: /// - The post-order index assigned during the first, visiting traversal. - /// - A boolean flag if the record marks an expression subtree (not just a single - /// node). - EnterMark(usize, bool), + EnterMark(usize), /// Marks an accumulated subexpression tree. It contains: /// - The accumulated identifier of a subexpression. @@ -924,10 +924,6 @@ impl<'n> ExprIdentifierVisitor<'_, 'n> { /// Find the first `EnterMark` in the stack, and accumulates every `ExprItem` before /// it. Returns a tuple that contains: /// - The pre-order index of the expression we marked. - /// - A boolean flag if we marked an expression subtree (not just a single node). - /// If true we didn't recurse into the node's children, so we need to calculate the - /// hash of the marked expression tree (not just the node) and we need to validate - /// the expression tree (not just the node). /// - The accumulated identifier of the children of the marked expression. /// - An accumulated boolean flag from the children of the marked expression if all /// children are valid for subexpression elimination (i.e. it is safe to extract the @@ -937,14 +933,14 @@ impl<'n> ExprIdentifierVisitor<'_, 'n> { /// information up from children to parents via `visit_stack` during the first, /// visiting traversal and no need to test the expression's validity beforehand with /// an extra traversal). - fn pop_enter_mark(&mut self) -> (usize, bool, Option>, bool) { + fn pop_enter_mark(&mut self) -> (usize, Option>, bool) { let mut expr_id = None; let mut is_valid = true; while let Some(item) = self.visit_stack.pop() { match item { - VisitRecord::EnterMark(down_index, is_tree) => { - return (down_index, is_tree, expr_id, is_valid); + VisitRecord::EnterMark(down_index) => { + return (down_index, expr_id, is_valid); } VisitRecord::ExprItem(sub_expr_id, sub_expr_is_valid) => { expr_id = Some(sub_expr_id.combine(expr_id)); @@ -954,53 +950,112 @@ impl<'n> ExprIdentifierVisitor<'_, 'n> { } unreachable!("Enter mark should paired with node number"); } + + /// Save the current `conditional` status and run `f` with `conditional` set to true. + fn conditionally Result<()>>( + &mut self, + mut f: F, + ) -> Result<()> { + let conditional = self.conditional; + self.conditional = true; + f(self)?; + self.conditional = conditional; + + Ok(()) + } } impl<'n> TreeNodeVisitor<'n> for ExprIdentifierVisitor<'_, 'n> { type Node = Expr; fn f_down(&mut self, expr: &'n Expr) -> Result { - // If an expression can short circuit its children then don't consider its - // children for CSE (https://github.com/apache/arrow-datafusion/issues/8814). - // This means that we don't recurse into its children, but handle the expression - // as a subtree when we calculate its identifier. 
- // TODO: consider surely executed children of "short circuited"s for CSE - let is_tree = expr.short_circuits(); - let tnr = if is_tree { - TreeNodeRecursion::Jump - } else { - TreeNodeRecursion::Continue - }; - self.id_array.push((0, None)); self.visit_stack - .push(VisitRecord::EnterMark(self.down_index, is_tree)); + .push(VisitRecord::EnterMark(self.down_index)); self.down_index += 1; - Ok(tnr) + // If an expression can short-circuit then some of its children might not be + // executed so count the occurrence of subexpressions as conditional in all + // children. + Ok(match expr { + // If we are already in a conditionally evaluated subtree then continue + // traversal. + _ if self.conditional => TreeNodeRecursion::Continue, + + // In case of `ScalarFunction`s we don't know which children are surely + // executed so start visiting all children conditionally and stop the + // recursion with `TreeNodeRecursion::Jump`. + Expr::ScalarFunction(ScalarFunction { func, args }) + if func.short_circuits() => + { + self.conditionally(|visitor| { + args.iter().try_for_each(|e| e.visit(visitor).map(|_| ())) + })?; + + TreeNodeRecursion::Jump + } + + // In case of `And` and `Or` the first child is surely executed, but we + // account subexpressions as conditional in the second. + Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::And | Operator::Or, + right, + }) => { + left.visit(self)?; + self.conditionally(|visitor| right.visit(visitor).map(|_| ()))?; + + TreeNodeRecursion::Jump + } + + // In case of `Case` the optional base expression and the first when + // expressions are surely executed, but we account subexpressions as + // conditional in the others. + Expr::Case(Case { + expr, + when_then_expr, + else_expr, + }) => { + expr.iter().try_for_each(|e| e.visit(self).map(|_| ()))?; + when_then_expr.iter().take(1).try_for_each(|(when, then)| { + when.visit(self)?; + self.conditionally(|visitor| then.visit(visitor).map(|_| ())) + })?; + self.conditionally(|visitor| { + when_then_expr.iter().skip(1).try_for_each(|(when, then)| { + when.visit(visitor)?; + then.visit(visitor).map(|_| ()) + })?; + else_expr + .iter() + .try_for_each(|e| e.visit(visitor).map(|_| ())) + })?; + + TreeNodeRecursion::Jump + } + + // In case of non-short-circuit expressions continue the traversal. 
+ _ => TreeNodeRecursion::Continue, + }) } fn f_up(&mut self, expr: &'n Expr) -> Result { - let (down_index, is_tree, sub_expr_id, sub_expr_is_valid) = self.pop_enter_mark(); + let (down_index, sub_expr_id, sub_expr_is_valid) = self.pop_enter_mark(); - let (expr_id, is_valid) = if is_tree { - ( - Identifier::new(expr, true, self.random_state), - !expr.is_volatile()?, - ) - } else { - ( - Identifier::new(expr, false, self.random_state).combine(sub_expr_id), - !expr.is_volatile_node() && sub_expr_is_valid, - ) - }; + let expr_id = Identifier::new(expr, self.random_state).combine(sub_expr_id); + let is_valid = !expr.is_volatile_node() && sub_expr_is_valid; self.id_array[down_index].0 = self.up_index; if is_valid && !self.expr_mask.ignores(expr) { self.id_array[down_index].1 = Some(expr_id); - let count = self.expr_stats.entry(expr_id).or_insert(0); - *count += 1; - if *count > 1 { + let (count, conditional_count) = + self.expr_stats.entry(expr_id).or_insert((0, 0)); + if self.conditional { + *conditional_count += 1; + } else { + *count += 1; + } + if *count > 1 || (*count == 1 && *conditional_count > 0) { self.found_common = true; } } @@ -1039,51 +1094,40 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_, '_> { self.alias_counter += 1; } - // The `CommonSubexprRewriter` relies on `ExprIdentifierVisitor` to generate the - // `id_array`, which records the expr's identifier used to rewrite expr. So if we - // skip an expr in `ExprIdentifierVisitor`, we should skip it here, too. - let is_tree = expr.short_circuits(); - let tnr = if is_tree { - TreeNodeRecursion::Jump - } else { - TreeNodeRecursion::Continue - }; - let (up_index, expr_id) = self.id_array[self.down_index]; self.down_index += 1; - // skip `Expr`s without identifier (empty identifier). - let Some(expr_id) = expr_id else { - return Ok(Transformed::new(expr, false, tnr)); - }; - - let count = self.expr_stats.get(&expr_id).unwrap(); - if *count > 1 { - // step index to skip all sub-node (which has smaller series number). - while self.down_index < self.id_array.len() - && self.id_array[self.down_index].0 < up_index - { - self.down_index += 1; - } + // Handle `Expr`s with identifiers only + if let Some(expr_id) = expr_id { + let (count, conditional_count) = self.expr_stats.get(&expr_id).unwrap(); + if *count > 1 || *count == 1 && *conditional_count > 0 { + // step index to skip all sub-node (which has smaller series number). 
+ while self.down_index < self.id_array.len() + && self.id_array[self.down_index].0 < up_index + { + self.down_index += 1; + } - let expr_name = expr.display_name()?; - let (_, expr_alias) = self.common_exprs.entry(expr_id).or_insert_with(|| { - let expr_alias = self.alias_generator.next(CSE_PREFIX); - (expr, expr_alias) - }); + let expr_name = expr.display_name()?; + let (_, expr_alias) = + self.common_exprs.entry(expr_id).or_insert_with(|| { + let expr_alias = self.alias_generator.next(CSE_PREFIX); + (expr, expr_alias) + }); - // alias the expressions without an `Alias` ancestor node - let rewritten = if self.alias_counter > 0 { - col(expr_alias.clone()) - } else { - self.alias_counter += 1; - col(expr_alias.clone()).alias(expr_name) - }; + // alias the expressions without an `Alias` ancestor node + let rewritten = if self.alias_counter > 0 { + col(expr_alias.clone()) + } else { + self.alias_counter += 1; + col(expr_alias.clone()).alias(expr_name) + }; - Ok(Transformed::new(rewritten, true, TreeNodeRecursion::Jump)) - } else { - Ok(Transformed::new(expr, false, tnr)) + return Ok(Transformed::new(rewritten, true, TreeNodeRecursion::Jump)); + } } + + Ok(Transformed::no(expr)) } fn f_up(&mut self, expr: Expr) -> Result> { @@ -1685,7 +1729,7 @@ mod test { .unwrap(); let rule = CommonSubexprEliminate::new(); let optimized_plan = rule.rewrite(plan, &OptimizerContext::new()).unwrap(); - assert!(!optimized_plan.transformed); + assert!(optimized_plan.transformed); let optimized_plan = optimized_plan.data; let schema = optimized_plan.schema(); @@ -1837,22 +1881,29 @@ mod test { let table_scan = test_table_scan()?; let extracted_short_circuit = col("a").eq(lit(0)).or(col("b").eq(lit(0))); - let not_extracted_short_circuit_leg_1 = (col("a") + col("b")).eq(lit(0)); + let extracted_short_circuit_leg_1 = (col("a") + col("b")).eq(lit(0)); let not_extracted_short_circuit_leg_2 = (col("a") - col("b")).eq(lit(0)); + let extracted_short_circuit_leg_3 = (col("a") * col("b")).eq(lit(0)); let plan = LogicalPlanBuilder::from(table_scan.clone()) .project(vec![ extracted_short_circuit.clone().alias("c1"), extracted_short_circuit.alias("c2"), - not_extracted_short_circuit_leg_1.clone().alias("c3"), - not_extracted_short_circuit_leg_2.clone().alias("c4"), - not_extracted_short_circuit_leg_1 - .or(not_extracted_short_circuit_leg_2) + extracted_short_circuit_leg_1 + .clone() + .or(not_extracted_short_circuit_leg_2.clone()) + .alias("c3"), + extracted_short_circuit_leg_1 + .and(not_extracted_short_circuit_leg_2) + .alias("c4"), + extracted_short_circuit_leg_3 + .clone() + .or(extracted_short_circuit_leg_3.clone()) .alias("c5"), ])? 
.build()?; - let expected = "Projection: __common_expr_1 AS c1, __common_expr_1 AS c2, test.a + test.b = Int32(0) AS c3, test.a - test.b = Int32(0) AS c4, test.a + test.b = Int32(0) OR test.a - test.b = Int32(0) AS c5\ - \n Projection: test.a = Int32(0) OR test.b = Int32(0) AS __common_expr_1, test.a, test.b, test.c\ + let expected = "Projection: __common_expr_1 AS c1, __common_expr_1 AS c2, __common_expr_2 OR test.a - test.b = Int32(0) AS c3, __common_expr_2 AND test.a - test.b = Int32(0) AS c4, __common_expr_3 OR __common_expr_3 AS c5\ + \n Projection: test.a = Int32(0) OR test.b = Int32(0) AS __common_expr_1, test.a + test.b = Int32(0) AS __common_expr_2, test.a * test.b = Int32(0) AS __common_expr_3, test.a, test.b, test.c\ \n TableScan: test"; assert_optimized_plan_eq(expected, plan, None); @@ -1888,10 +1939,12 @@ mod test { let table_scan = test_table_scan()?; let rand = rand_func().call(vec![]); - let not_extracted_volatile_short_circuit_2 = - rand.clone().eq(lit(0)).or(col("b").eq(lit(0))); + let extracted_short_circuit_leg_1 = col("a").eq(lit(0)); let not_extracted_volatile_short_circuit_1 = - col("a").eq(lit(0)).or(rand.eq(lit(0))); + extracted_short_circuit_leg_1.or(rand.clone().eq(lit(0))); + let not_extracted_short_circuit_leg_2 = col("b").eq(lit(0)); + let not_extracted_volatile_short_circuit_2 = + rand.eq(lit(0)).or(not_extracted_short_circuit_leg_2); let plan = LogicalPlanBuilder::from(table_scan.clone()) .project(vec![ not_extracted_volatile_short_circuit_1.clone().alias("c1"), @@ -1901,10 +1954,11 @@ mod test { ])? .build()?; - let expected = "Projection: test.a = Int32(0) OR random() = Int32(0) AS c1, test.a = Int32(0) OR random() = Int32(0) AS c2, random() = Int32(0) OR test.b = Int32(0) AS c3, random() = Int32(0) OR test.b = Int32(0) AS c4\ - \n TableScan: test"; + let expected = "Projection: __common_expr_1 OR random() = Int32(0) AS c1, __common_expr_1 OR random() = Int32(0) AS c2, random() = Int32(0) OR test.b = Int32(0) AS c3, random() = Int32(0) OR test.b = Int32(0) AS c4\ + \n Projection: test.a = Int32(0) AS __common_expr_1, test.a, test.b, test.c\ + \n TableScan: test"; - assert_non_optimized_plan_eq(expected, plan, None); + assert_optimized_plan_eq(expected, plan, None); Ok(()) } diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index cae2a7b2cad2..58c1ae297b02 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -19,7 +19,7 @@ mod required_indices; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use crate::optimizer::ApplyOrder; @@ -42,7 +42,6 @@ use datafusion_common::tree_node::{ Transformed, TreeNode, TreeNodeIterator, TreeNodeRecursion, }; use datafusion_expr::logical_plan::tree_node::unwrap_arc; -use hashbrown::HashMap; /// Optimizer rule to prune unnecessary columns from intermediate schemas /// inside the [`LogicalPlan`]. This rule: @@ -472,11 +471,8 @@ fn merge_consecutive_projections(proj: Projection) -> Result::new(); - for columns in expr.iter().map(|expr| expr.column_refs()) { - for col in columns.into_iter() { - *column_referral_map.entry(col).or_default() += 1; - } - } + expr.iter() + .for_each(|expr| expr.add_column_ref_counts(&mut column_referral_map)); // If an expression is non-trivial and appears more than once, do not merge // them as consecutive projections will benefit from a compute-once approach. 
diff --git a/datafusion/sqllogictest/test_files/cse.slt b/datafusion/sqllogictest/test_files/cse.slt index 3579c1c1635c..19b47fa50e41 100644 --- a/datafusion/sqllogictest/test_files/cse.slt +++ b/datafusion/sqllogictest/test_files/cse.slt @@ -93,15 +93,16 @@ FROM t1 ---- logical_plan 01)Projection: __common_expr_1 AS c1, __common_expr_1 AS c2, __common_expr_2 AS c3, __common_expr_2 AS c4, __common_expr_3 AS c5, __common_expr_3 AS c6 -02)--Projection: t1.a = Float64(0) AND t1.b = Float64(0) AS __common_expr_1, t1.a = Float64(0) OR t1.b = Float64(0) AS __common_expr_2, CASE WHEN t1.a = Float64(0) THEN Int64(0) ELSE Int64(1) END AS __common_expr_3 -03)----TableScan: t1 projection=[a, b] +02)--Projection: __common_expr_4 AND t1.b = Float64(0) AS __common_expr_1, __common_expr_4 OR t1.b = Float64(0) AS __common_expr_2, CASE WHEN __common_expr_4 THEN Int64(0) ELSE Int64(1) END AS __common_expr_3 +03)----Projection: t1.a = Float64(0) AS __common_expr_4, t1.b +04)------TableScan: t1 projection=[a, b] physical_plan 01)ProjectionExec: expr=[__common_expr_1@0 as c1, __common_expr_1@0 as c2, __common_expr_2@1 as c3, __common_expr_2@1 as c4, __common_expr_3@2 as c5, __common_expr_3@2 as c6] -02)--ProjectionExec: expr=[a@0 = 0 AND b@1 = 0 as __common_expr_1, a@0 = 0 OR b@1 = 0 as __common_expr_2, CASE WHEN a@0 = 0 THEN 0 ELSE 1 END as __common_expr_3] -03)----MemoryExec: partitions=1, partition_sizes=[0] +02)--ProjectionExec: expr=[__common_expr_4@0 AND b@1 = 0 as __common_expr_1, __common_expr_4@0 OR b@1 = 0 as __common_expr_2, CASE WHEN __common_expr_4@0 THEN 0 ELSE 1 END as __common_expr_3] +03)----ProjectionExec: expr=[a@0 = 0 as __common_expr_4, b@1 as b] +04)------MemoryExec: partitions=1, partition_sizes=[0] # Common children of short-circuit expression -# TODO: consider surely executed children of "short circuited"s for CSE. i.e. 
`a = 0`, `a = 2`, `a = 4` should be extracted query TT EXPLAIN SELECT a = 0 AND b = 0 AS c1, @@ -121,14 +122,15 @@ EXPLAIN SELECT FROM t1 ---- logical_plan -01)Projection: t1.a = Float64(0) AND t1.b = Float64(0) AS c1, t1.a = Float64(0) AND t1.b = Float64(1) AS c2, t1.b = Float64(2) AND t1.a = Float64(1) AS c3, t1.b = Float64(3) AND t1.a = Float64(1) AS c4, t1.a = Float64(2) OR t1.b = Float64(4) AS c5, t1.a = Float64(2) OR t1.b = Float64(5) AS c6, t1.b = Float64(6) OR t1.a = Float64(3) AS c7, t1.b = Float64(7) OR t1.a = Float64(3) AS c8, CASE WHEN t1.a = Float64(4) THEN Int64(0) ELSE Int64(1) END AS c9, CASE WHEN t1.a = Float64(4) THEN Int64(0) ELSE Int64(2) END AS c10, CASE WHEN t1.b = Float64(8) THEN t1.a + Float64(1) ELSE Float64(0) END AS c11, CASE WHEN t1.b = Float64(9) THEN t1.a + Float64(1) ELSE Float64(0) END AS c12, CASE WHEN t1.b = Float64(10) THEN Float64(0) ELSE t1.a + Float64(2) END AS c13, CASE WHEN t1.b = Float64(11) THEN Float64(0) ELSE t1.a + Float64(2) END AS c14 -02)--TableScan: t1 projection=[a, b] +01)Projection: __common_expr_1 AND t1.b = Float64(0) AS c1, __common_expr_1 AND t1.b = Float64(1) AS c2, t1.b = Float64(2) AND t1.a = Float64(1) AS c3, t1.b = Float64(3) AND t1.a = Float64(1) AS c4, __common_expr_2 OR t1.b = Float64(4) AS c5, __common_expr_2 OR t1.b = Float64(5) AS c6, t1.b = Float64(6) OR t1.a = Float64(3) AS c7, t1.b = Float64(7) OR t1.a = Float64(3) AS c8, CASE WHEN __common_expr_3 THEN Int64(0) ELSE Int64(1) END AS c9, CASE WHEN __common_expr_3 THEN Int64(0) ELSE Int64(2) END AS c10, CASE WHEN t1.b = Float64(8) THEN t1.a + Float64(1) ELSE Float64(0) END AS c11, CASE WHEN t1.b = Float64(9) THEN t1.a + Float64(1) ELSE Float64(0) END AS c12, CASE WHEN t1.b = Float64(10) THEN Float64(0) ELSE t1.a + Float64(2) END AS c13, CASE WHEN t1.b = Float64(11) THEN Float64(0) ELSE t1.a + Float64(2) END AS c14 +02)--Projection: t1.a = Float64(0) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a = Float64(4) AS __common_expr_3, t1.a, t1.b +03)----TableScan: t1 projection=[a, b] physical_plan -01)ProjectionExec: expr=[a@0 = 0 AND b@1 = 0 as c1, a@0 = 0 AND b@1 = 1 as c2, b@1 = 2 AND a@0 = 1 as c3, b@1 = 3 AND a@0 = 1 as c4, a@0 = 2 OR b@1 = 4 as c5, a@0 = 2 OR b@1 = 5 as c6, b@1 = 6 OR a@0 = 3 as c7, b@1 = 7 OR a@0 = 3 as c8, CASE WHEN a@0 = 4 THEN 0 ELSE 1 END as c9, CASE WHEN a@0 = 4 THEN 0 ELSE 2 END as c10, CASE WHEN b@1 = 8 THEN a@0 + 1 ELSE 0 END as c11, CASE WHEN b@1 = 9 THEN a@0 + 1 ELSE 0 END as c12, CASE WHEN b@1 = 10 THEN 0 ELSE a@0 + 2 END as c13, CASE WHEN b@1 = 11 THEN 0 ELSE a@0 + 2 END as c14] -02)--MemoryExec: partitions=1, partition_sizes=[0] +01)ProjectionExec: expr=[__common_expr_1@0 AND b@4 = 0 as c1, __common_expr_1@0 AND b@4 = 1 as c2, b@4 = 2 AND a@3 = 1 as c3, b@4 = 3 AND a@3 = 1 as c4, __common_expr_2@1 OR b@4 = 4 as c5, __common_expr_2@1 OR b@4 = 5 as c6, b@4 = 6 OR a@3 = 3 as c7, b@4 = 7 OR a@3 = 3 as c8, CASE WHEN __common_expr_3@2 THEN 0 ELSE 1 END as c9, CASE WHEN __common_expr_3@2 THEN 0 ELSE 2 END as c10, CASE WHEN b@4 = 8 THEN a@3 + 1 ELSE 0 END as c11, CASE WHEN b@4 = 9 THEN a@3 + 1 ELSE 0 END as c12, CASE WHEN b@4 = 10 THEN 0 ELSE a@3 + 2 END as c13, CASE WHEN b@4 = 11 THEN 0 ELSE a@3 + 2 END as c14] +02)--ProjectionExec: expr=[a@0 = 0 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 = 4 as __common_expr_3, a@0 as a, b@1 as b] +03)----MemoryExec: partitions=1, partition_sizes=[0] # Common children of volatile, short-circuit expression -# TODO: consider surely executed children of "short circuited"s for CSE. i.e. 
`a = 0`, `a = 2`, `a = 4` should be extracted query TT EXPLAIN SELECT a = 0 AND b = random() AS c1, @@ -148,11 +150,13 @@ EXPLAIN SELECT FROM t1 ---- logical_plan -01)Projection: t1.a = Float64(0) AND t1.b = random() AS c1, t1.a = Float64(0) AND t1.b = Float64(1) + random() AS c2, t1.b = Float64(2) + random() AND t1.a = Float64(1) AS c3, t1.b = Float64(3) + random() AND t1.a = Float64(1) AS c4, t1.a = Float64(2) OR t1.b = Float64(4) + random() AS c5, t1.a = Float64(2) OR t1.b = Float64(5) + random() AS c6, t1.b = Float64(6) + random() OR t1.a = Float64(3) AS c7, t1.b = Float64(7) + random() OR t1.a = Float64(3) AS c8, CASE WHEN t1.a = Float64(4) THEN random() ELSE Float64(1) END AS c9, CASE WHEN t1.a = Float64(4) THEN random() ELSE Float64(2) END AS c10, CASE WHEN t1.b = Float64(8) + random() THEN t1.a + Float64(1) ELSE Float64(0) END AS c11, CASE WHEN t1.b = Float64(9) + random() THEN t1.a + Float64(1) ELSE Float64(0) END AS c12, CASE WHEN t1.b = Float64(10) + random() THEN Float64(0) ELSE t1.a + Float64(2) END AS c13, CASE WHEN t1.b = Float64(11) + random() THEN Float64(0) ELSE t1.a + Float64(2) END AS c14 -02)--TableScan: t1 projection=[a, b] +01)Projection: __common_expr_1 AND t1.b = random() AS c1, __common_expr_1 AND t1.b = Float64(1) + random() AS c2, t1.b = Float64(2) + random() AND t1.a = Float64(1) AS c3, t1.b = Float64(3) + random() AND t1.a = Float64(1) AS c4, __common_expr_2 OR t1.b = Float64(4) + random() AS c5, __common_expr_2 OR t1.b = Float64(5) + random() AS c6, t1.b = Float64(6) + random() OR t1.a = Float64(3) AS c7, t1.b = Float64(7) + random() OR t1.a = Float64(3) AS c8, CASE WHEN __common_expr_3 THEN random() ELSE Float64(1) END AS c9, CASE WHEN __common_expr_3 THEN random() ELSE Float64(2) END AS c10, CASE WHEN t1.b = Float64(8) + random() THEN t1.a + Float64(1) ELSE Float64(0) END AS c11, CASE WHEN t1.b = Float64(9) + random() THEN t1.a + Float64(1) ELSE Float64(0) END AS c12, CASE WHEN t1.b = Float64(10) + random() THEN Float64(0) ELSE t1.a + Float64(2) END AS c13, CASE WHEN t1.b = Float64(11) + random() THEN Float64(0) ELSE t1.a + Float64(2) END AS c14 +02)--Projection: t1.a = Float64(0) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a = Float64(4) AS __common_expr_3, t1.a, t1.b +03)----TableScan: t1 projection=[a, b] physical_plan -01)ProjectionExec: expr=[a@0 = 0 AND b@1 = random() as c1, a@0 = 0 AND b@1 = 1 + random() as c2, b@1 = 2 + random() AND a@0 = 1 as c3, b@1 = 3 + random() AND a@0 = 1 as c4, a@0 = 2 OR b@1 = 4 + random() as c5, a@0 = 2 OR b@1 = 5 + random() as c6, b@1 = 6 + random() OR a@0 = 3 as c7, b@1 = 7 + random() OR a@0 = 3 as c8, CASE WHEN a@0 = 4 THEN random() ELSE 1 END as c9, CASE WHEN a@0 = 4 THEN random() ELSE 2 END as c10, CASE WHEN b@1 = 8 + random() THEN a@0 + 1 ELSE 0 END as c11, CASE WHEN b@1 = 9 + random() THEN a@0 + 1 ELSE 0 END as c12, CASE WHEN b@1 = 10 + random() THEN 0 ELSE a@0 + 2 END as c13, CASE WHEN b@1 = 11 + random() THEN 0 ELSE a@0 + 2 END as c14] -02)--MemoryExec: partitions=1, partition_sizes=[0] +01)ProjectionExec: expr=[__common_expr_1@0 AND b@4 = random() as c1, __common_expr_1@0 AND b@4 = 1 + random() as c2, b@4 = 2 + random() AND a@3 = 1 as c3, b@4 = 3 + random() AND a@3 = 1 as c4, __common_expr_2@1 OR b@4 = 4 + random() as c5, __common_expr_2@1 OR b@4 = 5 + random() as c6, b@4 = 6 + random() OR a@3 = 3 as c7, b@4 = 7 + random() OR a@3 = 3 as c8, CASE WHEN __common_expr_3@2 THEN random() ELSE 1 END as c9, CASE WHEN __common_expr_3@2 THEN random() ELSE 2 END as c10, CASE WHEN b@4 = 8 + random() THEN a@3 
+ 1 ELSE 0 END as c11, CASE WHEN b@4 = 9 + random() THEN a@3 + 1 ELSE 0 END as c12, CASE WHEN b@4 = 10 + random() THEN 0 ELSE a@3 + 2 END as c13, CASE WHEN b@4 = 11 + random() THEN 0 ELSE a@3 + 2 END as c14] +02)--ProjectionExec: expr=[a@0 = 0 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 = 4 as __common_expr_3, a@0 as a, b@1 as b] +03)----MemoryExec: partitions=1, partition_sizes=[0] # Common volatile children of short-circuit expression query TT @@ -171,3 +175,59 @@ logical_plan physical_plan 01)ProjectionExec: expr=[a@0 = random() AND b@1 = 0 as c1, a@0 = random() AND b@1 = 1 as c2, a@0 = 2 + random() OR b@1 = 4 as c3, a@0 = 2 + random() OR b@1 = 5 as c4, CASE WHEN a@0 = 4 + random() THEN 0 ELSE 1 END as c5, CASE WHEN a@0 = 4 + random() THEN 0 ELSE 2 END as c6] 02)--MemoryExec: partitions=1, partition_sizes=[0] + +# Surely only once but also conditionally evaluated expressions +query TT +EXPLAIN SELECT + (a = 1 OR random() = 0) AND a = 1 AS c1, + (a = 2 AND random() = 0) OR a = 2 AS c2, + CASE WHEN a + 3 = 0 THEN a + 3 ELSE 0 END AS c3, + CASE WHEN a + 4 = 0 THEN 0 WHEN a + 4 THEN 0 ELSE 0 END AS c4, + CASE WHEN a + 5 = 0 THEN 0 WHEN random() = 0 THEN a + 5 ELSE 0 END AS c5, + CASE WHEN a + 6 = 0 THEN 0 ELSE a + 6 END AS c6 +FROM t1 +---- +logical_plan +01)Projection: (__common_expr_1 OR random() = Float64(0)) AND __common_expr_1 AS c1, __common_expr_2 AND random() = Float64(0) OR __common_expr_2 AS c2, CASE WHEN __common_expr_3 = Float64(0) THEN __common_expr_3 ELSE Float64(0) END AS c3, CASE WHEN __common_expr_4 = Float64(0) THEN Int64(0) WHEN CAST(__common_expr_4 AS Boolean) THEN Int64(0) ELSE Int64(0) END AS c4, CASE WHEN __common_expr_5 = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN __common_expr_5 ELSE Float64(0) END AS c5, CASE WHEN __common_expr_6 = Float64(0) THEN Float64(0) ELSE __common_expr_6 END AS c6 +02)--Projection: t1.a = Float64(1) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a + Float64(3) AS __common_expr_3, t1.a + Float64(4) AS __common_expr_4, t1.a + Float64(5) AS __common_expr_5, t1.a + Float64(6) AS __common_expr_6 +03)----TableScan: t1 projection=[a] +physical_plan +01)ProjectionExec: expr=[(__common_expr_1@0 OR random() = 0) AND __common_expr_1@0 as c1, __common_expr_2@1 AND random() = 0 OR __common_expr_2@1 as c2, CASE WHEN __common_expr_3@2 = 0 THEN __common_expr_3@2 ELSE 0 END as c3, CASE WHEN __common_expr_4@3 = 0 THEN 0 WHEN CAST(__common_expr_4@3 AS Boolean) THEN 0 ELSE 0 END as c4, CASE WHEN __common_expr_5@4 = 0 THEN 0 WHEN random() = 0 THEN __common_expr_5@4 ELSE 0 END as c5, CASE WHEN __common_expr_6@5 = 0 THEN 0 ELSE __common_expr_6@5 END as c6] +02)--ProjectionExec: expr=[a@0 = 1 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 + 3 as __common_expr_3, a@0 + 4 as __common_expr_4, a@0 + 5 as __common_expr_5, a@0 + 6 as __common_expr_6] +03)----MemoryExec: partitions=1, partition_sizes=[0] + +# Surely only once but also conditionally evaluated subexpressions +query TT +EXPLAIN SELECT + (a = 1 OR random() = 0) AND (a = 1 OR random() = 1) AS c1, + (a = 2 AND random() = 0) OR (a = 2 AND random() = 1) AS c2, + CASE WHEN a + 3 = 0 THEN a + 3 + random() ELSE 0 END AS c3, + CASE WHEN a + 4 = 0 THEN 0 ELSE a + 4 + random() END AS c4 +FROM t1 +---- +logical_plan +01)Projection: (__common_expr_1 OR random() = Float64(0)) AND (__common_expr_1 OR random() = Float64(1)) AS c1, __common_expr_2 AND random() = Float64(0) OR __common_expr_2 AND random() = Float64(1) AS c2, CASE WHEN __common_expr_3 = Float64(0) THEN 
__common_expr_3 + random() ELSE Float64(0) END AS c3, CASE WHEN __common_expr_4 = Float64(0) THEN Float64(0) ELSE __common_expr_4 + random() END AS c4 +02)--Projection: t1.a = Float64(1) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a + Float64(3) AS __common_expr_3, t1.a + Float64(4) AS __common_expr_4 +03)----TableScan: t1 projection=[a] +physical_plan +01)ProjectionExec: expr=[(__common_expr_1@0 OR random() = 0) AND (__common_expr_1@0 OR random() = 1) as c1, __common_expr_2@1 AND random() = 0 OR __common_expr_2@1 AND random() = 1 as c2, CASE WHEN __common_expr_3@2 = 0 THEN __common_expr_3@2 + random() ELSE 0 END as c3, CASE WHEN __common_expr_4@3 = 0 THEN 0 ELSE __common_expr_4@3 + random() END as c4] +02)--ProjectionExec: expr=[a@0 = 1 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 + 3 as __common_expr_3, a@0 + 4 as __common_expr_4] +03)----MemoryExec: partitions=1, partition_sizes=[0] + +# Only conditionally evaluated expressions +query TT +EXPLAIN SELECT + (random() = 0 OR a = 1) AND a = 1 AS c1, + (random() = 0 AND a = 2) OR a = 2 AS c2, + CASE WHEN random() = 0 THEN a + 3 ELSE a + 3 END AS c3, + CASE WHEN random() = 0 THEN 0 WHEN a + 4 = 0 THEN a + 4 ELSE 0 END AS c4, + CASE WHEN random() = 0 THEN 0 WHEN a + 5 = 0 THEN 0 ELSE a + 5 END AS c5, + CASE WHEN random() = 0 THEN 0 WHEN random() = 0 THEN a + 6 ELSE a + 6 END AS c6 +FROM t1 +---- +logical_plan +01)Projection: (random() = Float64(0) OR t1.a = Float64(1)) AND t1.a = Float64(1) AS c1, random() = Float64(0) AND t1.a = Float64(2) OR t1.a = Float64(2) AS c2, CASE WHEN random() = Float64(0) THEN t1.a + Float64(3) ELSE t1.a + Float64(3) END AS c3, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(4) = Float64(0) THEN t1.a + Float64(4) ELSE Float64(0) END AS c4, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(5) = Float64(0) THEN Float64(0) ELSE t1.a + Float64(5) END AS c5, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN t1.a + Float64(6) ELSE t1.a + Float64(6) END AS c6 +02)--TableScan: t1 projection=[a] +physical_plan +01)ProjectionExec: expr=[(random() = 0 OR a@0 = 1) AND a@0 = 1 as c1, random() = 0 AND a@0 = 2 OR a@0 = 2 as c2, CASE WHEN random() = 0 THEN a@0 + 3 ELSE a@0 + 3 END as c3, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 4 = 0 THEN a@0 + 4 ELSE 0 END as c4, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 5 = 0 THEN 0 ELSE a@0 + 5 END as c5, CASE WHEN random() = 0 THEN 0 WHEN random() = 0 THEN a@0 + 6 ELSE a@0 + 6 END as c6] +02)--MemoryExec: partitions=1, partition_sizes=[0] diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index f9baf8db69d5..95f67245a981 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1504,21 +1504,25 @@ query TT EXPLAIN SELECT y > 0 and 1 / y < 1, x > 0 and y > 0 and 1 / y < 1 / x from t; ---- logical_plan -01)Projection: t.y > Int32(0) AND Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y > Int64(0) AND Int64(1) / t.y < Int64(1), t.x > Int32(0) AND t.y > Int32(0) AND Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x -02)--TableScan: t projection=[x, y] +01)Projection: __common_expr_1 AND Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y > Int64(0) AND Int64(1) / t.y < Int64(1), t.x > Int32(0) AND __common_expr_1 AND Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x > Int64(0) AND t.y > Int64(0) AND 
Int64(1) / t.y < Int64(1) / t.x +02)--Projection: t.y > Int32(0) AS __common_expr_1, t.x, t.y +03)----TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[y@1 > 0 AND 1 / CAST(y@1 AS Int64) < 1 as t.y > Int64(0) AND Int64(1) / t.y < Int64(1), x@0 > 0 AND y@1 > 0 AND 1 / CAST(y@1 AS Int64) < 1 / CAST(x@0 AS Int64) as t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x] -02)--MemoryExec: partitions=1, partition_sizes=[1] +01)ProjectionExec: expr=[__common_expr_1@0 AND 1 / CAST(y@2 AS Int64) < 1 as t.y > Int64(0) AND Int64(1) / t.y < Int64(1), x@1 > 0 AND __common_expr_1@0 AND 1 / CAST(y@2 AS Int64) < 1 / CAST(x@1 AS Int64) as t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x] +02)--ProjectionExec: expr=[y@1 > 0 as __common_expr_1, x@0 as x, y@1 as y] +03)----MemoryExec: partitions=1, partition_sizes=[1] query TT EXPLAIN SELECT y = 0 or 1 / y < 1, x = 0 or y = 0 or 1 / y < 1 / x from t; ---- logical_plan -01)Projection: t.y = Int32(0) OR Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y = Int64(0) OR Int64(1) / t.y < Int64(1), t.x = Int32(0) OR t.y = Int32(0) OR Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x -02)--TableScan: t projection=[x, y] +01)Projection: __common_expr_1 OR Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y = Int64(0) OR Int64(1) / t.y < Int64(1), t.x = Int32(0) OR __common_expr_1 OR Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x +02)--Projection: t.y = Int32(0) AS __common_expr_1, t.x, t.y +03)----TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[y@1 = 0 OR 1 / CAST(y@1 AS Int64) < 1 as t.y = Int64(0) OR Int64(1) / t.y < Int64(1), x@0 = 0 OR y@1 = 0 OR 1 / CAST(y@1 AS Int64) < 1 / CAST(x@0 AS Int64) as t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x] -02)--MemoryExec: partitions=1, partition_sizes=[1] +01)ProjectionExec: expr=[__common_expr_1@0 OR 1 / CAST(y@2 AS Int64) < 1 as t.y = Int64(0) OR Int64(1) / t.y < Int64(1), x@1 = 0 OR __common_expr_1@0 OR 1 / CAST(y@2 AS Int64) < 1 / CAST(x@1 AS Int64) as t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x] +02)--ProjectionExec: expr=[y@1 = 0 as __common_expr_1, x@0 as x, y@1 as y] +03)----MemoryExec: partitions=1, partition_sizes=[1] # due to the reason describe in https://github.com/apache/datafusion/issues/8927, # the following queries will fail diff --git a/datafusion/sqllogictest/test_files/tpch/q14.slt.part b/datafusion/sqllogictest/test_files/tpch/q14.slt.part index e56e463a617d..3743c201ff2e 100644 --- a/datafusion/sqllogictest/test_files/tpch/q14.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q14.slt.part @@ -32,9 +32,9 @@ where and l_shipdate < date '1995-10-01'; ---- logical_plan -01)Projection: Float64(100) * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END) AS Float64) / CAST(sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS Float64) AS promo_revenue -02)--Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) ELSE Decimal128(Some(0),38,4) END) AS sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * 
(Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] -03)----Projection: lineitem.l_extendedprice, lineitem.l_discount, part.p_type +01)Projection: Float64(100) * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END) AS Float64) / CAST(sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS Float64) AS promo_revenue +02)--Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN __common_expr_1 ELSE Decimal128(Some(0),38,4) END) AS sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(__common_expr_1) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] +03)----Projection: lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS __common_expr_1, part.p_type 04)------Inner Join: lineitem.l_partkey = part.p_partkey 05)--------Projection: lineitem.l_partkey, lineitem.l_extendedprice, lineitem.l_discount 06)----------Filter: lineitem.l_shipdate >= Date32("1995-09-01") AND lineitem.l_shipdate < Date32("1995-10-01") @@ -44,19 +44,20 @@ physical_plan 01)ProjectionExec: expr=[100 * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END)@0 AS Float64) / CAST(sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 AS Float64) as promo_revenue] 02)--AggregateExec: mode=Final, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] 03)----CoalescePartitionsExec -04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_extendedprice@1, l_discount@2, p_type@4] -07)------------CoalesceBatchesExec: target_batch_size=8192 -08)--------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 -09)----------------ProjectionExec: expr=[l_partkey@0 as l_partkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] -10)------------------CoalesceBatchesExec: target_batch_size=8192 -11)--------------------FilterExec: l_shipdate@3 >= 1995-09-01 AND l_shipdate@3 < 1995-10-01 -12)----------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], has_header=false -13)------------CoalesceBatchesExec: target_batch_size=8192 -14)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 -15)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -16)------------------CsvExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_type], has_header=false +04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +05)--------ProjectionExec: expr=[l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as __common_expr_1, p_type@2 as p_type] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_extendedprice@1, l_discount@2, p_type@4] +08)--------------CoalesceBatchesExec: target_batch_size=8192 +09)----------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 +10)------------------ProjectionExec: expr=[l_partkey@0 as l_partkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] +11)--------------------CoalesceBatchesExec: target_batch_size=8192 +12)----------------------FilterExec: l_shipdate@3 >= 1995-09-01 AND l_shipdate@3 < 1995-10-01 +13)------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], has_header=false +14)--------------CoalesceBatchesExec: target_batch_size=8192 +15)----------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 +16)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +17)--------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_type], has_header=false From 1dfac86a89750193491cf3e04917e37b92c64ffa Mon Sep 17 00:00:00 2001 From: wiedld Date: Fri, 12 Jul 2024 04:04:42 -0700 Subject: [PATCH 03/18] fix(11397): surface proper errors in ParquetSink (#11399) * fix(11397): do not surface errors for closed channels, and instead let the task join errors be surfaced * fix(11397): terminate early on channel send failure --- .../src/datasource/file_format/parquet.rs | 32 +++++++++---------- datafusion/core/tests/memory_limit/mod.rs | 4 +-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 694c94928537..6271d8af3786 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -893,12 +893,12 @@ async fn send_arrays_to_col_writers( let mut next_channel = 0; for (array, field) in rb.columns().iter().zip(schema.fields()) { for c in compute_leaves(field, array)? { - col_array_channels[next_channel] - .send(c) - .await - .map_err(|_| { - DataFusionError::Internal("Unable to send array to writer!".into()) - })?; + // Do not surface error from closed channel (means something + // else hit an error, and the plan is shutting down). 
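+            // (The error that closed the channel is surfaced when the failed
+            // task is joined, so returning Ok here does not swallow it.)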
+            if col_array_channels[next_channel].send(c).await.is_err() {
+                return Ok(());
+            }
+
             next_channel += 1;
         }
     }
@@ -984,11 +984,11 @@ fn spawn_parquet_parallel_serialization_task(
                         &pool,
                     );
 
-                    serialize_tx.send(finalize_rg_task).await.map_err(|_| {
-                        DataFusionError::Internal(
-                            "Unable to send closed RG to concat task!".into(),
-                        )
-                    })?;
+                    // Do not surface error from closed channel (means something
+                    // else hit an error, and the plan is shutting down).
+                    if serialize_tx.send(finalize_rg_task).await.is_err() {
+                        return Ok(());
+                    }
 
                     current_rg_rows = 0;
                     rb = rb.slice(rows_left, rb.num_rows() - rows_left);
@@ -1013,11 +1013,11 @@ fn spawn_parquet_parallel_serialization_task(
                 &pool,
             );
 
-            serialize_tx.send(finalize_rg_task).await.map_err(|_| {
-                DataFusionError::Internal(
-                    "Unable to send closed RG to concat task!".into(),
-                )
-            })?;
+            // Do not surface error from closed channel (means something
+            // else hit an error, and the plan is shutting down).
+            if serialize_tx.send(finalize_rg_task).await.is_err() {
+                return Ok(());
+            }
         }
 
         Ok(())
diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs
index f7402357d1c7..7ef24609e238 100644
--- a/datafusion/core/tests/memory_limit/mod.rs
+++ b/datafusion/core/tests/memory_limit/mod.rs
@@ -340,8 +340,8 @@ async fn oom_parquet_sink() {
             path.to_string_lossy()
         ))
         .with_expected_errors(vec![
-            // TODO: update error handling in ParquetSink
-            "Unable to send array to writer!",
+            "Failed to allocate additional",
+            "for ParquetSink(ArrowColumnWriter)",
         ])
         .with_memory_limit(200_000)
         .run()

From 13ddbaf2f7220c26f443d097697d1380e63f6206 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 12 Jul 2024 10:53:58 -0400
Subject: [PATCH 04/18] Minor: Add note about SQLancer fuzz testing to docs
 (#11430)

* Minor: Add note about SQLancer fuzz testing to docs

* prettier
---
 docs/source/contributor-guide/testing.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/docs/source/contributor-guide/testing.md b/docs/source/contributor-guide/testing.md
index 018cc6233c46..0f4461ab2c2c 100644
--- a/docs/source/contributor-guide/testing.md
+++ b/docs/source/contributor-guide/testing.md
@@ -39,7 +39,7 @@ DataFusion's SQL implementation is tested using [sqllogictest](https://github.co
 
 Like similar systems such as [DuckDB](https://duckdb.org/dev/testing), DataFusion has chosen to trade off a slightly higher barrier to contribution for longer term maintainability.
 
-### Rust Integration Tests
+## Rust Integration Tests
 
 There are several tests of the public interface of the DataFusion library in the [tests](https://github.com/apache/datafusion/tree/main/datafusion/core/tests) directory.
 
@@ -49,6 +49,18 @@ You can run these tests individually using `cargo` as normal command such as
 cargo test -p datafusion --test parquet_exec
 ```
 
+## SQL "Fuzz" testing
+
+DataFusion uses [SQLancer] for "fuzz" testing: it generates random SQL
+queries and executes them against DataFusion to find bugs.
+
+The code is in the [datafusion-sqllancer] repository, and we welcome further
+contributions. Kudos to [@2010YOUY01] for the initial implementation.
+ +[sqlancer]: https://github.com/sqlancer/sqlancer +[datafusion-sqllancer]: https://github.com/datafusion-contrib/datafusion-sqllancer +[@2010youy01]: https://github.com/2010YOUY01 + ## Documentation Examples We use Rust [doctest] to verify examples from the documentation are correct and From c769a70dc1c746460b4c1369d4e42c4a78da9571 Mon Sep 17 00:00:00 2001 From: tmi Date: Fri, 12 Jul 2024 17:52:24 +0200 Subject: [PATCH 05/18] Trivial: use arrow csv writer's timestamp_tz_format (#11407) --- datafusion/common/src/file_options/csv_writer.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs index 5792cfdba9e0..ae069079a68f 100644 --- a/datafusion/common/src/file_options/csv_writer.rs +++ b/datafusion/common/src/file_options/csv_writer.rs @@ -63,6 +63,9 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions { if let Some(v) = &value.timestamp_format { builder = builder.with_timestamp_format(v.into()) } + if let Some(v) = &value.timestamp_tz_format { + builder = builder.with_timestamp_tz_format(v.into()) + } if let Some(v) = &value.time_format { builder = builder.with_time_format(v.into()) } From a2a6458e420209c7125b08966c5726b5fd104195 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Jul 2024 11:53:03 -0400 Subject: [PATCH 06/18] Minor: improve documentation for sql unparsing (#11395) --- datafusion/sql/src/lib.rs | 6 ++- datafusion/sql/src/unparser/expr.rs | 29 +++++++++---- datafusion/sql/src/unparser/mod.rs | 64 +++++++++++++++++++++++++++-- datafusion/sql/src/unparser/plan.rs | 24 ++++++++--- 4 files changed, 105 insertions(+), 18 deletions(-) diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs index eb5fec7a3c8b..f53cab5df848 100644 --- a/datafusion/sql/src/lib.rs +++ b/datafusion/sql/src/lib.rs @@ -17,7 +17,7 @@ // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] -//! This module provides: +//! This crate provides: //! //! 1. A SQL parser, [`DFParser`], that translates SQL query text into //! an abstract syntax tree (AST), [`Statement`]. @@ -25,10 +25,14 @@ //! 2. A SQL query planner [`SqlToRel`] that creates [`LogicalPlan`]s //! from [`Statement`]s. //! +//! 3. A SQL [`unparser`] that converts [`Expr`]s and [`LogicalPlan`]s +//! into SQL query text. +//! //! [`DFParser`]: parser::DFParser //! [`Statement`]: parser::Statement //! [`SqlToRel`]: planner::SqlToRel //! [`LogicalPlan`]: datafusion_expr::logical_plan::LogicalPlan +//! [`Expr`]: datafusion_expr::expr::Expr mod cte; mod expr; diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index e0d05c400cb0..eb149c819c8b 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -72,21 +72,34 @@ impl Display for Unparsed { } } -/// Convert a DataFusion [`Expr`] to `sqlparser::ast::Expr` +/// Convert a DataFusion [`Expr`] to [`ast::Expr`] /// -/// This function is the opposite of `SqlToRel::sql_to_expr` and can -/// be used to, among other things, convert [`Expr`]s to strings. -/// Throws an error if [`Expr`] can not be represented by an `sqlparser::ast::Expr` +/// This function is the opposite of [`SqlToRel::sql_to_expr`] and can be used +/// to, among other things, convert [`Expr`]s to SQL strings. Such strings could +/// be used to pass filters or other expressions to another SQL engine. 
+///
+/// # Errors
+///
+/// Throws an error if [`Expr`] cannot be represented by an [`ast::Expr`]
+///
+/// # See Also
+///
+/// * [`Unparser`] for more control over the conversion to SQL
+/// * [`plan_to_sql`] for converting a [`LogicalPlan`] to SQL
 ///
 /// # Example
 /// ```
 /// use datafusion_expr::{col, lit};
 /// use datafusion_sql::unparser::expr_to_sql;
-/// let expr = col("a").gt(lit(4));
-/// let sql = expr_to_sql(&expr).unwrap();
-///
-/// assert_eq!(format!("{}", sql), "(a > 4)")
+/// let expr = col("a").gt(lit(4)); // form an expression `a > 4`
+/// let sql = expr_to_sql(&expr).unwrap(); // convert to ast::Expr
+/// // use the Display impl to convert to SQL text
+/// assert_eq!(sql.to_string(), "(a > 4)")
 /// ```
+///
+/// [`SqlToRel::sql_to_expr`]: crate::planner::SqlToRel::sql_to_expr
+/// [`plan_to_sql`]: crate::unparser::plan_to_sql
+/// [`LogicalPlan`]: datafusion_expr::logical_plan::LogicalPlan
 pub fn expr_to_sql(expr: &Expr) -> Result<ast::Expr> {
     let unparser = Unparser::default();
     unparser.expr_to_sql(expr)
diff --git a/datafusion/sql/src/unparser/mod.rs b/datafusion/sql/src/unparser/mod.rs
index e5ffbc8a212a..83ae64ba238b 100644
--- a/datafusion/sql/src/unparser/mod.rs
+++ b/datafusion/sql/src/unparser/mod.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! [`Unparser`] for converting `Expr` to SQL text
+
 mod ast;
 mod expr;
 mod plan;
@@ -27,6 +29,29 @@ pub use plan::plan_to_sql;
 use self::dialect::{DefaultDialect, Dialect};
 pub mod dialect;
 
+/// Convert a DataFusion [`Expr`] to [`sqlparser::ast::Expr`]
+///
+/// See [`expr_to_sql`] for background. `Unparser` allows greater control of
+/// the conversion, but with a more complicated API.
+///
+/// To get more human-readable output, see [`Self::with_pretty`]
+///
+/// # Example
+/// ```
+/// use datafusion_expr::{col, lit};
+/// use datafusion_sql::unparser::Unparser;
+/// let expr = col("a").gt(lit(4)); // form an expression `a > 4`
+/// let unparser = Unparser::default();
+/// let sql = unparser.expr_to_sql(&expr).unwrap(); // convert to AST
+/// // use the Display impl to convert to SQL text
+/// assert_eq!(sql.to_string(), "(a > 4)");
+/// // now convert to pretty sql
+/// let unparser = unparser.with_pretty(true);
+/// let sql = unparser.expr_to_sql(&expr).unwrap();
+/// assert_eq!(sql.to_string(), "a > 4"); // note lack of parentheses
+/// ```
+///
+/// [`Expr`]: datafusion_expr::Expr
 pub struct Unparser<'a> {
     dialect: &'a dyn Dialect,
     pretty: bool,
@@ -40,9 +65,42 @@ impl<'a> Unparser<'a> {
         }
     }
 
-    /// Allow unparser to remove parenthesis according to the precedence rules of DataFusion.
-    /// This might make it invalid SQL for other SQL query engines with different precedence
-    /// rules, even if its valid for DataFusion.
+    /// Create pretty SQL output, better suited for human consumption
+    ///
+    /// See example on the struct level documentation
+    ///
+    /// # Pretty Output
+    ///
+    /// By default, `Unparser` generates SQL text that will parse back to the
+    /// same parsed [`Expr`], which is useful for creating machine readable
+    /// expressions to send to other systems. However, the resulting expressions are
+    /// not always nice to read for humans.
+    ///
+    /// For example
+    ///
+    /// ```sql
+    /// ((a + 4) > 5)
+    /// ```
+    ///
+    /// This method removes parentheses according to the precedence rules of
+    /// DataFusion. 
If the output is reparsed, the resulting [`Expr`] produces
+    /// the same value as the original in DataFusion, but with a potentially
+    /// different order of operations.
+    ///
+    /// Note that this setting may create invalid SQL for other SQL query
+    /// engines with different precedence rules.
+    ///
+    /// # Example
+    /// ```
+    /// use datafusion_expr::{col, lit};
+    /// use datafusion_sql::unparser::Unparser;
+    /// let expr = col("a").gt(lit(4)).and(col("b").lt(lit(5))); // form an expression `a > 4 AND b < 5`
+    /// let unparser = Unparser::default().with_pretty(true);
+    /// let sql = unparser.expr_to_sql(&expr).unwrap();
+    /// assert_eq!(sql.to_string(), "a > 4 AND b < 5"); // note lack of parentheses
+    /// ```
+    ///
+    /// [`Expr`]: datafusion_expr::Expr
     pub fn with_pretty(mut self, pretty: bool) -> Self {
         self.pretty = pretty;
         self
diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs
index 15137403c582..41a8c968841b 100644
--- a/datafusion/sql/src/unparser/plan.rs
+++ b/datafusion/sql/src/unparser/plan.rs
@@ -33,10 +33,18 @@ use super::{
     Unparser,
 };
 
-/// Convert a DataFusion [`LogicalPlan`] to `sqlparser::ast::Statement`
+/// Convert a DataFusion [`LogicalPlan`] to [`ast::Statement`]
 ///
-/// This function is the opposite of `SqlToRel::sql_statement_to_plan` and can
-/// be used to, among other things, convert `LogicalPlan`s to strings.
+/// This function is the opposite of [`SqlToRel::sql_statement_to_plan`] and can
+/// be used to, among other things, convert `LogicalPlan`s to SQL strings.
+///
+/// # Errors
+///
+/// This function returns an error if the plan cannot be converted to SQL.
+///
+/// # See Also
+///
+/// * [`expr_to_sql`] for converting [`Expr`], a single expression, to SQL
 ///
 /// # Example
 /// ```
@@ -47,16 +55,20 @@ use super::{
 ///     Field::new("id", DataType::Utf8, false),
 ///     Field::new("value", DataType::Utf8, false),
 /// ]);
+/// // Scan 'table' and select columns 'id' and 'value'
 /// let plan = table_scan(Some("table"), &schema, None)
 ///     .unwrap()
 ///     .project(vec![col("id"), col("value")])
 ///     .unwrap()
 ///     .build()
 ///     .unwrap();
-/// let sql = plan_to_sql(&plan).unwrap();
-///
-/// assert_eq!(format!("{}", sql), "SELECT \"table\".id, \"table\".\"value\" FROM \"table\"")
+/// let sql = plan_to_sql(&plan).unwrap(); // convert to AST
+/// // use the Display impl to convert to SQL text
+/// assert_eq!(sql.to_string(), "SELECT \"table\".id, \"table\".\"value\" FROM \"table\"")
 /// ```
+///
+/// [`SqlToRel::sql_statement_to_plan`]: crate::planner::SqlToRel::sql_statement_to_plan
+/// [`expr_to_sql`]: crate::unparser::expr_to_sql
 pub fn plan_to_sql(plan: &LogicalPlan) -> Result<ast::Statement> {
     let unparser = Unparser::default();
     unparser.plan_to_sql(plan)

From dc21a6c25893e7906da588debf18a8e5918b3b32 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 12 Jul 2024 11:53:44 -0400
Subject: [PATCH 07/18] Minor: Consolidate specification doc sections (#11427)

---
 docs/source/contributor-guide/index.md        | 16 ----------------
 .../contributor-guide/specification/index.rst | 10 ++++++++++
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md
index 891277f64757..ad49b614c334 100644
--- a/docs/source/contributor-guide/index.md
+++ b/docs/source/contributor-guide/index.md
@@ -134,19 +134,3 @@ The good thing about open code and open development is that any issues in one ch
 
 Pull requests will be marked with a `stale` label after 60 days of inactivity and then closed 7 days 
after that. Commenting on the PR will remove the `stale` label.
-
-## Specifications
-
-We formalize some DataFusion semantics and behaviors through specification
-documents. These specifications are useful to be used as references to help
-resolve ambiguities during development or code reviews.
-
-You are also welcome to propose changes to existing specifications or create
-new specifications as you see fit.
-
-Here is the list current active specifications:
-
-- [Output field name semantic](https://datafusion.apache.org/contributor-guide/specification/output-field-name-semantic.html)
-- [Invariants](https://datafusion.apache.org/contributor-guide/specification/invariants.html)
-
-All specifications are stored in the `docs/source/specification` folder.
diff --git a/docs/source/contributor-guide/specification/index.rst b/docs/source/contributor-guide/specification/index.rst
index bcd5a895c4d2..a34f0b19e4de 100644
--- a/docs/source/contributor-guide/specification/index.rst
+++ b/docs/source/contributor-guide/specification/index.rst
@@ -18,6 +18,16 @@
 Specifications
 ==============
 
+We formalize some DataFusion semantics and behaviors through specification
+documents. These specifications are useful as references to help
+resolve ambiguities during development or code reviews.
+
+You are also welcome to propose changes to existing specifications or create
+new specifications as you see fit. All specifications are stored in the
+`docs/source/specification` folder. Here is the list of currently active
+specifications:
+
+
 .. toctree::
    :maxdepth: 1
 

From b075ac471e6d27dfe40b6586a72070a9ec4751a9 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 12 Jul 2024 15:27:16 -0400
Subject: [PATCH 08/18] Minor: consolidate doc roadmap pages (#11426)

---
 .../contributor-guide/quarterly_roadmap.md    | 96 -------------------
 docs/source/contributor-guide/roadmap.md      | 81 ++++++++++++++++
 docs/source/index.rst                         |  1 -
 3 files changed, 81 insertions(+), 97 deletions(-)
 delete mode 100644 docs/source/contributor-guide/quarterly_roadmap.md

diff --git a/docs/source/contributor-guide/quarterly_roadmap.md b/docs/source/contributor-guide/quarterly_roadmap.md
deleted file mode 100644
index ee82617225aa..000000000000
--- a/docs/source/contributor-guide/quarterly_roadmap.md
+++ /dev/null
@@ -1,96 +0,0 @@
-
-
-# Quarterly Roadmap
-
-A quarterly roadmap will be published to give the DataFusion community visibility into the priorities of the projects contributors. This roadmap is not binding. 
-
-## 2023 Q4
-
-- Improve data output (`COPY`, `INSERT` and DataFrame) output capability [#6569](https://github.com/apache/datafusion/issues/6569)
-- Implementation of `ARRAY` types and related functions [#6980](https://github.com/apache/datafusion/issues/6980)
-- Write an industrial paper about DataFusion for SIGMOD [#6782](https://github.com/apache/datafusion/issues/6782)
-
-## 2022 Q2
-
-### DataFusion Core
-
-- IO Improvements
-  - Reading, registering, and writing more file formats from both DataFrame API and SQL
-  - Additional options for IO including partitioning and metadata support
-- Work Scheduling
-  - Improve predictability, observability and performance of IO and CPU-bound work
-  - Develop a more explicit story for managing parallelism during plan execution
-- Memory Management
-  - Add more operators for memory limited execution
-- Performance
-  - Incorporate row-format into operators such as aggregate
-  - Add row-format benchmarks
-  - Explore JIT-compiling complex expressions
-  - Explore LLVM for JIT, with inline Rust functions as the primary goal
-  - Improve performance of Sort and Merge using Row Format / JIT expressions
-- Documentation
-  - General improvements to DataFusion website
-  - Publish design documents
-- Streaming
-  - Create `StreamProvider` trait
-
-### Ballista
-
-- Make production ready
-  - Shuffle file cleanup
-  - Fill functional gaps between DataFusion and Ballista
-  - Improve task scheduling and data exchange efficiency
-  - Better error handling
-    - Task failure
-    - Executor lost
-    - Schedule restart
-  - Improve monitoring and logging
-  - Auto scaling support
-- Support for multi-scheduler deployments. Initially for resiliency and fault tolerance but ultimately to support sharding for scalability and more efficient caching.
-- Executor deployment grouping based on resource allocation
-
-### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib))
-
-#### [DataFusion-Python](https://github.com/datafusion-contrib/datafusion-python)
-
-- Add missing functionality to DataFrame and SessionContext
-- Improve documentation
-
-#### [DataFusion-S3](https://github.com/datafusion-contrib/datafusion-objectstore-s3)
-
-- Create Python bindings to use with datafusion-python
-
-#### [DataFusion-Tui](https://github.com/datafusion-contrib/datafusion-tui)
-
-- Create multiple SQL editors
-- Expose more Context and query metadata
-- Support new data sources
-  - BigTable, HDFS, HTTP APIs
-
-#### [DataFusion-BigTable](https://github.com/datafusion-contrib/datafusion-bigtable)
-
-- Python binding to use with datafusion-python
-- Timestamp range predicate pushdown
-- Multi-threaded partition aware execution
-- Production ready Rust SDK
-
-#### [DataFusion-Streams](https://github.com/datafusion-contrib/datafusion-streams)
-
-- Create experimental implementation of `StreamProvider` trait
diff --git a/docs/source/contributor-guide/roadmap.md b/docs/source/contributor-guide/roadmap.md
index a6d78d9311aa..3d9c1ee371fe 100644
--- a/docs/source/contributor-guide/roadmap.md
+++ b/docs/source/contributor-guide/roadmap.md
@@ -43,3 +43,84 @@ start a conversation using a github issue or the
 make review efficient and avoid surprises.
 
 [The current list of `EPIC`s can be found here](https://github.com/apache/datafusion/issues?q=is%3Aissue+is%3Aopen+epic).
+
+# Quarterly Roadmap
+
+A quarterly roadmap will be published to give the DataFusion community
+visibility into the priorities of the project's contributors. 
This roadmap is not +binding and we would welcome any/all contributions to help keep this list up to +date. + +## 2023 Q4 + +- Improve data output (`COPY`, `INSERT` and DataFrame) output capability [#6569](https://github.com/apache/datafusion/issues/6569) +- Implementation of `ARRAY` types and related functions [#6980](https://github.com/apache/datafusion/issues/6980) +- Write an industrial paper about DataFusion for SIGMOD [#6782](https://github.com/apache/datafusion/issues/6782) + +## 2022 Q2 + +### DataFusion Core + +- IO Improvements + - Reading, registering, and writing more file formats from both DataFrame API and SQL + - Additional options for IO including partitioning and metadata support +- Work Scheduling + - Improve predictability, observability and performance of IO and CPU-bound work + - Develop a more explicit story for managing parallelism during plan execution +- Memory Management + - Add more operators for memory limited execution +- Performance + - Incorporate row-format into operators such as aggregate + - Add row-format benchmarks + - Explore JIT-compiling complex expressions + - Explore LLVM for JIT, with inline Rust functions as the primary goal + - Improve performance of Sort and Merge using Row Format / JIT expressions +- Documentation + - General improvements to DataFusion website + - Publish design documents +- Streaming + - Create `StreamProvider` trait + +### Ballista + +- Make production ready + - Shuffle file cleanup + - Fill functional gaps between DataFusion and Ballista + - Improve task scheduling and data exchange efficiency + - Better error handling + - Task failure + - Executor lost + - Schedule restart + - Improve monitoring and logging + - Auto scaling support +- Support for multi-scheduler deployments. Initially for resiliency and fault tolerance but ultimately to support sharding for scalability and more efficient caching. 
+- Executor deployment grouping based on resource allocation
+
+### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib))
+
+### [DataFusion-Python](https://github.com/datafusion-contrib/datafusion-python)
+
+- Add missing functionality to DataFrame and SessionContext
+- Improve documentation
+
+### [DataFusion-S3](https://github.com/datafusion-contrib/datafusion-objectstore-s3)
+
+- Create Python bindings to use with datafusion-python
+
+### [DataFusion-Tui](https://github.com/datafusion-contrib/datafusion-tui)
+
+- Create multiple SQL editors
+- Expose more Context and query metadata
+- Support new data sources
+  - BigTable, HDFS, HTTP APIs
+
+### [DataFusion-BigTable](https://github.com/datafusion-contrib/datafusion-bigtable)
+
+- Python binding to use with datafusion-python
+- Timestamp range predicate pushdown
+- Multi-threaded partition aware execution
+- Production ready Rust SDK
+
+### [DataFusion-Streams](https://github.com/datafusion-contrib/datafusion-streams)
+
+- Create experimental implementation of `StreamProvider` trait
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8fbff208f561..ca6905c434f3 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -121,7 +121,6 @@ To get started, see
    contributor-guide/testing
    contributor-guide/howtos
    contributor-guide/roadmap
-   contributor-guide/quarterly_roadmap
    contributor-guide/governance
    contributor-guide/inviting
    contributor-guide/specification/index

From d5367f3ff5ed506e824a04c68120194deb68a908 Mon Sep 17 00:00:00 2001
From: Georgi Krastev
Date: Fri, 12 Jul 2024 22:34:35 +0300
Subject: [PATCH 09/18] Avoid calling shutdown after failed write of AsyncWrite
 (#249) (#250) (#11415)

in `serialize_rb_stream_to_object_store`
---
 .../file_format/write/orchestration.rs        | 27 +++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/datafusion/core/src/datasource/file_format/write/orchestration.rs b/datafusion/core/src/datasource/file_format/write/orchestration.rs
index a62b5715aeb3..8bd0dae9f5a4 100644
--- a/datafusion/core/src/datasource/file_format/write/orchestration.rs
+++ b/datafusion/core/src/datasource/file_format/write/orchestration.rs
@@ -42,15 +42,20 @@ use tokio::task::JoinSet;
 type WriterType = Box<dyn AsyncWrite + Send + Unpin>;
 type SerializerType = Arc<dyn BatchSerializer>;
 
-/// Serializes a single data stream in parallel and writes to an ObjectStore
-/// concurrently. Data order is preserved. In the event of an error,
-/// the ObjectStore writer is returned to the caller in addition to an error,
-/// so that the caller may handle aborting failed writes.
+/// Serializes a single data stream in parallel and writes to an ObjectStore concurrently.
+/// Data order is preserved.
+///
+/// In the event of a non-IO error which does not involve the ObjectStore writer,
+/// the writer is returned to the caller in addition to the error,
+/// so that failed writes may be aborted.
+///
+/// In the event of an IO error involving the ObjectStore writer,
+/// the writer is dropped to avoid calling further methods on it which might panic. 
 pub(crate) async fn serialize_rb_stream_to_object_store(
     mut data_rx: Receiver<RecordBatch>,
     serializer: Arc<dyn BatchSerializer>,
     mut writer: WriterType,
-) -> std::result::Result<(WriterType, u64), (WriterType, DataFusionError)> {
+) -> std::result::Result<(WriterType, u64), (Option<WriterType>, DataFusionError)> {
     let (tx, mut rx) = mpsc::channel::<SpawnedTask<Result<(usize, Bytes), DataFusionError>>>(100);
 
     let serialize_task = SpawnedTask::spawn(async move {
@@ -82,7 +87,7 @@ pub(crate) async fn serialize_rb_stream_to_object_store(
                     Ok(_) => (),
                     Err(e) => {
                         return Err((
-                            writer,
+                            None,
                             DataFusionError::Execution(format!(
                                 "Error writing to object store: {e}"
                             )),
@@ -93,12 +98,12 @@
                 }
                 Ok(Err(e)) => {
                     // Return the writer along with the error
-                    return Err((writer, e));
+                    return Err((Some(writer), e));
                 }
                 Err(e) => {
                     // Handle task panic or cancellation
                     return Err((
-                        writer,
+                        Some(writer),
                         DataFusionError::Execution(format!(
                             "Serialization task panicked or was cancelled: {e}"
                         )),
@@ -109,10 +114,10 @@
 
     match serialize_task.join().await {
         Ok(Ok(_)) => (),
-        Ok(Err(e)) => return Err((writer, e)),
+        Ok(Err(e)) => return Err((Some(writer), e)),
         Err(_) => {
             return Err((
-                writer,
+                Some(writer),
                 internal_datafusion_err!("Unknown error writing to object store"),
             ))
         }
@@ -153,7 +158,7 @@ pub(crate) async fn stateless_serialize_and_write_files(
                 row_count += cnt;
             }
             Err((writer, e)) => {
-                finished_writers.push(writer);
+                finished_writers.extend(writer);
                 any_errors = true;
                 triggering_error = Some(e);
             }

From 02335ebe2dd36081e22ed2d8ab46287c6d950a5c Mon Sep 17 00:00:00 2001
From: kamille
Date: Sat, 13 Jul 2024 03:50:22 +0800
Subject: [PATCH 10/18] Short term way to make `AggregateStatistics` still work
 when min/max is converted to udaf (#11261)

* impl the short term solution.

* add todos.
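For readers following along: the fix relies on plain `std::any::Any` downcasting, since a `dyn AggregateExpr` may now be either a built-in expression (`Min`/`Max`) or an `AggregateFunctionExpr` wrapping a UDAF, so both concrete types have to be probed. Below is a minimal, self-contained sketch of that pattern (the `Min` and `UdafExpr` types are hypothetical stand-ins for DataFusion's own types, for illustration only):

```rust
use std::any::Any;

// Hypothetical stand-ins for DataFusion's built-in `Min` expression and
// its UDAF wrapper `AggregateFunctionExpr` (illustration only).
trait AggregateExpr: Any {
    // Upcast to `&dyn Any` so callers can probe the concrete type.
    fn as_any(&self) -> &dyn Any;
}

struct Min; // built-in MIN aggregate

struct UdafExpr {
    name: String, // e.g. "min" when MIN is implemented as a UDAF
}

impl AggregateExpr for Min {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

impl AggregateExpr for UdafExpr {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

// Mirrors the shape of `is_min` in the diff below: accept both the
// built-in expression and the UDAF form.
fn is_min(agg_expr: &dyn AggregateExpr) -> bool {
    if agg_expr.as_any().is::<Min>() {
        return true;
    }
    if let Some(udaf) = agg_expr.as_any().downcast_ref::<UdafExpr>() {
        if udaf.name == "min" {
            return true;
        }
    }
    false
}

fn main() {
    assert!(is_min(&Min));
    assert!(is_min(&UdafExpr { name: "min".into() }));
    assert!(!is_min(&UdafExpr { name: "count".into() }));
}
```

As the TODOs in the diff note, this is a stopgap; the check should eventually move into `AggregateUDFImpl` itself (https://github.com/apache/datafusion/issues/11153).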
---
 .../aggregate_statistics.rs                   | 136 +++++++++++-------
 1 file changed, 85 insertions(+), 51 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs
index 7e9aec9e5e4c..66067d8cb5c4 100644
--- a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs
+++ b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs
@@ -140,31 +140,29 @@ fn take_optimizable_column_and_table_count(
     stats: &Statistics,
 ) -> Option<(ScalarValue, String)> {
     let col_stats = &stats.column_statistics;
-    if let Some(agg_expr) = agg_expr.as_any().downcast_ref::<AggregateFunctionExpr>() {
-        if agg_expr.fun().name() == "count" && !agg_expr.is_distinct() {
-            if let Precision::Exact(num_rows) = stats.num_rows {
-                let exprs = agg_expr.expressions();
-                if exprs.len() == 1 {
-                    // TODO optimize with exprs other than Column
-                    if let Some(col_expr) =
-                        exprs[0].as_any().downcast_ref::<expressions::Column>()
-                    {
-                        let current_val = &col_stats[col_expr.index()].null_count;
-                        if let &Precision::Exact(val) = current_val {
-                            return Some((
-                                ScalarValue::Int64(Some((num_rows - val) as i64)),
-                                agg_expr.name().to_string(),
-                            ));
-                        }
-                    } else if let Some(lit_expr) =
-                        exprs[0].as_any().downcast_ref::<expressions::Literal>()
-                    {
-                        if lit_expr.value() == &COUNT_STAR_EXPANSION {
-                            return Some((
-                                ScalarValue::Int64(Some(num_rows as i64)),
-                                agg_expr.name().to_string(),
-                            ));
-                        }
+    if is_non_distinct_count(agg_expr) {
+        if let Precision::Exact(num_rows) = stats.num_rows {
+            let exprs = agg_expr.expressions();
+            if exprs.len() == 1 {
+                // TODO optimize with exprs other than Column
+                if let Some(col_expr) =
+                    exprs[0].as_any().downcast_ref::<expressions::Column>()
+                {
+                    let current_val = &col_stats[col_expr.index()].null_count;
+                    if let &Precision::Exact(val) = current_val {
+                        return Some((
+                            ScalarValue::Int64(Some((num_rows - val) as i64)),
+                            agg_expr.name().to_string(),
+                        ));
+                    }
+                } else if let Some(lit_expr) =
+                    exprs[0].as_any().downcast_ref::<expressions::Literal>()
+                {
+                    if lit_expr.value() == &COUNT_STAR_EXPANSION {
+                        return Some((
+                            ScalarValue::Int64(Some(num_rows as i64)),
+                            agg_expr.name().to_string(),
+                        ));
                     }
                 }
             }
@@ -182,26 +180,22 @@ fn take_optimizable_min(
     match *num_rows {
         0 => {
             // MIN/MAX with 0 rows is always null
-            if let Some(casted_expr) =
-                agg_expr.as_any().downcast_ref::<expressions::Min>()
-            {
+            if is_min(agg_expr) {
                 if let Ok(min_data_type) =
-                    ScalarValue::try_from(casted_expr.field().unwrap().data_type())
+                    ScalarValue::try_from(agg_expr.field().unwrap().data_type())
                 {
-                    return Some((min_data_type, casted_expr.name().to_string()));
+                    return Some((min_data_type, agg_expr.name().to_string()));
                 }
             }
         }
         value if value > 0 => {
             let col_stats = &stats.column_statistics;
-            if let Some(casted_expr) =
-                agg_expr.as_any().downcast_ref::<expressions::Min>()
-            {
-                if casted_expr.expressions().len() == 1 {
+            if is_min(agg_expr) {
+                let exprs = agg_expr.expressions();
+                if exprs.len() == 1 {
                     // TODO optimize with exprs other than Column
-                    if let Some(col_expr) = casted_expr.expressions()[0]
-                        .as_any()
-                        .downcast_ref::<expressions::Column>()
+                    if let Some(col_expr) =
+                        exprs[0].as_any().downcast_ref::<expressions::Column>()
                     {
                         if let Precision::Exact(val) =
                             &col_stats[col_expr.index()].min_value
                         {
                             if !val.is_null() {
                                 return Some((
                                     val.clone(),
-                                    casted_expr.name().to_string(),
+                                    agg_expr.name().to_string(),
                                 ));
                             }
                         }
@@ -232,26 +226,22 @@ fn take_optimizable_max(
     match *num_rows {
         0 => {
             // MIN/MAX with 0 rows is always null
-            if let Some(casted_expr) =
-                agg_expr.as_any().downcast_ref::<expressions::Max>()
-            {
+            if is_max(agg_expr) {
                 if let Ok(max_data_type) =
-                    
ScalarValue::try_from(casted_expr.field().unwrap().data_type())
+                    ScalarValue::try_from(agg_expr.field().unwrap().data_type())
                 {
-                    return Some((max_data_type, casted_expr.name().to_string()));
+                    return Some((max_data_type, agg_expr.name().to_string()));
                 }
             }
         }
         value if value > 0 => {
             let col_stats = &stats.column_statistics;
-            if let Some(casted_expr) =
-                agg_expr.as_any().downcast_ref::<expressions::Max>()
-            {
-                if casted_expr.expressions().len() == 1 {
+            if is_max(agg_expr) {
+                let exprs = agg_expr.expressions();
+                if exprs.len() == 1 {
                     // TODO optimize with exprs other than Column
-                    if let Some(col_expr) = casted_expr.expressions()[0]
-                        .as_any()
-                        .downcast_ref::<expressions::Column>()
+                    if let Some(col_expr) =
+                        exprs[0].as_any().downcast_ref::<expressions::Column>()
                     {
                         if let Precision::Exact(val) =
                             &col_stats[col_expr.index()].max_value
                         {
                             if !val.is_null() {
                                 return Some((
                                     val.clone(),
-                                    casted_expr.name().to_string(),
+                                    agg_expr.name().to_string(),
                                 ));
                             }
                         }
@@ -273,6 +263,50 @@ fn take_optimizable_max(
     None
 }
 
+// TODO: Move this check into AggregateUDFImpl
+// https://github.com/apache/datafusion/issues/11153
+fn is_non_distinct_count(agg_expr: &dyn AggregateExpr) -> bool {
+    if let Some(agg_expr) = agg_expr.as_any().downcast_ref::<AggregateFunctionExpr>() {
+        if agg_expr.fun().name() == "count" && !agg_expr.is_distinct() {
+            return true;
+        }
+    }
+
+    false
+}
+
+// TODO: Move this check into AggregateUDFImpl
+// https://github.com/apache/datafusion/issues/11153
+fn is_min(agg_expr: &dyn AggregateExpr) -> bool {
+    if agg_expr.as_any().is::<expressions::Min>() {
+        return true;
+    }
+
+    if let Some(agg_expr) = agg_expr.as_any().downcast_ref::<AggregateFunctionExpr>() {
+        if agg_expr.fun().name() == "min" {
+            return true;
+        }
+    }
+
+    false
+}
+
+// TODO: Move this check into AggregateUDFImpl
+// https://github.com/apache/datafusion/issues/11153
+fn is_max(agg_expr: &dyn AggregateExpr) -> bool {
+    if agg_expr.as_any().is::<expressions::Max>() {
+        return true;
+    }
+
+    if let Some(agg_expr) = agg_expr.as_any().downcast_ref::<AggregateFunctionExpr>() {
+        if agg_expr.fun().name() == "max" {
+            return true;
+        }
+    }
+
+    false
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
     use super::*;

From bd25e26747a271752b7f46aa0970022525eff05b Mon Sep 17 00:00:00 2001
From: Lordworms <48054792+Lordworms@users.noreply.github.com>
Date: Fri, 12 Jul 2024 12:51:01 -0700
Subject: [PATCH 11/18] Implement TPCH substrait integration test, support
 tpch_13, tpch_14, tpch_16 (#11405)

optimize code
---
 .../tests/cases/consumer_integration.rs       |   86 +-
 .../tpch_substrait_plans/query_13.json        |  624 +++++++++
 .../tpch_substrait_plans/query_14.json        |  924 +++++++++++++
 .../tpch_substrait_plans/query_16.json        | 1175 +++++++++++++++++
 4 files changed, 2808 insertions(+), 1 deletion(-)
 create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_13.json
 create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_14.json
 create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_16.json

diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs
index 10c1319b903b..c8130220ef4a 100644
--- a/datafusion/substrait/tests/cases/consumer_integration.rs
+++ b/datafusion/substrait/tests/cases/consumer_integration.rs
@@ -40,7 +40,6 @@ mod tests {
         }
         Ok(ctx)
     }
-
     #[tokio::test]
     async fn tpch_test_1() -> Result<()> {
         let ctx = create_context(vec![(
@@ -314,4 +313,89 @@ mod tests {
         \n    TableScan: FILENAME_PLACEHOLDER_2 projection=[n_nationkey, n_name, n_regionkey, n_comment]");
         Ok(())
     }
+
+    // missing query 12
+    #[tokio::test]
+    async fn 
tpch_test_13() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/customer.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/orders.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_13.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: count(FILENAME_PLACEHOLDER_1.o_orderkey) AS C_COUNT, count(Int64(1)) AS CUSTDIST\ + \n Sort: count(Int64(1)) DESC NULLS FIRST, count(FILENAME_PLACEHOLDER_1.o_orderkey) DESC NULLS FIRST\ + \n Projection: count(FILENAME_PLACEHOLDER_1.o_orderkey), count(Int64(1))\ + \n Aggregate: groupBy=[[count(FILENAME_PLACEHOLDER_1.o_orderkey)]], aggr=[[count(Int64(1))]]\ + \n Projection: count(FILENAME_PLACEHOLDER_1.o_orderkey)\ + \n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_0.c_custkey]], aggr=[[count(FILENAME_PLACEHOLDER_1.o_orderkey)]]\ + \n Projection: FILENAME_PLACEHOLDER_0.c_custkey, FILENAME_PLACEHOLDER_1.o_orderkey\ + \n Left Join: FILENAME_PLACEHOLDER_0.c_custkey = FILENAME_PLACEHOLDER_1.o_custkey Filter: NOT FILENAME_PLACEHOLDER_1.o_comment LIKE CAST(Utf8(\"%special%requests%\") AS Utf8)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment]"); + Ok(()) + } + + #[tokio::test] + async fn tpch_test_14() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/lineitem.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/part.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_14.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: Decimal128(Some(10000),5,2) * sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END) / sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount) AS PROMO_REVENUE\ + \n Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END), sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount)]]\ + \n Projection: CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount) ELSE Decimal128(Some(0),19,0) END, FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount)\ + \n Filter: FILENAME_PLACEHOLDER_0.l_partkey = FILENAME_PLACEHOLDER_1.p_partkey AND FILENAME_PLACEHOLDER_0.l_shipdate >= Date32(\"1995-09-01\") AND FILENAME_PLACEHOLDER_0.l_shipdate < CAST(Utf8(\"1995-10-01\") AS Date32)\ + \n Inner Join: 
Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]"); + Ok(()) + } + // query 15 is missing + #[tokio::test] + async fn tpch_test_16() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/partsupp.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/part.csv"), + ("FILENAME_PLACEHOLDER_2", "tests/testdata/tpch/supplier.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_16.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: FILENAME_PLACEHOLDER_1.p_brand AS P_BRAND, FILENAME_PLACEHOLDER_1.p_type AS P_TYPE, FILENAME_PLACEHOLDER_1.p_size AS P_SIZE, count(DISTINCT FILENAME_PLACEHOLDER_0.ps_suppkey) AS SUPPLIER_CNT\ + \n Sort: count(DISTINCT FILENAME_PLACEHOLDER_0.ps_suppkey) DESC NULLS FIRST, FILENAME_PLACEHOLDER_1.p_brand ASC NULLS LAST, FILENAME_PLACEHOLDER_1.p_type ASC NULLS LAST, FILENAME_PLACEHOLDER_1.p_size ASC NULLS LAST\ + \n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_1.p_brand, FILENAME_PLACEHOLDER_1.p_type, FILENAME_PLACEHOLDER_1.p_size]], aggr=[[count(DISTINCT FILENAME_PLACEHOLDER_0.ps_suppkey)]]\ + \n Projection: FILENAME_PLACEHOLDER_1.p_brand, FILENAME_PLACEHOLDER_1.p_type, FILENAME_PLACEHOLDER_1.p_size, FILENAME_PLACEHOLDER_0.ps_suppkey\ + \n Filter: FILENAME_PLACEHOLDER_1.p_partkey = FILENAME_PLACEHOLDER_0.ps_partkey AND FILENAME_PLACEHOLDER_1.p_brand != CAST(Utf8(\"Brand#45\") AS Utf8) AND NOT FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"MEDIUM POLISHED%\") AS Utf8) AND (FILENAME_PLACEHOLDER_1.p_size = Int32(49) OR FILENAME_PLACEHOLDER_1.p_size = Int32(14) OR FILENAME_PLACEHOLDER_1.p_size = Int32(23) OR FILENAME_PLACEHOLDER_1.p_size = Int32(45) OR FILENAME_PLACEHOLDER_1.p_size = Int32(19) OR FILENAME_PLACEHOLDER_1.p_size = Int32(3) OR FILENAME_PLACEHOLDER_1.p_size = Int32(36) OR FILENAME_PLACEHOLDER_1.p_size = Int32(9)) AND NOT CAST(FILENAME_PLACEHOLDER_0.ps_suppkey IN () AS Boolean)\ + \n Subquery:\ + \n Projection: FILENAME_PLACEHOLDER_2.s_suppkey\ + \n Filter: FILENAME_PLACEHOLDER_2.s_comment LIKE CAST(Utf8(\"%Customer%Complaints%\") AS Utf8)\ + \n TableScan: FILENAME_PLACEHOLDER_2 projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment]\ + \n Inner Join: Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost, ps_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]"); + Ok(()) + } } diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_13.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_13.json new file mode 100644 index 000000000000..c88e61e78304 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_13.json @@ -0,0 +1,624 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 4, + "uri": "/functions_aggregate_generic.yaml" + }, + { + 
"extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_string.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 2, + "name": "not:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "like:vchar_vchar" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 4, + "name": "count:opt_any" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 5, + "name": "count:opt" + } + } + ], + "relations": [ + { + "root": { + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 2, + 3 + ] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 2 + ] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 17, + 18 + ] + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "C_CUSTKEY", + "C_NAME", + "C_ADDRESS", + "C_NATIONKEY", + "C_PHONE", + "C_ACCTBAL", + "C_MKTSEGMENT", + "C_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 117, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "O_ORDERKEY", + "O_CUSTKEY", + "O_ORDERSTATUS", + "O_TOTALPRICE", + "O_ORDERDATE", + "O_ORDERPRIORITY", + "O_CLERK", + "O_SHIPPRIORITY", + "O_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": 
"NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 79, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 16 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 79, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "%special%requests%", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + ] + } + }, + "type": "JOIN_TYPE_LEFT" + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 4, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + 
"groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 5, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [] + } + } + ] + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + } + ] + } + }, + "names": [ + "C_COUNT", + "CUSTDIST" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_14.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_14.json new file mode 100644 index 000000000000..380b71df8aac --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_14.json @@ -0,0 +1,924 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 4, + "uri": "/functions_string.yaml" + }, + { + "extensionUriAnchor": 5, + "uri": "/functions_arithmetic_decimal.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_datetime.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 2, + "name": "gte:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "lt:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 4, + "name": "like:vchar_vchar" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 5, + "name": "multiply:opt_decimal_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 6, + "name": "subtract:opt_decimal_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 7, + "name": "sum:opt_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 8, + "name": "divide:opt_decimal_decimal" + } + } + ], + "relations": [ + { + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 2 + ] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 25, + 26 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + 
"baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "P_PARTKEY", + "P_NAME", + "P_MFGR", + "P_BRAND", + "P_TYPE", + "P_SIZE", + "P_CONTAINER", + "P_RETAILPRICE", + "P_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 55, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 23, + "typeVariationReference": 0, + "nullability": 
"NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 16 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "date": 9374, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "1995-10-01", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "ifThen": { + "ifs": [ + { + "if": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 20 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "PROMO%", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + }, + "then": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 6, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": 
"NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + } + ] + } + } + } + ], + "else": { + "literal": { + "decimal": { + "value": "AAAAAAAAAAAAAAAAAAAAAA==", + "precision": 19, + "scale": 0 + }, + "nullable": false, + "typeVariationReference": 0 + } + } + } + }, + { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 6, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + } + ] + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [] + } + ], + "measures": [ + { + "measure": { + "functionReference": 7, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + } + ] + } + }, + { + "measure": { + "functionReference": 7, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "expressions": [ + { + "scalarFunction": { + "functionReference": 8, + "args": [], + "outputType": { + "decimal": { + "scale": 2, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "decimal": { + "scale": 2, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "literal": { + "decimal": { + "value": "ECcAAAAAAAAAAAAAAAAAAA==", + "precision": 5, + "scale": 2 + }, + "nullable": false, + 
"typeVariationReference": 0 + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "names": [ + "PROMO_REVENUE" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_16.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_16.json new file mode 100644 index 000000000000..f988aa7a76a2 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_16.json @@ -0,0 +1,1175 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 4, + "uri": "/functions_aggregate_generic.yaml" + }, + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_string.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 2, + "name": "not_equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 3, + "name": "not:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 4, + "name": "like:vchar_vchar" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 5, + "name": "or:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 6, + "name": "count:opt_any" + } + } + ], + "relations": [ + { + "root": { + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 14, + 15, + 16, + 17 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "PS_PARTKEY", + "PS_SUPPKEY", + "PS_AVAILQTY", + "PS_SUPPLYCOST", + "PS_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 199, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "P_PARTKEY", + "P_NAME", + "P_MFGR", + "P_BRAND", + "P_TYPE", + "P_SIZE", + "P_CONTAINER", + "P_RETAILPRICE", + "P_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + 
"nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 55, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 23, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "Brand#45", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "MEDIUM POLISHED%", + "nullable": false, + "typeVariationReference": 0 + } + }, + 
"failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 49, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 14, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 23, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 45, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 19, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 3, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 36, + "nullable": false, + "typeVariationReference": 0 + } + } 
+ } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 9, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "subquery": { + "inPredicate": { + "needles": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ], + "haystack": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 7 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "S_SUPPKEY", + "S_NAME", + "S_ADDRESS", + "S_NATIONKEY", + "S_PHONE", + "S_ACCTBAL", + "S_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 101, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_2", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 101, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "%Customer%Complaints%", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + } + } + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + 
"directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 6, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_DISTINCT", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + } + ] + } + }, + "names": [ + "P_BRAND", + "P_TYPE", + "P_SIZE", + "SUPPLIER_CNT" + ] + } + } + ], + "expectedTypeUrls": [] +} From 4d04a6ebbb0458495d2282df34e8b22001f3971d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Jul 2024 15:51:44 -0400 Subject: [PATCH 12/18] Minor: fix labeler rules (#11428) --- .github/workflows/dev_pr/labeler.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 34a37948785b..308abd1688a6 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -17,11 +17,11 @@ development-process: - changed-files: - - any-glob-to-any-file: ['dev/**.*', '.github/**.*', 'ci/**.*', '.asf.yaml'] + - any-glob-to-any-file: ['dev/**/*', '.github/**/*', 'ci/**/*', '.asf.yaml'] documentation: - changed-files: - - any-glob-to-any-file: ['docs/**.*', 'README.md', './**/README.md', 'DEVELOPERS.md', 'datafusion/docs/**.*'] + - any-glob-to-any-file: ['docs/**/*', 'README.md', './**/README.md', 'DEVELOPERS.md', 'datafusion/docs/**/*'] sql: - changed-files: From 8f8df07c80aa66bb94d57c9619be93f9c3be92a9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Jul 2024 23:14:17 -0400 Subject: [PATCH 13/18] Minor: change internal error to not supported error for nested field access (#11446) --- datafusion/sql/src/expr/identifier.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 
d297b2e4df5b..39736b1fbba5 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -18,8 +18,8 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use arrow_schema::Field; use datafusion_common::{ - internal_err, plan_datafusion_err, Column, DFSchema, DataFusionError, Result, - ScalarValue, TableReference, + internal_err, not_impl_err, plan_datafusion_err, Column, DFSchema, DataFusionError, + Result, ScalarValue, TableReference, }; use datafusion_expr::{expr::ScalarFunction, lit, Case, Expr}; use sqlparser::ast::{Expr as SQLExpr, Ident}; @@ -118,7 +118,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Though ideally once that support is in place, this code should work with it // TODO: remove when can support multiple nested identifiers if ids.len() > 5 { - return internal_err!("Unsupported compound identifier: {ids:?}"); + return not_impl_err!("Compound identifier: {ids:?}"); } let search_result = search_dfschema(&ids, schema); @@ -127,7 +127,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Some((field, qualifier, nested_names)) if !nested_names.is_empty() => { // TODO: remove when can support multiple nested identifiers if nested_names.len() > 1 { - return internal_err!( + return not_impl_err!( "Nested identifiers not yet supported for column {}", Column::from((qualifier, field)).quoted_flat_name() ); @@ -154,7 +154,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // return default where use all identifiers to not have a nested field // this len check is because at 5 identifiers will have to have a nested field if ids.len() == 5 { - internal_err!("Unsupported compound identifier: {ids:?}") + not_impl_err!("compound identifier: {ids:?}") } else { // check the outer_query_schema and try to find a match if let Some(outer) = planner_context.outer_query_schema() { @@ -165,7 +165,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { if !nested_names.is_empty() => { // TODO: remove when can support nested identifiers for OuterReferenceColumn - internal_err!( + not_impl_err!( "Nested identifiers are not yet supported for OuterReferenceColumn {}", Column::from((qualifier, field)).quoted_flat_name() ) From 9e4a4a1599b9def33f27a6f82dd32045038de296 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 13 Jul 2024 05:33:51 -0400 Subject: [PATCH 14/18] Minor: change Datafusion --> DataFusion in docs (#11439) * Minor: change Datafusion --> DataFusion in docs * update expected --- datafusion-examples/README.md | 4 ++-- datafusion-examples/examples/expr_api.rs | 2 +- datafusion/common/src/config.rs | 2 +- datafusion/core/src/dataframe/mod.rs | 2 +- datafusion/expr/src/signature.rs | 2 +- datafusion/optimizer/src/unwrap_cast_in_comparison.rs | 2 +- datafusion/physical-expr/src/intervals/cp_solver.rs | 2 +- datafusion/physical-plan/src/aggregates/mod.rs | 2 +- datafusion/sql/src/parser.rs | 2 +- datafusion/sqllogictest/README.md | 2 +- datafusion/sqllogictest/test_files/information_schema.slt | 2 +- datafusion/sqllogictest/test_files/window.slt | 6 +++--- docs/source/contributor-guide/inviting.md | 2 +- docs/source/user-guide/configs.md | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 2696f74775cf..da01f60b527d 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -71,8 +71,8 @@ cargo run --example dataframe - [`parquet_index.rs`](examples/parquet_index.rs): Create an secondary index over several parquet files and use it to 
speed up queries - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files - [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract statistics by visiting an ExecutionPlan after execution -- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into Datafusion `Expr`. -- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from Datafusion `Expr` and `LogicalPlan` +- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into DataFusion `Expr`. +- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from DataFusion `Expr` and `LogicalPlan` - [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3 - [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 43729a913e5d..a5cf7011f811 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -83,7 +83,7 @@ async fn main() -> Result<()> { Ok(()) } -/// Datafusion's `expr_fn` API makes it easy to create [`Expr`]s for the +/// DataFusion's `expr_fn` API makes it easy to create [`Expr`]s for the /// full range of expression types such as aggregates and window functions. fn expr_fn_demo() -> Result<()> { // Let's say you want to call the "first_value" aggregate function diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 1d2a9589adfc..880f0119ce0d 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -309,7 +309,7 @@ config_namespace! { /// Currently experimental pub split_file_groups_by_statistics: bool, default = false - /// Should Datafusion keep the columns used for partition_by in the output RecordBatches + /// Should DataFusion keep the columns used for partition_by in the output RecordBatches pub keep_partition_by_columns: bool, default = false } } diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index d0f2852a6e53..05a08a637893 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1472,7 +1472,7 @@ impl DataFrame { /// /// The method supports case sensitive rename with wrapping column name into one of following symbols ( " or ' or ` ) /// - /// Alternatively setting Datafusion param `datafusion.sql_parser.enable_ident_normalization` to `false` will enable + /// Alternatively setting DataFusion param `datafusion.sql_parser.enable_ident_normalization` to `false` will enable /// case sensitive rename without need to wrap column name into special symbols /// /// # Example diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index 33f643eb2dc2..fba793dd229d 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -93,7 +93,7 @@ pub enum TypeSignature { Variadic(Vec), /// The acceptable signature and coercions rules to coerce arguments to this /// signature are special for this function. If this signature is specified, - /// Datafusion will call [`ScalarUDFImpl::coerce_types`] to prepare argument types. + /// DataFusion will call [`ScalarUDFImpl::coerce_types`] to prepare argument types. 
/// /// [`ScalarUDFImpl::coerce_types`]: crate::udf::ScalarUDFImpl::coerce_types UserDefined, diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 344708252559..9941da9dd65e 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -1080,7 +1080,7 @@ mod tests { ), }; - // Datafusion ignores timezones for comparisons of ScalarValue + // DataFusion ignores timezones for comparisons of ScalarValue // so double check it here assert_eq!(lit_tz_none, lit_tz_utc); diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index fc4950ae4e7c..f05ac3624b8e 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -176,7 +176,7 @@ impl ExprIntervalGraphNode { &self.interval } - /// This function creates a DAEG node from Datafusion's [`ExprTreeNode`] + /// This function creates a DAEG node from DataFusion's [`ExprTreeNode`] /// object. Literals are created with definite, singleton intervals while /// any other expression starts with an indefinite interval ([-∞, ∞]). pub fn make_node(node: &ExprTreeNode, schema: &Schema) -> Result { diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 8caf10acf09b..8bf808af3b5b 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -324,7 +324,7 @@ impl AggregateExec { /// Create a new hash aggregate execution plan with the given schema. /// This constructor isn't part of the public API, it is used internally - /// by Datafusion to enforce schema consistency during when re-creating + /// by DataFusion to enforce schema consistency during when re-creating /// `AggregateExec`s inside optimization rules. Schema field names of an /// `AggregateExec` depends on the names of aggregate expressions. Since /// a rule may re-write aggregate expressions (e.g. reverse them) during diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 5da7f7176509..8147092c34ab 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -253,7 +253,7 @@ fn ensure_not_set(field: &Option, name: &str) -> Result<(), ParserError> { Ok(()) } -/// Datafusion SQL Parser based on [`sqlparser`] +/// DataFusion SQL Parser based on [`sqlparser`] /// /// Parses DataFusion's SQL dialect, often delegating to [`sqlparser`]'s [`Parser`]. /// diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md index 930df4796776..c7f04c0d762c 100644 --- a/datafusion/sqllogictest/README.md +++ b/datafusion/sqllogictest/README.md @@ -225,7 +225,7 @@ query ``` -- `test_name`: Uniquely identify the test name (Datafusion only) +- `test_name`: Uniquely identify the test name (DataFusion only) - `type_string`: A short string that specifies the number of result columns and the expected datatype of each result column. There is one character in the for each result column. 
The characters codes are: - 'B' - **B**oolean, diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index acd465a0c021..95bea1223a9c 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -257,7 +257,7 @@ datafusion.execution.batch_size 8192 Default batch size while creating new batch datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting datafusion.execution.collect_statistics false Should DataFusion collect statistics after listing files datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs -datafusion.execution.keep_partition_by_columns false Should Datafusion keep the columns used for partition_by in the output RecordBatches +datafusion.execution.keep_partition_by_columns false Should DataFusion keep the columns used for partition_by in the output RecordBatches datafusion.execution.listing_table_ignore_subdirectory true Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). datafusion.execution.max_buffered_batches_per_output_file 2 This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption datafusion.execution.meta_fetch_concurrency 32 Number of files to read in parallel when inferring schema and statistics diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 7f2e766aab91..a865a7ccbd8f 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -2236,7 +2236,7 @@ SELECT SUM(c12) OVER(ORDER BY c1, c2 GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING) 7.728066219895 NULL # test_c9_rn_ordering_alias -# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function. +# These tests check whether DataFusion is aware of the ordering generated by the ROW_NUMBER() window function. # Physical plan shouldn't have a SortExec after the BoundedWindowAggExec since the table after BoundedWindowAggExec is already ordered by rn1 ASC and c9 DESC. query TT EXPLAIN SELECT c9, rn1 FROM (SELECT c9, @@ -2275,7 +2275,7 @@ SELECT c9, rn1 FROM (SELECT c9, 145294611 5 # test_c9_rn_ordering_alias_opposite_direction -# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function. +# These tests check whether DataFusion is aware of the ordering generated by the ROW_NUMBER() window function. # Physical plan shouldn't have a SortExec after the BoundedWindowAggExec since the table after BoundedWindowAggExec is already ordered by rn1 ASC and c9 DESC. query TT EXPLAIN SELECT c9, rn1 FROM (SELECT c9, @@ -2314,7 +2314,7 @@ SELECT c9, rn1 FROM (SELECT c9, 4076864659 5 # test_c9_rn_ordering_alias_opposite_direction2 -# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function. 
+# These tests check whether DataFusion is aware of the ordering generated by the ROW_NUMBER() window function. # Physical plan _should_ have a SortExec after BoundedWindowAggExec since the table after BoundedWindowAggExec is ordered by rn1 ASC and c9 DESC, which is conflicting with the requirement rn1 DESC. query TT EXPLAIN SELECT c9, rn1 FROM (SELECT c9, diff --git a/docs/source/contributor-guide/inviting.md b/docs/source/contributor-guide/inviting.md index 967f417e6e9a..4066dd9699ee 100644 --- a/docs/source/contributor-guide/inviting.md +++ b/docs/source/contributor-guide/inviting.md @@ -59,7 +59,7 @@ the person. Here is an example: To: private@datafusion.apache.org Subject: [DISCUSS] $PERSONS_NAME for Committer -$PERSONS_NAME has been an active contributor to the Datafusion community for the +$PERSONS_NAME has been an active contributor to the DataFusion community for the last 6 months[1][2], helping others, answering questions, and improving the project's code. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 579088f991ef..5130b0a56d0e 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -86,7 +86,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). | | datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | | datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | -| datafusion.execution.keep_partition_by_columns | false | Should Datafusion keep the columns used for partition_by in the output RecordBatches | +| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches | | datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. 
| | datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | | datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | From 08fa444aaa8513a60ede5c57d92f29e6156b91a8 Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Sat, 13 Jul 2024 17:34:45 +0800 Subject: [PATCH 15/18] fix: make sure JOIN ON expression is boolean type (#11423) * fix: make sure JOIN ON expression is boolean type * Applied to DataFrame * Update datafusion/optimizer/src/analyzer/type_coercion.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- datafusion/core/src/dataframe/mod.rs | 31 +++++++++++++++++-- .../optimizer/src/analyzer/type_coercion.rs | 17 +++++++++- datafusion/sqllogictest/test_files/join.slt | 12 ++++++- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 05a08a637893..c55b7c752765 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -896,9 +896,8 @@ impl DataFrame { join_type: JoinType, on_exprs: impl IntoIterator, ) -> Result { - let expr = on_exprs.into_iter().reduce(Expr::and); let plan = LogicalPlanBuilder::from(self.plan) - .join_on(right.plan, join_type, expr)? + .join_on(right.plan, join_type, on_exprs)? .build()?; Ok(DataFrame { session_state: self.session_state, @@ -1694,7 +1693,7 @@ mod tests { use crate::test_util::{register_aggregate_csv, test_table, test_table_with_name}; use arrow::array::{self, Int32Array}; - use datafusion_common::{Constraint, Constraints}; + use datafusion_common::{Constraint, Constraints, ScalarValue}; use datafusion_common_runtime::SpawnedTask; use datafusion_expr::{ array_agg, cast, create_udf, expr, lit, BuiltInWindowFunction, @@ -2555,6 +2554,32 @@ mod tests { Ok(()) } + #[tokio::test] + async fn join_on_filter_datatype() -> Result<()> { + let left = test_table_with_name("a").await?.select_columns(&["c1"])?; + let right = test_table_with_name("b").await?.select_columns(&["c1"])?; + + // JOIN ON untyped NULL + let join = left.clone().join_on( + right.clone(), + JoinType::Inner, + Some(Expr::Literal(ScalarValue::Null)), + )?; + let expected_plan = "CrossJoin:\ + \n TableScan: a projection=[c1], full_filters=[Boolean(NULL)]\ + \n TableScan: b projection=[c1]"; + assert_eq!(expected_plan, format!("{:?}", join.into_optimized_plan()?)); + + // JOIN ON expression must be boolean type + let join = left.join_on(right, JoinType::Inner, Some(lit("TRUE")))?; + let expected = join.into_optimized_plan().unwrap_err(); + assert_eq!( + expected.strip_backtrace(), + "type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8" + ); + Ok(()) + } + #[tokio::test] async fn join_ambiguous_filter() -> Result<()> { let left = test_table_with_name("a") diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 3cab474df84e..80a8c864e431 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -127,7 +127,7 @@ impl<'a> TypeCoercionRewriter<'a> { Self { schema } } - /// Coerce join equality expressions + /// Coerce join equality expressions and join filter /// /// Joins must be treated specially as their equality expressions are 
stored /// as a parallel list of left and right expressions, rather than a single @@ -151,9 +151,24 @@ impl<'a> TypeCoercionRewriter<'a> { }) .collect::>>()?; + // Join filter must be boolean + join.filter = join + .filter + .map(|expr| self.coerce_join_filter(expr)) + .transpose()?; + Ok(LogicalPlan::Join(join)) } + fn coerce_join_filter(&self, expr: Expr) -> Result { + let expr_type = expr.get_type(self.schema)?; + match expr_type { + DataType::Boolean => Ok(expr), + DataType::Null => expr.cast_to(&DataType::Boolean, self.schema), + other => plan_err!("Join condition must be boolean type, but got {other:?}"), + } + } + fn coerce_binary_op( &self, left: Expr, diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index 12cb8b3985c7..efebba1779cf 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -988,7 +988,6 @@ statement ok DROP TABLE department -# Test issue: https://github.com/apache/datafusion/issues/11269 statement ok CREATE TABLE t1 (v0 BIGINT) AS VALUES (-503661263); @@ -998,11 +997,22 @@ CREATE TABLE t2 (v0 DOUBLE) AS VALUES (-1.663563947387); statement ok CREATE TABLE t3 (v0 DOUBLE) AS VALUES (0.05112015193508901); +# Test issue: https://github.com/apache/datafusion/issues/11269 query RR SELECT t3.v0, t2.v0 FROM t1,t2,t3 WHERE t3.v0 >= t1.v0; ---- 0.051120151935 -1.663563947387 +# Test issue: https://github.com/apache/datafusion/issues/11414 +query IRR +SELECT * FROM t1 INNER JOIN t2 ON NULL RIGHT JOIN t3 ON TRUE; +---- +NULL NULL 0.051120151935 + +# ON expression must be boolean type +query error DataFusion error: type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8 +SELECT * FROM t1 INNER JOIN t2 ON 'TRUE' + statement ok DROP TABLE t1; From f5d88d1790eea85910ae5590a353ae17318f8401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=9E=97=E4=BC=9F?= Date: Sun, 14 Jul 2024 05:44:32 +0800 Subject: [PATCH 16/18] Support serialization/deserialization for custom physical exprs in proto (#11387) * Add PhysicalExtensionExprNode * regen proto * Add ser/de extension expr logic * Add test and fix clippy lint --- datafusion/proto/proto/datafusion.proto | 7 + datafusion/proto/src/generated/pbjson.rs | 124 +++++++++++++++ datafusion/proto/src/generated/prost.rs | 12 +- .../proto/src/physical_plan/from_proto.rs | 8 + datafusion/proto/src/physical_plan/mod.rs | 16 ++ .../proto/src/physical_plan/to_proto.rs | 19 ++- .../tests/cases/roundtrip_physical_plan.rs | 147 +++++++++++++++++- 7 files changed, 330 insertions(+), 3 deletions(-) diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 345765b08be3..9ef884531e32 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -836,6 +836,8 @@ message PhysicalExprNode { // was PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17; PhysicalLikeExprNode like_expr = 18; + + PhysicalExtensionExprNode extension = 19; } } @@ -942,6 +944,11 @@ message PhysicalNegativeNode { PhysicalExprNode expr = 1; } +message PhysicalExtensionExprNode { + bytes expr = 1; + repeated PhysicalExprNode inputs = 2; +} + message FilterExecNode { PhysicalPlanNode input = 1; PhysicalExprNode expr = 2; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 905f0d984955..fa989480fad9 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ 
-13543,6 +13543,9 @@ impl serde::Serialize for PhysicalExprNode { physical_expr_node::ExprType::LikeExpr(v) => { struct_ser.serialize_field("likeExpr", v)?; } + physical_expr_node::ExprType::Extension(v) => { + struct_ser.serialize_field("extension", v)?; + } } } struct_ser.end() @@ -13582,6 +13585,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { "scalarUdf", "like_expr", "likeExpr", + "extension", ]; #[allow(clippy::enum_variant_names)] @@ -13602,6 +13606,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { WindowExpr, ScalarUdf, LikeExpr, + Extension, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -13639,6 +13644,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { "windowExpr" | "window_expr" => Ok(GeneratedField::WindowExpr), "scalarUdf" | "scalar_udf" => Ok(GeneratedField::ScalarUdf), "likeExpr" | "like_expr" => Ok(GeneratedField::LikeExpr), + "extension" => Ok(GeneratedField::Extension), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -13771,6 +13777,13 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { return Err(serde::de::Error::duplicate_field("likeExpr")); } expr_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_expr_node::ExprType::LikeExpr) +; + } + GeneratedField::Extension => { + if expr_type__.is_some() { + return Err(serde::de::Error::duplicate_field("extension")); + } + expr_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_expr_node::ExprType::Extension) ; } } @@ -13783,6 +13796,117 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { deserializer.deserialize_struct("datafusion.PhysicalExprNode", FIELDS, GeneratedVisitor) } } +impl serde::Serialize for PhysicalExtensionExprNode { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.expr.is_empty() { + len += 1; + } + if !self.inputs.is_empty() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.PhysicalExtensionExprNode", len)?; + if !self.expr.is_empty() { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("expr", pbjson::private::base64::encode(&self.expr).as_str())?; + } + if !self.inputs.is_empty() { + struct_ser.serialize_field("inputs", &self.inputs)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for PhysicalExtensionExprNode { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "expr", + "inputs", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Expr, + Inputs, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "expr" => Ok(GeneratedField::Expr), + "inputs" => Ok(GeneratedField::Inputs), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct 
GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = PhysicalExtensionExprNode; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.PhysicalExtensionExprNode") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut expr__ = None; + let mut inputs__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Expr => { + if expr__.is_some() { + return Err(serde::de::Error::duplicate_field("expr")); + } + expr__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } + GeneratedField::Inputs => { + if inputs__.is_some() { + return Err(serde::de::Error::duplicate_field("inputs")); + } + inputs__ = Some(map_.next_value()?); + } + } + } + Ok(PhysicalExtensionExprNode { + expr: expr__.unwrap_or_default(), + inputs: inputs__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion.PhysicalExtensionExprNode", FIELDS, GeneratedVisitor) + } +} impl serde::Serialize for PhysicalExtensionNode { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index b16d26ee6e1e..8407e545fe65 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -1218,7 +1218,7 @@ pub struct PhysicalExtensionNode { pub struct PhysicalExprNode { #[prost( oneof = "physical_expr_node::ExprType", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19" )] pub expr_type: ::core::option::Option, } @@ -1266,6 +1266,8 @@ pub mod physical_expr_node { ScalarUdf(super::PhysicalScalarUdfNode), #[prost(message, tag = "18")] LikeExpr(::prost::alloc::boxed::Box), + #[prost(message, tag = "19")] + Extension(super::PhysicalExtensionExprNode), } } #[allow(clippy::derive_partial_eq_without_eq)] @@ -1456,6 +1458,14 @@ pub struct PhysicalNegativeNode { } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] +pub struct PhysicalExtensionExprNode { + #[prost(bytes = "vec", tag = "1")] + pub expr: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "2")] + pub inputs: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] pub struct FilterExecNode { #[prost(message, optional, boxed, tag = "1")] pub input: ::core::option::Option<::prost::alloc::boxed::Box>, diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index e94bb3b8efcb..52fbd5cbdcf6 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -394,6 +394,14 @@ pub fn parse_physical_expr( codec, )?, )), + ExprType::Extension(extension) => { + let inputs: Vec> = extension + .inputs + .iter() + .map(|e| parse_physical_expr(e, registry, input_schema, codec)) + .collect::>()?; + (codec.try_decode_expr(extension.expr.as_slice(), &inputs)?) 
+        }
     };

     Ok(pexpr)
diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs
index 56e702704798..e5429945e97e 100644
--- a/datafusion/proto/src/physical_plan/mod.rs
+++ b/datafusion/proto/src/physical_plan/mod.rs
@@ -2018,6 +2018,22 @@ pub trait PhysicalExtensionCodec: Debug + Send + Sync {
     fn try_encode_udf(&self, _node: &ScalarUDF, _buf: &mut Vec<u8>) -> Result<()> {
         Ok(())
     }
+
+    fn try_decode_expr(
+        &self,
+        _buf: &[u8],
+        _inputs: &[Arc<dyn PhysicalExpr>],
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        not_impl_err!("PhysicalExtensionCodec is not provided")
+    }
+
+    fn try_encode_expr(
+        &self,
+        _node: Arc<dyn PhysicalExpr>,
+        _buf: &mut Vec<u8>,
+    ) -> Result<()> {
+        not_impl_err!("PhysicalExtensionCodec is not provided")
+    }
 }

 #[derive(Debug)]
diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs
index 5e982ad2afde..9c95acc1dcf4 100644
--- a/datafusion/proto/src/physical_plan/to_proto.rs
+++ b/datafusion/proto/src/physical_plan/to_proto.rs
@@ -495,7 +495,24 @@ pub fn serialize_physical_expr(
             ))),
         })
     } else {
-        internal_err!("physical_plan::to_proto() unsupported expression {value:?}")
+        let mut buf: Vec<u8> = vec![];
+        match codec.try_encode_expr(Arc::clone(&value), &mut buf) {
+            Ok(_) => {
+                let inputs: Vec<protobuf::PhysicalExprNode> = value
+                    .children()
+                    .into_iter()
+                    .map(|e| serialize_physical_expr(Arc::clone(e), codec))
+                    .collect::<Result<_>>()?;
+                Ok(protobuf::PhysicalExprNode {
+                    expr_type: Some(protobuf::physical_expr_node::ExprType::Extension(
+                        protobuf::PhysicalExtensionExprNode { expr: buf, inputs },
+                    )),
+                })
+            }
+            Err(e) => internal_err!(
+                "Unsupported physical expr and extension codec failed with [{e}]. Expr: {value:?}"
+            ),
+        }
     }
 }
diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
index d8d85ace1a29..2fcc65008fd8 100644
--- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
@@ -15,7 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
+use arrow::array::RecordBatch;
 use std::any::Any;
+use std::fmt::Display;
+use std::hash::Hasher;
 use std::ops::Deref;
 use std::sync::Arc;
 use std::vec;
@@ -38,6 +41,7 @@ use datafusion::datasource::physical_plan::{
 };
 use datafusion::execution::FunctionRegistry;
 use datafusion::logical_expr::{create_udf, JoinType, Operator, Volatility};
+use datafusion::physical_expr::aggregate::utils::down_cast_any_ref;
 use datafusion::physical_expr::expressions::Max;
 use datafusion::physical_expr::window::SlidingAggregateWindowExpr;
 use datafusion::physical_expr::{PhysicalSortRequirement, ScalarFunctionExpr};
@@ -75,7 +79,7 @@ use datafusion_common::file_options::csv_writer::CsvWriterOptions;
 use datafusion_common::file_options::json_writer::JsonWriterOptions;
 use datafusion_common::parsers::CompressionTypeVariant;
 use datafusion_common::stats::Precision;
-use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result};
+use datafusion_common::{internal_err, not_impl_err, plan_err, DataFusionError, Result};
 use datafusion_expr::{
     Accumulator, AccumulatorFactoryFunction, AggregateUDF, ColumnarValue, ScalarUDF,
     ScalarUDFImpl, Signature, SimpleAggregateUDF, WindowFrame, WindowFrameBound,
@@ -658,6 +662,147 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> {
     roundtrip_test(ParquetExec::builder(scan_config).build_arc())
 }

+#[test]
+fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
+    let scan_config = FileScanConfig {
+        object_store_url: ObjectStoreUrl::local_filesystem(),
+        file_schema: Arc::new(Schema::new(vec![Field::new(
+            "col",
+            DataType::Utf8,
+            false,
+        )])),
+        file_groups: vec![vec![PartitionedFile::new(
+            "/path/to/file.parquet".to_string(),
+            1024,
+        )]],
+        statistics: Statistics {
+            num_rows: Precision::Inexact(100),
+            total_byte_size: Precision::Inexact(1024),
+            column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
+                Field::new("col", DataType::Utf8, false),
+            ]))),
+        },
+        projection: None,
+        limit: None,
+        table_partition_cols: vec![],
+        output_ordering: vec![],
+    };
+
+    #[derive(Debug, Hash, Clone)]
+    struct CustomPredicateExpr {
+        inner: Arc<dyn PhysicalExpr>,
+    }
+    impl Display for CustomPredicateExpr {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "CustomPredicateExpr")
+        }
+    }
+    impl PartialEq<dyn Any> for CustomPredicateExpr {
+        fn eq(&self, other: &dyn Any) -> bool {
+            down_cast_any_ref(other)
+                .downcast_ref::<Self>()
+                .map(|x| self.inner.eq(&x.inner))
+                .unwrap_or(false)
+        }
+    }
+    impl PhysicalExpr for CustomPredicateExpr {
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+
+        fn data_type(&self, _input_schema: &Schema) -> Result<DataType> {
+            unreachable!()
+        }
+
+        fn nullable(&self, _input_schema: &Schema) -> Result<bool> {
+            unreachable!()
+        }
+
+        fn evaluate(&self, _batch: &RecordBatch) -> Result<ColumnarValue> {
+            unreachable!()
+        }
+
+        fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+            vec![&self.inner]
+        }
+
+        fn with_new_children(
+            self: Arc<Self>,
+            _children: Vec<Arc<dyn PhysicalExpr>>,
+        ) -> Result<Arc<dyn PhysicalExpr>> {
+            todo!()
+        }
+
+        fn dyn_hash(&self, _state: &mut dyn Hasher) {
+            unreachable!()
+        }
+    }
+
+    #[derive(Debug)]
+    struct CustomPhysicalExtensionCodec;
+    impl PhysicalExtensionCodec for CustomPhysicalExtensionCodec {
+        fn try_decode(
+            &self,
+            _buf: &[u8],
+            _inputs: &[Arc<dyn ExecutionPlan>],
+            _registry: &dyn FunctionRegistry,
+        ) -> Result<Arc<dyn ExecutionPlan>> {
+            unreachable!()
+        }
+
+        fn try_encode(
+            &self,
+            _node: Arc<dyn ExecutionPlan>,
+            _buf: &mut Vec<u8>,
+        ) -> Result<()> {
+            unreachable!()
+        }
+
+        fn try_decode_expr(
+            &self,
+            buf: &[u8],
+            inputs: &[Arc<dyn PhysicalExpr>],
+        ) -> Result<Arc<dyn PhysicalExpr>> {
+            if buf == "CustomPredicateExpr".as_bytes() {
+                Ok(Arc::new(CustomPredicateExpr {
+                    inner: inputs[0].clone(),
+                }))
+            } else {
+                internal_err!("Not supported")
+            }
+        }
+
+        fn try_encode_expr(
+            &self,
+            node: Arc<dyn PhysicalExpr>,
+            buf: &mut Vec<u8>,
+        ) -> Result<()> {
+            if node
+                .as_ref()
+                .as_any()
+                .downcast_ref::<CustomPredicateExpr>()
+                .is_some()
+            {
+                buf.extend_from_slice("CustomPredicateExpr".as_bytes());
+                Ok(())
+            } else {
+                internal_err!("Not supported")
+            }
+        }
+    }
+
+    let custom_predicate_expr = Arc::new(CustomPredicateExpr {
+        inner: Arc::new(Column::new("col", 1)),
+    });
+    let exec_plan = ParquetExec::builder(scan_config)
+        .with_predicate(custom_predicate_expr)
+        .build_arc();
+
+    let ctx = SessionContext::new();
+    roundtrip_test_and_return(exec_plan, &ctx, &CustomPhysicalExtensionCodec {})?;
+    Ok(())
+}
+
 #[test]
 fn roundtrip_scalar_udf() -> Result<()> {
     let field_a = Field::new("a", DataType::Int64, false);

From a43cf79bf0b133379ee6f2a236c025e59a5ef822 Mon Sep 17 00:00:00 2001
From: kf zheng <100595273+Kev1n8@users.noreply.github.com>
Date: Sun, 14 Jul 2024 05:45:03 +0800
Subject: [PATCH 17/18] remove termtree dependency (#11416)

* remove termtree dependency

* impl Display for TopKHeap, replace uses of tree_print in tests

* use to_string instead of format!
---
 datafusion/physical-plan/Cargo.toml           |  1 -
 .../physical-plan/src/aggregates/topk/heap.rs | 86 ++++++++++++-------
 2 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml
index f5f756417ebf..00fc81ebde97 100644
--- a/datafusion/physical-plan/Cargo.toml
+++ b/datafusion/physical-plan/Cargo.toml
@@ -66,7 +66,6 @@ tokio = { workspace = true }
 [dev-dependencies]
 rstest = { workspace = true }
 rstest_reuse = "0.7.0"
-termtree = "0.5.0"
 tokio = { workspace = true, features = [
     "rt-multi-thread",
     "fs",
diff --git a/datafusion/physical-plan/src/aggregates/topk/heap.rs b/datafusion/physical-plan/src/aggregates/topk/heap.rs
index 51593f5c28ce..81eadbc018b3 100644
--- a/datafusion/physical-plan/src/aggregates/topk/heap.rs
+++ b/datafusion/physical-plan/src/aggregates/topk/heap.rs
@@ -27,7 +27,7 @@ use datafusion_common::Result;
 use datafusion_physical_expr::aggregate::utils::adjust_output_array;
 use half::f16;
 use std::cmp::Ordering;
-use std::fmt::{Debug, Formatter};
+use std::fmt::{Debug, Display, Formatter};
 use std::sync::Arc;

 /// A custom version of `Ord` that only exists so we can implement it for the Values in our heap
@@ -323,29 +323,53 @@ impl<VAL: ValueType> TopKHeap<VAL> {
         }
     }

-    #[cfg(test)]
-    fn _tree_print(&self, idx: usize) -> Option<termtree::Tree<String>> {
-        let hi = self.heap.get(idx)?;
-        match hi {
-            None => None,
-            Some(hi) => {
-                let label =
-                    format!("val={:?} idx={}, bucket={}", hi.val, idx, hi.map_idx);
-                let left = self._tree_print(idx * 2 + 1);
-                let right = self._tree_print(idx * 2 + 2);
-                let children = left.into_iter().chain(right);
-                let me = termtree::Tree::new(label).with_leaves(children);
-                Some(me)
+    fn _tree_print(
+        &self,
+        idx: usize,
+        prefix: String,
+        is_tail: bool,
+        output: &mut String,
+    ) {
+        if let Some(Some(hi)) = self.heap.get(idx) {
+            let connector = if idx != 0 {
+                if is_tail {
+                    "└── "
+                } else {
+                    "├── "
+                }
+            } else {
+                ""
+            };
+            output.push_str(&format!(
+                "{}{}val={:?} idx={}, bucket={}\n",
+                prefix, connector, hi.val, idx, hi.map_idx
+            ));
+            let new_prefix = if is_tail { "" } else { "│   " };
+            let child_prefix = format!("{}{}", prefix, new_prefix);
+
+            let left_idx = idx * 2 + 1;
+            let right_idx = idx * 2 + 2;
+
+            let left_exists = left_idx < self.len;
+            let right_exists = right_idx < self.len;
+
+            if left_exists {
+                self._tree_print(left_idx, child_prefix.clone(), !right_exists, output);
+            }
+            if right_exists {
+                self._tree_print(right_idx, child_prefix, true, output);
+            }
         }
     }
 }

-    #[cfg(test)]
-    fn tree_print(&self) -> String {
-        match self._tree_print(0) {
-            None => "".to_string(),
-            Some(root) => format!("{}", root),
+impl<VAL: ValueType> Display for TopKHeap<VAL> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let mut output = String::new();
+        if self.heap.first().is_some() {
+            self._tree_print(0, String::new(), true, &mut output);
         }
+        write!(f, "{}", output)
     }
 }
@@ -361,9 +385,9 @@ impl<VAL: ValueType> HeapItem<VAL> {

 impl<VAL: ValueType> Debug for HeapItem<VAL> {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         f.write_str("bucket=")?;
-        self.map_idx.fmt(f)?;
+        Debug::fmt(&self.map_idx, f)?;
         f.write_str(" val=")?;
-        self.val.fmt(f)?;
+        Debug::fmt(&self.val, f)?;
         f.write_str("\n")?;
         Ok(())
     }
 }
@@ -462,7 +486,7 @@ mod tests {

         let mut heap = TopKHeap::new(10, false);
         heap.append_or_replace(1, 1, &mut map);
-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=1 idx=0, bucket=1
 "#;
@@ -482,7 +506,7 @@ val=1 idx=0, bucket=1
         heap.append_or_replace(2, 2, &mut map);
         assert_eq!(map, vec![(2, 0), (1, 1)]);

-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=2 idx=0, bucket=2
 └── val=1 idx=1, bucket=1
 "#;
@@ -500,7 +524,7 @@ val=2 idx=0, bucket=2
         heap.append_or_replace(1, 1, &mut map);
         heap.append_or_replace(2, 2, &mut map);
         heap.append_or_replace(3, 3, &mut map);
-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=3 idx=0, bucket=3
 ├── val=1 idx=1, bucket=1
 └── val=2 idx=2, bucket=2
 "#;

         let mut map = vec![];
         heap.append_or_replace(0, 0, &mut map);
-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=2 idx=0, bucket=2
 ├── val=1 idx=1, bucket=1
 └── val=0 idx=2, bucket=0
 "#;
@@ -531,7 +555,7 @@ val=2 idx=0, bucket=2
         heap.append_or_replace(2, 2, &mut map);
         heap.append_or_replace(3, 3, &mut map);
         heap.append_or_replace(4, 4, &mut map);
-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=4 idx=0, bucket=4
 ├── val=3 idx=1, bucket=3
 │   └── val=1 idx=3, bucket=1
 └── val=2 idx=2, bucket=2
 "#;

         let mut map = vec![];
         heap.replace_if_better(1, 0, &mut map);
-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=4 idx=0, bucket=4
 ├── val=1 idx=1, bucket=1
 │   └── val=0 idx=3, bucket=0
 └── val=2 idx=2, bucket=2
 "#;
@@ -563,7 +587,7 @@ val=4 idx=0, bucket=4
         heap.append_or_replace(1, 1, &mut map);
         heap.append_or_replace(2, 2, &mut map);

-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=2 idx=0, bucket=2
 └── val=1 idx=1, bucket=1
 "#;
@@ -584,7 +608,7 @@ val=2 idx=0, bucket=2
         heap.append_or_replace(1, 1, &mut map);
         heap.append_or_replace(2, 2, &mut map);

-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=2 idx=0, bucket=2
 └── val=1 idx=1, bucket=1
 "#;
@@ -607,7 +631,7 @@ val=2 idx=0, bucket=2
         heap.append_or_replace(1, 1, &mut map);
         heap.append_or_replace(2, 2, &mut map);

-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=2 idx=0, bucket=2
 └── val=1 idx=1, bucket=1
 "#;
@@ -616,7 +640,7 @@ val=2 idx=0, bucket=2
         let numbers = vec![(0, 1), (1, 2)];
         heap.renumber(numbers.as_slice());
-        let actual = heap.tree_print();
+        let actual = heap.to_string();
         let expected = r#"
 val=2 idx=0, bucket=1
 └── val=1 idx=1, bucket=2

From a7041feff32c2af09854c144a760d945e30fb38a Mon Sep 17 00:00:00 2001
From: Jax Liu
Date: Sun, 14 Jul 2024 05:47:47 +0800
Subject: [PATCH 18/18] Minor: Add an example for backtrace pretty print (#11450)

* add the example for printing backtrace pretty

* add empty end line

* fix prettier

* sync the usage example

* Update docs/source/user-guide/crate-configuration.md

Co-authored-by: Oleks V

---------

Co-authored-by: Oleks V
---
 docs/source/user-guide/crate-configuration.md | 44 ++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/docs/source/user-guide/crate-configuration.md b/docs/source/user-guide/crate-configuration.md
index 0587d06a3919..9d22e3403097 100644
--- a/docs/source/user-guide/crate-configuration.md
+++ b/docs/source/user-guide/crate-configuration.md
@@ -121,7 +121,7 @@ backtrace: 0: std::backtrace_rs::backtrace::libunwind::trace
 The backtraces are useful when debugging code. If there is a test in `datafusion/core/src/physical_planner.rs`

-```
+```rust
 #[tokio::test]
 async fn test_get_backtrace_for_failed_code() -> Result<()> {
     let ctx = SessionContext::new();
@@ -141,6 +141,48 @@ To obtain a backtrace:
 ```bash
 cargo build --features=backtrace
 RUST_BACKTRACE=1 cargo test --features=backtrace --package datafusion --lib -- physical_planner::tests::test_get_backtrace_for_failed_code --exact --nocapture
+
+running 1 test
+Error: Plan("Invalid function 'row_numer'.\nDid you mean 'ROW_NUMBER'?\n\nbacktrace: 0: std::backtrace_rs::backtrace::libunwind::trace\n at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/libunwind.rs:105:5\n 1: std::backtrace_rs::backtrace::trace_unsynchronized\n...
 ```

 Note: The backtrace is wrapped in system calls, so some steps at the top of the backtrace can be ignored.
+
+To show the backtrace in a pretty-printed format use `eprintln!("{e}");`.
+
+```rust
+#[tokio::test]
+async fn test_get_backtrace_for_failed_code() -> Result<()> {
+    let ctx = SessionContext::new();
+
+    let sql = "select row_numer() over (partition by a order by a) from (select 1 a);";
+
+    let _ = match ctx.sql(sql).await {
+        Ok(result) => result.show().await?,
+        Err(e) => {
+            eprintln!("{e}");
+        }
+    };
+
+    Ok(())
+}
+```
+
+Then run the test:
+
+```bash
+$ RUST_BACKTRACE=1 cargo test --features=backtrace --package datafusion --lib -- physical_planner::tests::test_get_backtrace_for_failed_code --exact --nocapture
+
+running 1 test
+Error during planning: Invalid function 'row_numer'.
+Did you mean 'ROW_NUMBER'?
+
+backtrace: 0: std::backtrace_rs::backtrace::libunwind::trace
+   at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/libunwind.rs:105:5
+  1: std::backtrace_rs::backtrace::trace_unsynchronized
+   at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
+  2: std::backtrace::Backtrace::create
+   at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/backtrace.rs:331:13
+  3: std::backtrace::Backtrace::capture
+  ...
+```
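Editor's note on the extension expression codec introduced in [PATCH 16/18] above: the `try_encode_expr`/`try_decode_expr` hooks are exercised end to end when a plan is serialized with a user-provided `PhysicalExtensionCodec`. The sketch below is a minimal illustration, not part of the patches: `roundtrip_with_codec` is a hypothetical helper name, and it assumes the `physical_plan_to_bytes_with_extension_codec` / `physical_plan_from_bytes_with_extension_codec` functions from `datafusion-proto`'s `bytes` module, which predate this change.

```rust
// Hypothetical usage sketch: round-trip a physical plan through a custom
// PhysicalExtensionCodec. Any PhysicalExpr that the built-in serializer does
// not recognize is handed to codec.try_encode_expr(...) on the way out and
// rebuilt by codec.try_decode_expr(...) on the way back in.
use std::sync::Arc;

use datafusion::execution::context::SessionContext;
use datafusion::physical_plan::ExecutionPlan;
use datafusion_common::Result;
use datafusion_proto::bytes::{
    physical_plan_from_bytes_with_extension_codec, physical_plan_to_bytes_with_extension_codec,
};
use datafusion_proto::physical_plan::PhysicalExtensionCodec;

fn roundtrip_with_codec(
    plan: Arc<dyn ExecutionPlan>,
    ctx: &SessionContext,
    codec: &dyn PhysicalExtensionCodec,
) -> Result<Arc<dyn ExecutionPlan>> {
    // Serialize, letting the codec handle custom expressions such as the
    // CustomPredicateExpr from the roundtrip test above.
    let bytes = physical_plan_to_bytes_with_extension_codec(plan, codec)?;
    // Deserialize with the same codec so extension nodes can be decoded.
    physical_plan_from_bytes_with_extension_codec(&bytes, ctx, codec)
}
```

This is roughly what the `roundtrip_test_and_return` helper in the test exercises through the protobuf types directly; the design point is that the same codec instance must be supplied on both sides, since the opaque `expr` bytes are meaningful only to the codec that wrote them.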