From fa0191772e87e04da2598aedb7fe11dd49f88f88 Mon Sep 17 00:00:00 2001
From: Xin Li <33629085+xinlifoobar@users.noreply.github.com>
Date: Tue, 9 Jul 2024 22:12:48 +0900
Subject: [PATCH 01/19] Support `NULL` literals in where clause (#11266)

* Try fix where clause incorrectly reject NULL literal

* check null in filter

---
 datafusion/expr/src/logical_plan/plan.rs    |  3 +-
 datafusion/physical-plan/src/filter.rs      | 39 +++++++++++++++++----
 datafusion/sqllogictest/test_files/misc.slt | 14 ++++++++
 3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs
index bda03fb7087a..998b5bdcb60c 100644
--- a/datafusion/expr/src/logical_plan/plan.rs
+++ b/datafusion/expr/src/logical_plan/plan.rs
@@ -2123,7 +2123,8 @@ impl Filter {
         // construction (such as with correlated subqueries) so we make a best effort here and
         // ignore errors resolving the expression against the schema.
         if let Ok(predicate_type) = predicate.get_type(input.schema()) {
-            if predicate_type != DataType::Boolean {
+            // Interpret NULL as a missing boolean value.
+            if predicate_type != DataType::Boolean && predicate_type != DataType::Null {
                 return plan_err!(
                     "Cannot create filter with non-boolean predicate '{predicate}' returning {predicate_type}"
                 );
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index 96ec6c0cf34d..84afc227578f 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -31,13 +31,13 @@ use crate::{
     metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet},
     DisplayFormatType, ExecutionPlan,
 };
-
 use arrow::compute::filter_record_batch;
 use arrow::datatypes::{DataType, SchemaRef};
 use arrow::record_batch::RecordBatch;
-use datafusion_common::cast::as_boolean_array;
+use arrow_array::{Array, BooleanArray};
+use datafusion_common::cast::{as_boolean_array, as_null_array};
 use datafusion_common::stats::Precision;
-use datafusion_common::{plan_err, DataFusionError, Result};
+use datafusion_common::{internal_err, plan_err, DataFusionError, Result};
 use datafusion_execution::TaskContext;
 use datafusion_expr::Operator;
 use datafusion_physical_expr::expressions::BinaryExpr;
@@ -84,6 +84,19 @@ impl FilterExec {
                     cache,
                 })
             }
+            DataType::Null => {
+                let default_selectivity = 0;
+                let cache =
+                    Self::compute_properties(&input, &predicate, default_selectivity)?;
+
+                Ok(Self {
+                    predicate,
+                    input: input.clone(),
+                    metrics: ExecutionPlanMetricsSet::new(),
+                    default_selectivity,
+                    cache,
+                })
+            }
             other => {
                 plan_err!("Filter predicate must return boolean values, not {other:?}")
             }
@@ -355,9 +368,23 @@ pub(crate) fn batch_filter(
         .evaluate(batch)
         .and_then(|v| v.into_array(batch.num_rows()))
         .and_then(|array| {
-            Ok(as_boolean_array(&array)?)
-                // apply filter array to record batch
-                .and_then(|filter_array| Ok(filter_record_batch(batch, filter_array)?))
+            let filter_array = match as_boolean_array(&array) {
+                Ok(boolean_array) => {
+                    Ok(boolean_array.to_owned())
+                },
+                Err(_) => {
+                    let Ok(null_array) = as_null_array(&array) else {
+                        return internal_err!("Cannot create filter_array from non-boolean predicates, unable to continue");
+                    };
+
+                    // if the predicate is null, then the result is also null
+                    Ok::<BooleanArray, DataFusionError>(BooleanArray::new_null(
+                        null_array.len(),
+                    ))
+                }
+            }?;
+
+            Ok(filter_record_batch(batch, &filter_array)?)
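+            // Editorial note (illustrative, not part of the original patch):
+            // arrow's `filter_record_batch` keeps only the rows whose mask
+            // entry is `true`, so the all-null mask built above drops every
+            // row. `WHERE NULL` therefore selects nothing, matching SQL's
+            // three-valued logic.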
         })
 }
 
diff --git a/datafusion/sqllogictest/test_files/misc.slt b/datafusion/sqllogictest/test_files/misc.slt
index 848cdc943914..66606df83480 100644
--- a/datafusion/sqllogictest/test_files/misc.slt
+++ b/datafusion/sqllogictest/test_files/misc.slt
@@ -24,3 +24,17 @@ query TT?
 select 'foo', '', NULL
 ----
 foo (empty) NULL
+
+# Where clause accepts NULL literal
+query I
+select 1 where NULL
+----
+
+query I
+select 1 where NULL and 1 = 1
+----
+
+query I
+select 1 where NULL or 1 = 1
+----
+1
\ No newline at end of file

From e65c3e919855c9977cf4d80c0630ee26b7fd03ee Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Tue, 9 Jul 2024 09:13:11 -0400
Subject: [PATCH 02/19] Minor: Add link to blog to main DataFusion website
 (#11356)

---
 docs/source/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8677b63c916a..d491df04f7fe 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -69,6 +69,7 @@ See the `developer’s guide`_ for contributing and `communication`_ for getting
 
    GitHub and Issue Tracker
    crates.io
    API Docs
+   Blog
    Code of conduct
    Download

From 3d1792fb9f73c6df0ca71e06c89a7bb6e273a740 Mon Sep 17 00:00:00 2001
From: Lordworms <48054792+Lordworms@users.noreply.github.com>
Date: Tue, 9 Jul 2024 14:05:59 -0700
Subject: [PATCH 03/19] Implement TPCH substrait integration test, support
 tpch_6, tpch_10, tpch_11 (#11349)

---
 .../tests/cases/consumer_integration.rs      |  131 ++
 .../tpch_substrait_plans/query_10.json       | 1257 +++++++++++++++++
 .../tpch_substrait_plans/query_11.json       | 1059 ++++++++++++++
 .../tpch_substrait_plans/query_6.json        |  585 ++++++++
 4 files changed, 3032 insertions(+)
 create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_10.json
 create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_11.json
 create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_6.json

diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs
index 5d565c037852..6133c239873b 100644
--- a/datafusion/substrait/tests/cases/consumer_integration.rs
+++ b/datafusion/substrait/tests/cases/consumer_integration.rs
@@ -124,6 +124,56 @@ mod tests {
         Ok(ctx)
     }
 
+    async fn create_context_tpch6() -> Result<SessionContext> {
+        let ctx = SessionContext::new();
+
+        let registrations =
+            vec![("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/lineitem.csv")];
+
+        for (table_name, file_path) in registrations {
+            register_csv(&ctx, table_name, file_path).await?;
+        }
+
+        Ok(ctx)
+    }
+    // missing context for query 7,8,9
+
+    async fn create_context_tpch10() -> Result<SessionContext> {
+        let ctx = SessionContext::new();
+
+        let registrations = vec![
+            ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/customer.csv"),
+            ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/orders.csv"),
+            ("FILENAME_PLACEHOLDER_2", "tests/testdata/tpch/lineitem.csv"),
+            ("FILENAME_PLACEHOLDER_3", "tests/testdata/tpch/nation.csv"),
+        ];
+
+        for (table_name, file_path) in registrations {
+            register_csv(&ctx, table_name, file_path).await?;
+        }
+
+        Ok(ctx)
+    }
+
+    async fn create_context_tpch11() -> Result<SessionContext> {
+        let ctx = SessionContext::new();
+
+        let registrations = vec![
+            ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/partsupp.csv"),
+            ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/supplier.csv"),
+            ("FILENAME_PLACEHOLDER_2", "tests/testdata/tpch/nation.csv"),
+            ("FILENAME_PLACEHOLDER_3", "tests/testdata/tpch/partsupp.csv"),
+            ("FILENAME_PLACEHOLDER_4", "tests/testdata/tpch/supplier.csv"),
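+            // Editorial note (illustrative, not part of the original patch):
+            // placeholders 3-5 register partsupp/supplier/nation a second
+            // time because the Q11 substrait plan scans the same tables
+            // again inside its scalar subquery.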
+ ("FILENAME_PLACEHOLDER_5", "tests/testdata/tpch/nation.csv"), + ]; + + for (table_name, file_path) in registrations { + register_csv(&ctx, table_name, file_path).await?; + } + + Ok(ctx) + } + #[tokio::test] async fn tpch_test_1() -> Result<()> { let ctx = create_context_tpch1().await?; @@ -266,4 +316,85 @@ mod tests { \n TableScan: REGION projection=[r_regionkey, r_name, r_comment]"); Ok(()) } + + #[tokio::test] + async fn tpch_test_6() -> Result<()> { + let ctx = create_context_tpch6().await?; + let path = "tests/testdata/tpch_substrait_plans/query_6.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Aggregate: groupBy=[[]], aggr=[[sum(FILENAME_PLACEHOLDER_0.l_extendedprice * FILENAME_PLACEHOLDER_0.l_discount) AS REVENUE]]\ + \n Projection: FILENAME_PLACEHOLDER_0.l_extendedprice * FILENAME_PLACEHOLDER_0.l_discount\ + \n Filter: FILENAME_PLACEHOLDER_0.l_shipdate >= CAST(Utf8(\"1994-01-01\") AS Date32) AND FILENAME_PLACEHOLDER_0.l_shipdate < CAST(Utf8(\"1995-01-01\") AS Date32) AND FILENAME_PLACEHOLDER_0.l_discount >= Decimal128(Some(5),3,2) AND FILENAME_PLACEHOLDER_0.l_discount <= Decimal128(Some(7),3,2) AND FILENAME_PLACEHOLDER_0.l_quantity < CAST(Int32(24) AS Decimal128(19, 0))\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]"); + Ok(()) + } + + // TODO: missing plan 7, 8, 9 + #[tokio::test] + async fn tpch_test_10() -> Result<()> { + let ctx = create_context_tpch10().await?; + let path = "tests/testdata/tpch_substrait_plans/query_10.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: FILENAME_PLACEHOLDER_0.c_custkey AS C_CUSTKEY, FILENAME_PLACEHOLDER_0.c_name AS C_NAME, sum(FILENAME_PLACEHOLDER_2.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_2.l_discount) AS REVENUE, FILENAME_PLACEHOLDER_0.c_acctbal AS C_ACCTBAL, FILENAME_PLACEHOLDER_3.n_name AS N_NAME, FILENAME_PLACEHOLDER_0.c_address AS C_ADDRESS, FILENAME_PLACEHOLDER_0.c_phone AS C_PHONE, FILENAME_PLACEHOLDER_0.c_comment AS C_COMMENT\ + \n Limit: skip=0, fetch=20\ + \n Sort: sum(FILENAME_PLACEHOLDER_2.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_2.l_discount) DESC NULLS FIRST\ + \n Projection: FILENAME_PLACEHOLDER_0.c_custkey, FILENAME_PLACEHOLDER_0.c_name, sum(FILENAME_PLACEHOLDER_2.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_2.l_discount), FILENAME_PLACEHOLDER_0.c_acctbal, FILENAME_PLACEHOLDER_3.n_name, FILENAME_PLACEHOLDER_0.c_address, FILENAME_PLACEHOLDER_0.c_phone, FILENAME_PLACEHOLDER_0.c_comment\n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_0.c_custkey, FILENAME_PLACEHOLDER_0.c_name, FILENAME_PLACEHOLDER_0.c_acctbal, FILENAME_PLACEHOLDER_0.c_phone, FILENAME_PLACEHOLDER_3.n_name, FILENAME_PLACEHOLDER_0.c_address, FILENAME_PLACEHOLDER_0.c_comment]], aggr=[[sum(FILENAME_PLACEHOLDER_2.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_2.l_discount)]]\ + \n Projection: FILENAME_PLACEHOLDER_0.c_custkey, FILENAME_PLACEHOLDER_0.c_name, 
FILENAME_PLACEHOLDER_0.c_acctbal, FILENAME_PLACEHOLDER_0.c_phone, FILENAME_PLACEHOLDER_3.n_name, FILENAME_PLACEHOLDER_0.c_address, FILENAME_PLACEHOLDER_0.c_comment, FILENAME_PLACEHOLDER_2.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_2.l_discount)\ + \n Filter: FILENAME_PLACEHOLDER_0.c_custkey = FILENAME_PLACEHOLDER_1.o_custkey AND FILENAME_PLACEHOLDER_2.l_orderkey = FILENAME_PLACEHOLDER_1.o_orderkey AND FILENAME_PLACEHOLDER_1.o_orderdate >= CAST(Utf8(\"1993-10-01\") AS Date32) AND FILENAME_PLACEHOLDER_1.o_orderdate < CAST(Utf8(\"1994-01-01\") AS Date32) AND FILENAME_PLACEHOLDER_2.l_returnflag = Utf8(\"R\") AND FILENAME_PLACEHOLDER_0.c_nationkey = FILENAME_PLACEHOLDER_3.n_nationkey\ + \n Inner Join: Filter: Boolean(true)\ + \n Inner Join: Filter: Boolean(true)\ + \n Inner Join: Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_2 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_3 projection=[n_nationkey, n_name, n_regionkey, n_comment]"); + Ok(()) + } + + #[tokio::test] + async fn tpch_test_11() -> Result<()> { + let ctx = create_context_tpch11().await?; + let path = "tests/testdata/tpch_substrait_plans/query_11.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: FILENAME_PLACEHOLDER_0.ps_partkey AS PS_PARTKEY, sum(FILENAME_PLACEHOLDER_0.ps_supplycost * FILENAME_PLACEHOLDER_0.ps_availqty) AS value\ + \n Sort: sum(FILENAME_PLACEHOLDER_0.ps_supplycost * FILENAME_PLACEHOLDER_0.ps_availqty) DESC NULLS FIRST\ + \n Filter: sum(FILENAME_PLACEHOLDER_0.ps_supplycost * FILENAME_PLACEHOLDER_0.ps_availqty) > ()\ + \n Subquery:\ + \n Projection: sum(FILENAME_PLACEHOLDER_3.ps_supplycost * FILENAME_PLACEHOLDER_3.ps_availqty) * Decimal128(Some(1000000),11,10)\ + \n Aggregate: groupBy=[[]], aggr=[[sum(FILENAME_PLACEHOLDER_3.ps_supplycost * FILENAME_PLACEHOLDER_3.ps_availqty)]]\ + \n Projection: FILENAME_PLACEHOLDER_3.ps_supplycost * CAST(FILENAME_PLACEHOLDER_3.ps_availqty AS Decimal128(19, 0))\ + \n Filter: FILENAME_PLACEHOLDER_3.ps_suppkey = FILENAME_PLACEHOLDER_4.s_suppkey AND FILENAME_PLACEHOLDER_4.s_nationkey = FILENAME_PLACEHOLDER_5.n_nationkey AND FILENAME_PLACEHOLDER_5.n_name = CAST(Utf8(\"JAPAN\") AS Utf8)\ + \n Inner Join: Filter: Boolean(true)\ + \n Inner Join: Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_3 projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost, ps_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_4 projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_5 projection=[n_nationkey, n_name, n_regionkey, n_comment]\ + \n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_0.ps_partkey]], aggr=[[sum(FILENAME_PLACEHOLDER_0.ps_supplycost * FILENAME_PLACEHOLDER_0.ps_availqty)]]\ + \n Projection: FILENAME_PLACEHOLDER_0.ps_partkey, 
FILENAME_PLACEHOLDER_0.ps_supplycost * CAST(FILENAME_PLACEHOLDER_0.ps_availqty AS Decimal128(19, 0))\ + \n Filter: FILENAME_PLACEHOLDER_0.ps_suppkey = FILENAME_PLACEHOLDER_1.s_suppkey AND FILENAME_PLACEHOLDER_1.s_nationkey = FILENAME_PLACEHOLDER_2.n_nationkey AND FILENAME_PLACEHOLDER_2.n_name = CAST(Utf8(\"JAPAN\") AS Utf8)\ + \n Inner Join: Filter: Boolean(true)\ + \n Inner Join: Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost, ps_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_2 projection=[n_nationkey, n_name, n_regionkey, n_comment]"); + Ok(()) + } } diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_10.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_10.json new file mode 100644 index 000000000000..04e13b1edc27 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_10.json @@ -0,0 +1,1257 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 4, + "uri": "/functions_arithmetic_decimal.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_datetime.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 2, + "name": "gte:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "lt:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 4, + "name": "multiply:opt_decimal_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 5, + "name": "subtract:opt_decimal_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 6, + "name": "sum:opt_decimal" + } + } + ], + "relations": [ + { + "root": { + "input": { + "fetch": { + "common": { + "direct": { + } + }, + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "C_CUSTKEY", + "C_NAME", + "C_ADDRESS", + "C_NATIONKEY", + "C_PHONE", + "C_ACCTBAL", + "C_MKTSEGMENT", + "C_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + "typeVariationReference": 0, + 
"nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 117, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "O_ORDERKEY", + "O_CUSTKEY", + "O_ORDERSTATUS", + "O_TOTALPRICE", + "O_ORDERDATE", + "O_ORDERPRIORITY", + "O_CLERK", + "O_SHIPPRIORITY", + "O_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 79, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 
0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_2", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "N_NATIONKEY", + "N_NAME", + "N_REGIONKEY", + "N_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 152, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_3", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 17 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + } + } + ] + } + 
} + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 12 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "1993-10-01", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 12 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "1994-01-01", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 25 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "R", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 33 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 34 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 7 + } + }, + "rootReference": { + } + } + }, + { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } 
+ } + }, + { + "value": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 23 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + } + ] + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 6, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 7 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 7 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + } + ] + } + }, + "offset": "0", + "count": "20" + } + }, + "names": [ + "C_CUSTKEY", + "C_NAME", + "REVENUE", + "C_ACCTBAL", + "N_NAME", + "C_ADDRESS", + "C_PHONE", + "C_COMMENT" + ] + } + } + ], + 
"expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_11.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_11.json new file mode 100644 index 000000000000..916bc6f71c2c --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_11.json @@ -0,0 +1,1059 @@ +{ + "extensionUris": [{ + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, { + "extensionUriAnchor": 3, + "uri": "/functions_arithmetic_decimal.yaml" + }, { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + }], + "extensions": [{ + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "equal:any1_any1" + } + }, { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 2, + "name": "multiply:opt_decimal_decimal" + } + }, { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "sum:opt_decimal" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 4, + "name": "gt:any1_any1" + } + }], + "relations": [{ + "root": { + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [16, 17] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["PS_PARTKEY", "PS_SUPPKEY", "PS_AVAILQTY", "PS_SUPPLYCOST", "PS_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 199, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_NATIONKEY", "S_PHONE", "S_ACCTBAL", "S_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 101, + "typeVariationReference": 0, + 
"nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["N_NATIONKEY", "N_NAME", "N_REGIONKEY", "N_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "varchar": { + "length": 152, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_2", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }] + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 12 + } + }, + "rootReference": { + } + } + } + }] + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 13 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "JAPAN", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }] + } + } + }] + } + } + } + }, + "expressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + 
"arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }] + } + }] + } + }, + "groupings": [{ + "groupingExpressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }] + }], + "measures": [{ + "measure": { + "functionReference": 3, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + }] + } + }] + } + }, + "condition": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "subquery": { + "scalar": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [1] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [16] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["PS_PARTKEY", "PS_SUPPKEY", "PS_AVAILQTY", "PS_SUPPLYCOST", "PS_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 199, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_3", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_NATIONKEY", "S_PHONE", "S_ACCTBAL", "S_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "fixedChar": 
{ + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 101, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_4", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["N_NATIONKEY", "N_NAME", "N_REGIONKEY", "N_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "varchar": { + "length": 152, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_5", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }] + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 12 + } + }, + "rootReference": { + } + } + } + }] + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 13 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "JAPAN", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }] + } + } + }] + } + } + } + }, + "expressions": [{ + "scalarFunction": { + 
"functionReference": 2, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }] + } + }] + } + }, + "groupings": [{ + "groupingExpressions": [] + }], + "measures": [{ + "measure": { + "functionReference": 3, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }] + } + }] + } + }, + "expressions": [{ + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "decimal": { + "scale": 10, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "literal": { + "decimal": { + "value": "QEIPAAAAAAAAAAAAAAAAAA==", + "precision": 11, + "scale": 10 + }, + "nullable": false, + "typeVariationReference": 0 + } + } + }] + } + }] + } + } + } + } + } + }] + } + } + } + }, + "sorts": [{ + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + }] + } + }, + "names": ["PS_PARTKEY", "value"] + } + }], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_6.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_6.json new file mode 100644 index 000000000000..18fb9781da55 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_6.json @@ -0,0 +1,585 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 4, + "uri": "/functions_arithmetic_decimal.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_datetime.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "gte:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 2, + "name": "lt:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "gte:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 4, + "name": "lte:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 5, + "name": "lt:any1_any1" + } + }, + { + "extensionFunction": { + 
"extensionUriReference": 4, + "functionAnchor": 6, + "name": "multiply:opt_decimal_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 7, + "name": "sum:opt_decimal" + } + } + ], + "relations": [ + { + "root": { + "input": { + "aggregate": { + "common": { + "direct": {} + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 16 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": {} + }, + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "cast": { + "type": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + 
"fixedChar": "1994-01-01", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "cast": { + "type": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "1995-01-01", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "literal": { + "decimal": { + "value": "BQAAAAAAAAAAAAAAAAAAAA==", + "precision": 3, + "scale": 2 + }, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "literal": { + "decimal": { + "value": "BwAAAAAAAAAAAAAAAAAAAA==", + "precision": 3, + "scale": 2 + }, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 24, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "scalarFunction": { + "functionReference": 6, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": {} + } + } + } + ] + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [] + } + ], + "measures": [ + { + "measure": { + "functionReference": 7, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": 
"AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": {} + } + } + } + ] + } + } + ] + } + }, + "names": [ + "REVENUE" + ] + } + } + ], + "expectedTypeUrls": [] +} \ No newline at end of file From 6cca0f7f3725406aef4deb1ff1bbe299867ce82c Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Wed, 10 Jul 2024 05:08:22 +0800 Subject: [PATCH 04/19] Fix bug when pushing projection under joins (#11333) * Fix bug in `ProjectionPushdown` * add order by * Fix join on --- .../physical_optimizer/projection_pushdown.rs | 50 ++++++++++------ datafusion/sqllogictest/test_files/join.slt | 58 +++++++++++++++++++ 2 files changed, 89 insertions(+), 19 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/projection_pushdown.rs b/datafusion/core/src/physical_optimizer/projection_pushdown.rs index 70524dfcea7d..3c2be59f7504 100644 --- a/datafusion/core/src/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/projection_pushdown.rs @@ -46,7 +46,7 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, }; -use datafusion_common::{DataFusionError, JoinSide}; +use datafusion_common::{internal_err, JoinSide}; use datafusion_physical_expr::expressions::{Column, Literal}; use datafusion_physical_expr::{ utils::collect_columns, Partitioning, PhysicalExpr, PhysicalExprRef, @@ -640,6 +640,7 @@ fn try_pushdown_through_hash_join( &projection_as_columns[0..=far_right_left_col_ind as _], &projection_as_columns[far_left_right_col_ind as _..], hash_join.on(), + hash_join.left().schema().fields().len(), ) else { return Ok(None); }; @@ -649,8 +650,7 @@ fn try_pushdown_through_hash_join( &projection_as_columns[0..=far_right_left_col_ind as _], &projection_as_columns[far_left_right_col_ind as _..], filter, - hash_join.left(), - hash_join.right(), + hash_join.left().schema().fields().len(), ) { Some(updated_filter) => Some(updated_filter), None => return Ok(None), @@ -750,8 +750,7 @@ fn try_swapping_with_nested_loop_join( &projection_as_columns[0..=far_right_left_col_ind as _], &projection_as_columns[far_left_right_col_ind as _..], filter, - nl_join.left(), - nl_join.right(), + nl_join.left().schema().fields().len(), ) { Some(updated_filter) => Some(updated_filter), None => return Ok(None), @@ -806,6 +805,7 @@ fn try_swapping_with_sort_merge_join( &projection_as_columns[0..=far_right_left_col_ind as _], &projection_as_columns[far_left_right_col_ind as _..], sm_join.on(), + sm_join.left().schema().fields().len(), ) else { return Ok(None); }; @@ -859,6 +859,7 @@ fn try_swapping_with_sym_hash_join( &projection_as_columns[0..=far_right_left_col_ind as _], &projection_as_columns[far_left_right_col_ind as _..], sym_join.on(), + sym_join.left().schema().fields().len(), ) else { return Ok(None); }; @@ -868,8 +869,7 @@ fn try_swapping_with_sym_hash_join( &projection_as_columns[0..=far_right_left_col_ind as _], &projection_as_columns[far_left_right_col_ind as _..], filter, - sym_join.left(), - sym_join.right(), + sym_join.left().schema().fields().len(), ) { Some(updated_filter) => Some(updated_filter), None => return Ok(None), @@ -1090,6 +1090,7 @@ fn update_join_on( proj_left_exprs: &[(Column, String)], proj_right_exprs: &[(Column, String)], hash_join_on: &[(PhysicalExprRef, PhysicalExprRef)], + left_field_size: usize, ) -> Option> { // TODO: Clippy wants the "map" call removed, but doing so generates // 
a compilation error. Remove the clippy directive once this @@ -1100,8 +1101,9 @@ fn update_join_on( .map(|(left, right)| (left, right)) .unzip(); - let new_left_columns = new_columns_for_join_on(&left_idx, proj_left_exprs); - let new_right_columns = new_columns_for_join_on(&right_idx, proj_right_exprs); + let new_left_columns = new_columns_for_join_on(&left_idx, proj_left_exprs, 0); + let new_right_columns = + new_columns_for_join_on(&right_idx, proj_right_exprs, left_field_size); match (new_left_columns, new_right_columns) { (Some(left), Some(right)) => Some(left.into_iter().zip(right).collect()), @@ -1112,9 +1114,14 @@ fn update_join_on( /// This function generates a new set of columns to be used in a hash join /// operation based on a set of equi-join conditions (`hash_join_on`) and a /// list of projection expressions (`projection_exprs`). +/// +/// Notes: Column indices in the projection expressions are based on the join schema, +/// whereas the join on expressions are based on the join child schema. `column_index_offset` +/// represents the offset between them. fn new_columns_for_join_on( hash_join_on: &[&PhysicalExprRef], projection_exprs: &[(Column, String)], + column_index_offset: usize, ) -> Option> { let new_columns = hash_join_on .iter() @@ -1130,6 +1137,8 @@ fn new_columns_for_join_on( .enumerate() .find(|(_, (proj_column, _))| { column.name() == proj_column.name() + && column.index() + column_index_offset + == proj_column.index() }) .map(|(index, (_, alias))| Column::new(alias, index)); if let Some(new_column) = new_column { @@ -1138,10 +1147,10 @@ fn new_columns_for_join_on( // If the column is not found in the projection expressions, // it means that the column is not projected. In this case, // we cannot push the projection down. - Err(DataFusionError::Internal(format!( + internal_err!( "Column {:?} not found in projection expressions", column - ))) + ) } } else { Ok(Transformed::no(expr)) @@ -1160,21 +1169,20 @@ fn update_join_filter( projection_left_exprs: &[(Column, String)], projection_right_exprs: &[(Column, String)], join_filter: &JoinFilter, - join_left: &Arc, - join_right: &Arc, + left_field_size: usize, ) -> Option { let mut new_left_indices = new_indices_for_join_filter( join_filter, JoinSide::Left, projection_left_exprs, - join_left.schema(), + 0, ) .into_iter(); let mut new_right_indices = new_indices_for_join_filter( join_filter, JoinSide::Right, projection_right_exprs, - join_right.schema(), + left_field_size, ) .into_iter(); @@ -1204,20 +1212,24 @@ fn update_join_filter( /// This function determines and returns a vector of indices representing the /// positions of columns in `projection_exprs` that are involved in `join_filter`, /// and correspond to a particular side (`join_side`) of the join operation. +/// +/// Notes: Column indices in the projection expressions are based on the join schema, +/// whereas the join filter is based on the join child schema. `column_index_offset` +/// represents the offset between them. 
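+///
+/// Illustrative example (hypothetical sizes): if the left join child has 3
+/// fields, a right-side column at index 0 of the child schema appears at
+/// index 3 of the join schema, so callers pass a `column_index_offset` of 3
+/// for the right side and 0 for the left side.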
fn new_indices_for_join_filter( join_filter: &JoinFilter, join_side: JoinSide, projection_exprs: &[(Column, String)], - join_child_schema: SchemaRef, + column_index_offset: usize, ) -> Vec { join_filter .column_indices() .iter() .filter(|col_idx| col_idx.side == join_side) .filter_map(|col_idx| { - projection_exprs.iter().position(|(col, _)| { - col.name() == join_child_schema.fields()[col_idx.index].name() - }) + projection_exprs + .iter() + .position(|(col, _)| col_idx.index + column_index_offset == col.index()) }) .collect() } diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index 3c89109145d7..12cb8b3985c7 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -986,3 +986,61 @@ DROP TABLE employees statement ok DROP TABLE department + + +# Test issue: https://github.com/apache/datafusion/issues/11269 +statement ok +CREATE TABLE t1 (v0 BIGINT) AS VALUES (-503661263); + +statement ok +CREATE TABLE t2 (v0 DOUBLE) AS VALUES (-1.663563947387); + +statement ok +CREATE TABLE t3 (v0 DOUBLE) AS VALUES (0.05112015193508901); + +query RR +SELECT t3.v0, t2.v0 FROM t1,t2,t3 WHERE t3.v0 >= t1.v0; +---- +0.051120151935 -1.663563947387 + +statement ok +DROP TABLE t1; + +statement ok +DROP TABLE t2; + +statement ok +DROP TABLE t3; + + +# Test issue: https://github.com/apache/datafusion/issues/11275 +statement ok +CREATE TABLE t0 (v1 BOOLEAN) AS VALUES (false), (null); + +statement ok +CREATE TABLE t1 (v1 BOOLEAN) AS VALUES (false), (null), (false); + +statement ok +CREATE TABLE t2 (v1 BOOLEAN) AS VALUES (false), (true); + +query BB +SELECT t2.v1, t1.v1 FROM t0, t1, t2 WHERE t2.v1 IS DISTINCT FROM t0.v1 ORDER BY 1,2; +---- +false false +false false +false NULL +true false +true false +true false +true false +true NULL +true NULL + +statement ok +DROP TABLE t0; + +statement ok +DROP TABLE t1; + +statement ok +DROP TABLE t2; From 7f25d9dd63918ecfeecaa5810d2a5c4fc9155c5d Mon Sep 17 00:00:00 2001 From: Oleks V Date: Tue, 9 Jul 2024 14:09:57 -0700 Subject: [PATCH 05/19] Minor: some cosmetics in `filter.rs`, fix clippy due to logical conflict (#11368) * Minor: some cosmetics in `filter.rs` * Minor: some cosmetics in `filter.rs` --- datafusion/physical-plan/src/filter.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 84afc227578f..c5ba3992d3b4 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -15,9 +15,6 @@ // specific language governing permissions and limitations // under the License. -//! FilterExec evaluates a boolean predicate against all input batches to determine which rows to -//! include in its output batches. - use std::any::Any; use std::pin::Pin; use std::sync::Arc; @@ -60,7 +57,7 @@ pub struct FilterExec { input: Arc, /// Execution metrics metrics: ExecutionPlanMetricsSet, - /// Selectivity for statistics. 0 = no rows, 100 all rows + /// Selectivity for statistics. 
0 = no rows, 100 = all rows default_selectivity: u8, cache: PlanProperties, } @@ -91,14 +88,14 @@ impl FilterExec { Ok(Self { predicate, - input: input.clone(), + input: Arc::clone(&input), metrics: ExecutionPlanMetricsSet::new(), default_selectivity, cache, }) } other => { - plan_err!("Filter predicate must return boolean values, not {other:?}") + plan_err!("Filter predicate must return BOOLEAN values, got {other:?}") } } } @@ -108,7 +105,9 @@ impl FilterExec { default_selectivity: u8, ) -> Result { if default_selectivity > 100 { - return plan_err!("Default filter selectivity needs to be less than 100"); + return plan_err!( + "Default filter selectivity value needs to be less than or equal to 100" + ); } self.default_selectivity = default_selectivity; Ok(self) @@ -369,12 +368,12 @@ pub(crate) fn batch_filter( .and_then(|v| v.into_array(batch.num_rows())) .and_then(|array| { let filter_array = match as_boolean_array(&array) { - Ok(boolean_array) => { - Ok(boolean_array.to_owned()) - }, + Ok(boolean_array) => Ok(boolean_array.to_owned()), Err(_) => { let Ok(null_array) = as_null_array(&array) else { - return internal_err!("Cannot create filter_array from non-boolean predicates, unable to continute"); + return internal_err!( + "Cannot create filter_array from non-boolean predicates" + ); }; // if the predicate is null, then the result is also null From 9df393ea5539d5c83f8c16b028f44468727e3bee Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2024 17:11:48 -0400 Subject: [PATCH 06/19] Update prost-derive requirement from 0.12 to 0.13 (#11355) Updates the requirements on [prost-derive](https://github.com/tokio-rs/prost) to permit the latest version. - [Release notes](https://github.com/tokio-rs/prost/releases) - [Changelog](https://github.com/tokio-rs/prost/blob/master/CHANGELOG.md) - [Commits](https://github.com/tokio-rs/prost/compare/v0.12.0...v0.13.0) --- updated-dependencies: - dependency-name: prost-derive dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- datafusion-examples/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 52e3a5525717..626c365af21c 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -73,7 +73,7 @@ mimalloc = { version = "0.1", default-features = false } num_cpus = { workspace = true } object_store = { workspace = true, features = ["aws", "http"] } prost = { version = "0.12", default-features = false } -prost-derive = { version = "0.12", default-features = false } +prost-derive = { version = "0.13", default-features = false } serde = { version = "1.0.136", features = ["derive"] } serde_json = { workspace = true } tempfile = { workspace = true } From c018c74ae1ca2339bd530e5e6724e45a16e3900b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 9 Jul 2024 17:44:39 -0400 Subject: [PATCH 07/19] Minor: update dashmap (#11335) --- Cargo.toml | 2 +- datafusion-cli/Cargo.lock | 43 +++++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f87205f0d067..6dd434abc87c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -85,7 +85,7 @@ bigdecimal = "=0.4.1" bytes = "1.4" chrono = { version = "0.4.34", default-features = false } ctor = "0.2.0" -dashmap = "5.5.0" +dashmap = "6.0.1" datafusion = { path = "datafusion/core", version = "40.0.0", default-features = false } datafusion-common = { path = "datafusion/common", version = "40.0.0", default-features = false } datafusion-common-runtime = { path = "datafusion/common-runtime", version = "40.0.0" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 42ec5922a73f..8af42cb43932 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -443,7 +443,7 @@ dependencies = [ "fastrand 1.9.0", "hex", "http 0.2.12", - "hyper 0.14.29", + "hyper 0.14.30", "ring 0.16.20", "time", "tokio", @@ -609,7 +609,7 @@ dependencies = [ "fastrand 1.9.0", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.29", + "hyper 0.14.30", "hyper-rustls 0.23.2", "lazy_static", "pin-project-lite", @@ -631,7 +631,7 @@ dependencies = [ "futures-core", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.29", + "hyper 0.14.30", "once_cell", "percent-encoding", "pin-project-lite", @@ -875,9 +875,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.106" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "066fce287b1d4eafef758e89e09d724a24808a9196fe9756b8ca90e86d0719a2" +checksum = "eaff6f8ce506b9773fa786672d63fc7a191ffea1be33f72bbd4aeacefca9ffc8" dependencies = [ "jobserver", "libc", @@ -1055,6 +1055,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + [[package]] name = "crunchy" version = "0.2.2" @@ -1110,11 +1116,12 @@ checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" [[package]] name = "dashmap" -version = "5.5.3" +version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +checksum = "804c8821570c3f8b70230c2ba75ffa5c0f9a4189b9a432b6656c536712acae28" dependencies = [ "cfg-if", + "crossbeam-utils", "hashbrown 0.14.5", 
"lock_api", "once_cell", @@ -1941,9 +1948,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.29" +version = "0.14.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f361cde2f109281a220d4307746cdfd5ee3f410da58a70377762396775634b33" +checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9" dependencies = [ "bytes", "futures-channel", @@ -1965,9 +1972,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4fe55fb7a772d59a5ff1dfbff4fe0258d19b89fec4b233e75d35d5d2316badc" +checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" dependencies = [ "bytes", "futures-channel", @@ -1990,7 +1997,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" dependencies = [ "http 0.2.12", - "hyper 0.14.29", + "hyper 0.14.30", "log", "rustls 0.20.9", "rustls-native-certs 0.6.3", @@ -2006,7 +2013,7 @@ checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" dependencies = [ "futures-util", "http 1.1.0", - "hyper 1.4.0", + "hyper 1.4.1", "hyper-util", "rustls 0.23.11", "rustls-native-certs 0.7.1", @@ -2027,7 +2034,7 @@ dependencies = [ "futures-util", "http 1.1.0", "http-body 1.0.0", - "hyper 1.4.0", + "hyper 1.4.1", "pin-project-lite", "socket2", "tokio", @@ -2502,7 +2509,7 @@ dependencies = [ "chrono", "futures", "humantime", - "hyper 1.4.0", + "hyper 1.4.1", "itertools", "md-5", "parking_lot", @@ -2976,7 +2983,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.0", "http-body-util", - "hyper 1.4.0", + "hyper 1.4.1", "hyper-rustls 0.27.2", "hyper-util", "ipnet", @@ -3908,9 +3915,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ "getrandom", "serde", From 1e0c06e14ae821ac6aa344f8acb638431a898ae8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 9 Jul 2024 17:45:34 -0400 Subject: [PATCH 08/19] Improve and test dataframe API examples in docs (#11290) * Improve and test dataframe API examples in docs * Update introduction with pointer to user guide * Make example consistent * Make read_csv comment consistent * clarifications * prettier + tweaks * Update docs/source/library-user-guide/using-the-dataframe-api.md Co-authored-by: Eric Fredine * Update docs/source/library-user-guide/using-the-dataframe-api.md Co-authored-by: Eric Fredine --------- Co-authored-by: Eric Fredine --- datafusion-examples/README.md | 1 + datafusion/core/src/lib.rs | 8 +- .../using-the-dataframe-api.md | 302 +++++++++++++----- .../library-user-guide/using-the-sql-api.md | 17 +- 4 files changed, 239 insertions(+), 89 deletions(-) diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 90469e6715a6..2696f74775cf 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -55,6 +55,7 @@ cargo run --example dataframe - [`composed_extension_codec`](examples/composed_extension_codec.rs): Example of using multiple extension codecs for serialization / deserialization - 
[`csv_sql_streaming.rs`](examples/csv_sql_streaming.rs): Build and run a streaming query plan from a SQL statement against a local CSV file - [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against a custom datasource (TableProvider) +- [`custom_file_format.rs`](examples/custom_file_format.rs): Write data to a custom file format - [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3 - [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame against a local parquet file - [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query using a DataFrame against data in memory diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 956e9f7246a3..f5805bc06982 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -641,5 +641,11 @@ doc_comment::doctest!( #[cfg(doctest)] doc_comment::doctest!( "../../../docs/source/library-user-guide/using-the-sql-api.md", - library_user_guide_example_usage + library_user_guide_sql_api +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/using-the-dataframe-api.md", + library_user_guide_dataframe_api ); diff --git a/docs/source/library-user-guide/using-the-dataframe-api.md b/docs/source/library-user-guide/using-the-dataframe-api.md index c4f4ecd4f137..9e7774cbb944 100644 --- a/docs/source/library-user-guide/using-the-dataframe-api.md +++ b/docs/source/library-user-guide/using-the-dataframe-api.md @@ -19,129 +19,267 @@ # Using the DataFrame API -## What is a DataFrame +The [Users Guide] introduces the [`DataFrame`] API and this section describes +that API in more depth. -`DataFrame` in `DataFrame` is modeled after the Pandas DataFrame interface, and is a thin wrapper over LogicalPlan that adds functionality for building and executing those plans. +## What is a DataFrame? -```rust -pub struct DataFrame { - session_state: SessionState, - plan: LogicalPlan, -} -``` - -You can build up `DataFrame`s using its methods, similarly to building `LogicalPlan`s using `LogicalPlanBuilder`: - -```rust -let df = ctx.table("users").await?; +As described in the [Users Guide], DataFusion [`DataFrame`]s are modeled after +the [Pandas DataFrame] interface, and are implemented as thin wrapper over a +[`LogicalPlan`] that adds functionality for building and executing those plans. -// Create a new DataFrame sorted by `id`, `bank_account` -let new_df = df.select(vec![col("id"), col("bank_account")])? - .sort(vec![col("id")])?; - -// Build the same plan using the LogicalPlanBuilder -let plan = LogicalPlanBuilder::from(&df.to_logical_plan()) - .project(vec![col("id"), col("bank_account")])? - .sort(vec![col("id")])? - .build()?; -``` - -You can use `collect` or `execute_stream` to execute the query. +The simplest possible dataframe is one that scans a table and that table can be +in a file or in memory. ## How to generate a DataFrame -You can directly use the `DataFrame` API or generate a `DataFrame` from a SQL query. - -For example, to use `sql` to construct `DataFrame`: +You can construct [`DataFrame`]s programmatically using the API, similarly to +other DataFrame APIs. 
For example, you can read an in memory `RecordBatch` into +a `DataFrame`: ```rust -let ctx = SessionContext::new(); -// Register the in-memory table containing the data -ctx.register_table("users", Arc::new(create_memtable()?))?; -let dataframe = ctx.sql("SELECT * FROM users;").await?; +use std::sync::Arc; +use datafusion::prelude::*; +use datafusion::arrow::array::{ArrayRef, Int32Array}; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::error::Result; + +#[tokio::main] +async fn main() -> Result<()> { + let ctx = SessionContext::new(); + // Register an in-memory table containing the following data + // id | bank_account + // ---|------------- + // 1 | 9000 + // 2 | 8000 + // 3 | 7000 + let data = RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), + ("bank_account", Arc::new(Int32Array::from(vec![9000, 8000, 7000]))), + ])?; + // Create a DataFrame that scans the user table, and finds + // all users with a bank account at least 8000 + // and sorts the results by bank account in descending order + let dataframe = ctx + .read_batch(data)? + .filter(col("bank_account").gt_eq(lit(8000)))? // bank_account >= 8000 + .sort(vec![col("bank_account").sort(false, true)])?; // ORDER BY bank_account DESC + + Ok(()) +} ``` -To construct `DataFrame` using the API: +You can _also_ generate a `DataFrame` from a SQL query and use the DataFrame's APIs +to manipulate the output of the query. ```rust -let ctx = SessionContext::new(); -// Register the in-memory table containing the data -ctx.register_table("users", Arc::new(create_memtable()?))?; -let dataframe = ctx - .table("users") - .filter(col("a").lt_eq(col("b")))? - .sort(vec![col("a").sort(true, true), col("b").sort(false, false)])?; +use std::sync::Arc; +use datafusion::prelude::*; +use datafusion::assert_batches_eq; +use datafusion::arrow::array::{ArrayRef, Int32Array}; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::error::Result; + +#[tokio::main] +async fn main() -> Result<()> { + let ctx = SessionContext::new(); + // Register the same in-memory table as the previous example + let data = RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), + ("bank_account", Arc::new(Int32Array::from(vec![9000, 8000, 7000]))), + ])?; + ctx.register_batch("users", data)?; + // Create a DataFrame using SQL + let dataframe = ctx.sql("SELECT * FROM users;") + .await? + // Note we can filter the output of the query using the DataFrame API + .filter(col("bank_account").gt_eq(lit(8000)))?; // bank_account >= 8000 + + let results = &dataframe.collect().await?; + + // use the `assert_batches_eq` macro to show the output + assert_batches_eq!( + vec![ + "+----+--------------+", + "| id | bank_account |", + "+----+--------------+", + "| 1 | 9000 |", + "| 2 | 8000 |", + "+----+--------------+", + ], + &results + ); + Ok(()) +} ``` ## Collect / Streaming Exec -DataFusion `DataFrame`s are "lazy", meaning they do not do any processing until they are executed, which allows for additional optimizations. +DataFusion [`DataFrame`]s are "lazy", meaning they do no processing until +they are executed, which allows for additional optimizations. -When you have a `DataFrame`, you can run it in one of three ways: +You can run a `DataFrame` in one of three ways: -1. `collect` which executes the query and buffers all the output into a `Vec` -2. 
`streaming_exec`, which begins executions and returns a `SendableRecordBatchStream` which incrementally computes output on each call to `next()`
-3. `cache` which executes the query and buffers the output into a new in memory DataFrame.
+1. `collect`: executes the query and buffers all the output into a `Vec<RecordBatch>`
+2. `execute_stream`: begins execution and returns a `SendableRecordBatchStream`, which incrementally computes output on each call to `next()`
+3. `cache`: executes the query and buffers the output into a new in-memory `DataFrame`.

-You can just collect all outputs once like:
+To collect all outputs into a memory buffer, use the `collect` method:

```rust
-let ctx = SessionContext::new();
-let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
-let batches = df.collect().await?;
+use datafusion::prelude::*;
+use datafusion::error::Result;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    // read the contents of a CSV file into a DataFrame
+    let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    // execute the query and collect the results as a Vec<RecordBatch>
+    let batches = df.collect().await?;
+    for record_batch in batches {
+        println!("{record_batch:?}");
+    }
+    Ok(())
+}
```

-You can also use stream output to incrementally generate output one `RecordBatch` at a time
+Use `execute_stream` to incrementally generate output one `RecordBatch` at a time:

```rust
-let ctx = SessionContext::new();
-let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
-let mut stream = df.execute_stream().await?;
-while let Some(rb) = stream.next().await {
-    println!("{rb:?}");
+use datafusion::prelude::*;
+use datafusion::error::Result;
+use futures::stream::StreamExt;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    // read example.csv file into a DataFrame
+    let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    // begin execution (returns quickly, does not compute results)
+    let mut stream = df.execute_stream().await?;
+    // results are returned incrementally as they are computed
+    while let Some(record_batch) = stream.next().await {
+        println!("{record_batch:?}");
+    }
+    Ok(())
}
```

# Write DataFrame to Files

-You can also serialize `DataFrame` to a file. For now, `Datafusion` supports write `DataFrame` to `csv`, `json` and `parquet`.
-
-When writing a file, DataFusion will execute the DataFrame and stream the results to a file.
+You can also write the contents of a `DataFrame` to a file. When writing a file,
+DataFusion executes the `DataFrame` and streams the results to the output.
+DataFusion comes with support for writing `csv`, `json`, `arrow`, `avro`, and
+`parquet` files, and supports writing custom file formats via API (see
+[`custom_file_format.rs`] for an example).

-For example, to write a csv_file
+For example, to read a CSV file and write it to a parquet file, use the
+[`DataFrame::write_parquet`] method:

```rust
-let ctx = SessionContext::new();
-// Register the in-memory table containing the data
-ctx.register_table("users", Arc::new(mem_table))?;
-let dataframe = ctx.sql("SELECT * FROM users;").await?;
-
-dataframe
-    .write_csv("user_dataframe.csv", DataFrameWriteOptions::default(), None)
-    .await;
+use datafusion::prelude::*;
+use datafusion::error::Result;
+use datafusion::dataframe::DataFrameWriteOptions;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    // read example.csv file into a DataFrame
+    let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    // stream the contents of the DataFrame to the `example.parquet` file
+    df.write_parquet(
+        "example.parquet",
+        DataFrameWriteOptions::new(),
+        None, // writer_options
+    ).await;
+    Ok(())
+}
```

-and the file will look like (Example Output):
+[`custom_file_format.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/custom_file_format.rs

-```
-id,bank_account
-1,9000
+The output file will look like (Example Output):
+
+```sql
+> select * from '../datafusion/core/example.parquet';
++---+---+---+
+| a | b | c |
++---+---+---+
+| 1 | 2 | 3 |
++---+---+---+
```

-## Transform between LogicalPlan and DataFrame
+## Relationship between `LogicalPlan`s and `DataFrame`s

-As shown above, `DataFrame` is just a very thin wrapper of `LogicalPlan`, so you can easily go back and forth between them.
+The `DataFrame` struct is defined like this:

```rust
-// Just combine LogicalPlan with SessionContext and you get a DataFrame
-let ctx = SessionContext::new();
-// Register the in-memory table containing the data
-ctx.register_table("users", Arc::new(mem_table))?;
-let dataframe = ctx.sql("SELECT * FROM users;").await?;
+use datafusion::execution::session_state::SessionState;
+use datafusion::logical_expr::LogicalPlan;
+pub struct DataFrame {
+    // state required to execute a LogicalPlan
+    session_state: Box<SessionState>,
+    // LogicalPlan that describes the computation to perform
+    plan: LogicalPlan,
+}
+```

-// get LogicalPlan in dataframe
-let plan = dataframe.logical_plan().clone();
+As shown above, `DataFrame` is a thin wrapper over `LogicalPlan`, so you can
+easily go back and forth between them.
+
+```rust
+use datafusion::prelude::*;
+use datafusion::error::Result;
+use datafusion::logical_expr::LogicalPlanBuilder;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    // read example.csv file into a DataFrame
+    let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    // You can easily get the LogicalPlan from the DataFrame
+    let (_state, plan) = df.into_parts();
+    // Combine a LogicalPlan with a SessionContext to get a DataFrame back
+    let new_df = DataFrame::new(ctx.state(), plan);
+    Ok(())
+}
```
+
+In fact, using the [`DataFrame`] API you can create the same
+[`LogicalPlan`]s as when using [`LogicalPlanBuilder`]:
+
+```rust
+use datafusion::prelude::*;
+use datafusion::error::Result;
+use datafusion::logical_expr::LogicalPlanBuilder;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    // read example.csv file into a DataFrame
+    let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    // Create a new DataFrame selecting columns `a` and `b`, sorted by `a`
+    let new_df = df.select(vec![col("a"), col("b")])?
+        .sort(vec![col("a")])?;
+    // Build the same plan using the LogicalPlanBuilder
+    // Similar to `SELECT a, b FROM example.csv ORDER BY a`
+    let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
+    let (_state, plan) = df.into_parts(); // get the DataFrame's LogicalPlan
+    let plan = LogicalPlanBuilder::from(plan)
+        .project(vec![col("a"), col("b")])?
+        .sort(vec![col("a")])?
+        .build()?;
+    // prove they are the same
+    assert_eq!(new_df.logical_plan(), &plan);
+    Ok(())
+}
+```
+
+[users guide]: ../user-guide/dataframe.md
+[pandas dataframe]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
+[`dataframe`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html
+[`logicalplan`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html
+[`logicalplanbuilder`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/struct.LogicalPlanBuilder.html
+[`dataframe::write_parquet`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.write_parquet
diff --git a/docs/source/library-user-guide/using-the-sql-api.md b/docs/source/library-user-guide/using-the-sql-api.md
index 1a25f078cc2e..9c32004db435 100644
--- a/docs/source/library-user-guide/using-the-sql-api.md
+++ b/docs/source/library-user-guide/using-the-sql-api.md
@@ -29,16 +29,15 @@
 using the [`SessionContext::sql`] method. For lower level control such as
 preventing DDL, you can use [`SessionContext::sql_with_options`] or the
 [`SessionState`] APIs

-[`sessioncontext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html
-[`sessioncontext::sql`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.sql
-[`sessioncontext::sql_with_options`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.sql_with_options
-[`sessionstate`]: https://docs.rs/datafusion/latest/datafusion/execution/session_state/struct.SessionState.html
-
 ## Registering Data Sources using `SessionContext::register*`

 The `SessionContext::register*` methods tell DataFusion the name
 of the source and how to read data.
Once registered, you can execute SQL queries -using the `SessionContext::sql` method referring to your data source as a table. +using the [`SessionContext::sql`] method referring to your data source as a table. + +The [`SessionContext::sql`] method returns a `DataFrame` for ease of +use. See the ["Using the DataFrame API"] section for more information on how to +work with DataFrames. ### Read a CSV File @@ -215,3 +214,9 @@ async fn main() -> Result<()> { Ok(()) } ``` + +[`sessioncontext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html +[`sessioncontext::sql`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.sql +[`sessioncontext::sql_with_options`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.sql_with_options +[`sessionstate`]: https://docs.rs/datafusion/latest/datafusion/execution/session_state/struct.SessionState.html +["using the dataframe api"]: ../library-user-guide/using-the-dataframe-api.md From 16a3148354e81e1ae4e2aebdd83c07799164ac14 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 9 Jul 2024 21:30:28 -0400 Subject: [PATCH 09/19] Remove redundant `unalias_nested` calls for creating Filter's (#11340) * Remove uncessary unalias_nested calls when creating Filter * simplify --- datafusion/expr/src/logical_plan/plan.rs | 54 ++++--------------- .../optimizer/src/common_subexpr_eliminate.rs | 11 +--- datafusion/optimizer/src/push_down_filter.rs | 4 +- 3 files changed, 13 insertions(+), 56 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 998b5bdcb60c..bde9655b8a39 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -41,9 +41,7 @@ use crate::{ }; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use datafusion_common::tree_node::{ - Transformed, TransformedResult, TreeNode, TreeNodeRecursion, -}; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; use datafusion_common::{ aggregate_functional_dependencies, internal_err, plan_err, Column, Constraints, DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, @@ -645,39 +643,6 @@ impl LogicalPlan { Ok(LogicalPlan::Values(Values { schema, values })) } LogicalPlan::Filter(Filter { predicate, input }) => { - // todo: should this logic be moved to Filter::try_new? - - // filter predicates should not contain aliased expressions so we remove any aliases - // before this logic was added we would have aliases within filters such as for - // benchmark q6: - // - // lineitem.l_shipdate >= Date32(\"8766\") - // AND lineitem.l_shipdate < Date32(\"9131\") - // AND CAST(lineitem.l_discount AS Decimal128(30, 15)) AS lineitem.l_discount >= - // Decimal128(Some(49999999999999),30,15) - // AND CAST(lineitem.l_discount AS Decimal128(30, 15)) AS lineitem.l_discount <= - // Decimal128(Some(69999999999999),30,15) - // AND lineitem.l_quantity < Decimal128(Some(2400),15,2) - - let predicate = predicate - .transform_down(|expr| { - match expr { - Expr::Exists { .. 
} - | Expr::ScalarSubquery(_) - | Expr::InSubquery(_) => { - // subqueries could contain aliases so we don't recurse into those - Ok(Transformed::new(expr, false, TreeNodeRecursion::Jump)) - } - Expr::Alias(_) => Ok(Transformed::new( - expr.unalias(), - true, - TreeNodeRecursion::Jump, - )), - _ => Ok(Transformed::no(expr)), - } - }) - .data()?; - Filter::try_new(predicate, input).map(LogicalPlan::Filter) } LogicalPlan::Repartition(_) => Ok(self), @@ -878,7 +843,7 @@ impl LogicalPlan { } LogicalPlan::Filter { .. } => { assert_eq!(1, expr.len()); - let predicate = expr.pop().unwrap().unalias_nested().data; + let predicate = expr.pop().unwrap(); Filter::try_new(predicate, Arc::new(inputs.swap_remove(0))) .map(LogicalPlan::Filter) @@ -2117,6 +2082,9 @@ pub struct Filter { impl Filter { /// Create a new filter operator. + /// + /// Notes: as Aliases have no effect on the output of a filter operator, + /// they are removed from the predicate expression. pub fn try_new(predicate: Expr, input: Arc) -> Result { // Filter predicates must return a boolean value so we try and validate that here. // Note that it is not always possible to resolve the predicate expression during plan @@ -2940,7 +2908,7 @@ mod tests { use crate::logical_plan::table_scan; use crate::{col, exists, in_subquery, lit, placeholder, GroupingSet}; - use datafusion_common::tree_node::TreeNodeVisitor; + use datafusion_common::tree_node::{TransformedResult, TreeNodeVisitor}; use datafusion_common::{not_impl_err, Constraint, ScalarValue}; use crate::test::function_stub::count; @@ -3500,11 +3468,8 @@ digraph { })); let col = schema.field_names()[0].clone(); - let filter = Filter::try_new( - Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), - scan, - ) - .unwrap(); + let filter = + Filter::try_new(Expr::Column(col.into()).eq(lit(1i32)), scan).unwrap(); assert!(filter.is_scalar()); } @@ -3522,8 +3487,7 @@ digraph { .build() .unwrap(); - let external_filter = - col("foo").eq(Expr::Literal(ScalarValue::Boolean(Some(true)))); + let external_filter = col("foo").eq(lit(true)); // after transformation, because plan is not the same anymore, // the parent plan is built again with call to LogicalPlan::with_new_inputs -> with_new_exprs diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 4a4933fe9cfd..e18d8bc91bf6 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -342,16 +342,9 @@ impl CommonSubexprEliminate { let input = unwrap_arc(input); let expr = vec![predicate]; self.try_unary_plan(expr, input, config)? - .transform_data(|(mut new_expr, new_input)| { + .map_data(|(mut new_expr, new_input)| { assert_eq!(new_expr.len(), 1); // passed in vec![predicate] - let new_predicate = new_expr - .pop() - .unwrap() - .unalias_nested() - .update_data(|new_predicate| (new_predicate, new_input)); - Ok(new_predicate) - })? 
- .map_data(|(new_predicate, new_input)| { + let new_predicate = new_expr.pop().unwrap(); Filter::try_new(new_predicate, Arc::new(new_input)) .map(LogicalPlan::Filter) }) diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 1c3186b762b7..0a3bae154bd6 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -761,11 +761,11 @@ impl OptimizerRule for PushDownFilter { // Push down non-unnest filter predicate // Unnest - // Unenst Input (Projection) + // Unnest Input (Projection) // -> rewritten to // Unnest // Filter - // Unenst Input (Projection) + // Unnest Input (Projection) let unnest_input = std::mem::take(&mut unnest.input); From 146b679aa19c7749cc73d0c27440419d6498142b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=9E=97=E4=BC=9F?= Date: Wed, 10 Jul 2024 16:01:53 +0800 Subject: [PATCH 10/19] Enable `clone_on_ref_ptr` clippy lint on optimizer (#11346) * Enable clone_on_ref_ptr clippy lint on optimizer * Fix lints * fmt --- datafusion/optimizer/src/analyzer/subquery.rs | 5 ++-- .../optimizer/src/analyzer/type_coercion.rs | 8 +++--- .../optimizer/src/common_subexpr_eliminate.rs | 2 +- datafusion/optimizer/src/decorrelate.rs | 15 ++++++----- .../src/decorrelate_predicate_subquery.rs | 4 +-- .../optimizer/src/eliminate_cross_join.rs | 6 ++--- datafusion/optimizer/src/eliminate_filter.rs | 3 ++- datafusion/optimizer/src/eliminate_limit.rs | 3 ++- .../optimizer/src/eliminate_nested_union.rs | 2 +- .../optimizer/src/eliminate_one_union.rs | 2 +- .../optimizer/src/eliminate_outer_join.rs | 2 +- .../src/extract_equijoin_predicate.rs | 4 +-- datafusion/optimizer/src/lib.rs | 2 ++ .../optimizer/src/optimize_projections/mod.rs | 12 ++++----- datafusion/optimizer/src/optimizer.rs | 8 +++--- datafusion/optimizer/src/plan_signature.rs | 4 +-- .../optimizer/src/propagate_empty_relation.rs | 26 +++++++++---------- datafusion/optimizer/src/push_down_filter.rs | 10 +++---- .../optimizer/src/scalar_subquery_to_join.rs | 2 +- .../simplify_expressions/expr_simplifier.rs | 11 ++++---- .../src/single_distinct_to_groupby.rs | 2 +- datafusion/optimizer/src/test/mod.rs | 2 +- .../src/unwrap_cast_in_comparison.rs | 2 +- 23 files changed, 73 insertions(+), 64 deletions(-) diff --git a/datafusion/optimizer/src/analyzer/subquery.rs b/datafusion/optimizer/src/analyzer/subquery.rs index 5725a725e64a..db39f8f7737d 100644 --- a/datafusion/optimizer/src/analyzer/subquery.rs +++ b/datafusion/optimizer/src/analyzer/subquery.rs @@ -16,6 +16,7 @@ // under the License. 
use std::ops::Deref; +use std::sync::Arc; use crate::analyzer::check_plan; use crate::utils::collect_subquery_cols; @@ -245,7 +246,7 @@ fn check_aggregation_in_scalar_subquery( if !agg.group_expr.is_empty() { let correlated_exprs = get_correlated_expressions(inner_plan)?; let inner_subquery_cols = - collect_subquery_cols(&correlated_exprs, agg.input.schema().clone())?; + collect_subquery_cols(&correlated_exprs, Arc::clone(agg.input.schema()))?; let mut group_columns = agg .group_expr .iter() @@ -375,7 +376,7 @@ mod test { _inputs: Vec, ) -> Result { Ok(Self { - empty_schema: self.empty_schema.clone(), + empty_schema: Arc::clone(&self.empty_schema), }) } } diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 6c08b3e998b3..3cab474df84e 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -656,7 +656,7 @@ fn coerce_arguments_for_fun( .map(|expr| { let data_type = expr.get_type(schema).unwrap(); if let DataType::FixedSizeList(field, _) = data_type { - let to_type = DataType::List(field.clone()); + let to_type = DataType::List(Arc::clone(&field)); expr.cast_to(&to_type, schema) } else { Ok(expr) @@ -1265,8 +1265,10 @@ mod test { signature: Signature::variadic(vec![Utf8], Volatility::Immutable), }) .call(args.to_vec()); - let plan = - LogicalPlan::Projection(Projection::try_new(vec![expr], empty.clone())?); + let plan = LogicalPlan::Projection(Projection::try_new( + vec![expr], + Arc::clone(&empty), + )?); let expected = "Projection: TestScalarUDF(a, Utf8(\"b\"), CAST(Boolean(true) AS Utf8), CAST(Boolean(false) AS Utf8), CAST(Int32(13) AS Utf8))\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected)?; diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index e18d8bc91bf6..721987b917d4 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -1385,7 +1385,7 @@ mod test { "my_agg", Signature::exact(vec![DataType::UInt32], Volatility::Stable), return_type.clone(), - accumulator.clone(), + Arc::clone(&accumulator), vec![Field::new("value", DataType::UInt32, true)], ))), vec![inner], diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index 5f8e0a85215a..c998e8442548 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -19,6 +19,7 @@ use std::collections::{BTreeSet, HashMap}; use std::ops::Deref; +use std::sync::Arc; use crate::simplify_expressions::ExprSimplifier; use crate::utils::collect_subquery_cols; @@ -147,7 +148,7 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { } fn f_up(&mut self, plan: LogicalPlan) -> Result> { - let subquery_schema = plan.schema().clone(); + let subquery_schema = Arc::clone(plan.schema()); match &plan { LogicalPlan::Filter(plan_filter) => { let subquery_filter_exprs = split_conjunction(&plan_filter.predicate); @@ -172,7 +173,7 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { if let Some(expr) = conjunction(subquery_filters.clone()) { filter_exprs_evaluation_result_on_empty_batch( &expr, - plan_filter.input.schema().clone(), + Arc::clone(plan_filter.input.schema()), expr_result_map, &mut expr_result_map_for_count_bug, )? 
@@ -230,7 +231,7 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { { proj_exprs_evaluation_result_on_empty_batch( &projection.expr, - projection.input.schema().clone(), + Arc::clone(projection.input.schema()), expr_result_map, &mut expr_result_map_for_count_bug, )?; @@ -276,7 +277,7 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { { agg_exprs_evaluation_result_on_empty_batch( &aggregate.aggr_expr, - aggregate.input.schema().clone(), + Arc::clone(aggregate.input.schema()), &mut expr_result_map_for_count_bug, )?; if !expr_result_map_for_count_bug.is_empty() { @@ -332,7 +333,7 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { if limit.fetch.filter(|limit_row| *limit_row == 0).is_some() { LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: limit.input.schema().clone(), + schema: Arc::clone(limit.input.schema()), }) } else { LogicalPlanBuilder::from((*limit.input).clone()).build()? @@ -456,7 +457,7 @@ fn agg_exprs_evaluation_result_on_empty_batch( let result_expr = result_expr.unalias(); let props = ExecutionProps::new(); - let info = SimplifyContext::new(&props).with_schema(schema.clone()); + let info = SimplifyContext::new(&props).with_schema(Arc::clone(&schema)); let simplifier = ExprSimplifier::new(info); let result_expr = simplifier.simplify(result_expr)?; if matches!(result_expr, Expr::Literal(ScalarValue::Int64(_))) { @@ -492,7 +493,7 @@ fn proj_exprs_evaluation_result_on_empty_batch( if result_expr.ne(expr) { let props = ExecutionProps::new(); - let info = SimplifyContext::new(&props).with_schema(schema.clone()); + let info = SimplifyContext::new(&props).with_schema(Arc::clone(&schema)); let simplifier = ExprSimplifier::new(info); let result_expr = simplifier.simplify(result_expr)?; let expr_name = match expr { diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 81d6dc863af6..4e3ca7e33a2e 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -571,7 +571,7 @@ mod tests { ); let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) .filter( - in_subquery(col("customer.c_custkey"), orders.clone()) + in_subquery(col("customer.c_custkey"), Arc::clone(&orders)) .and(in_subquery(col("customer.c_custkey"), orders)), )? .project(vec![col("customer.c_custkey")])? @@ -1358,7 +1358,7 @@ mod tests { ); let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(orders.clone()).and(exists(orders)))? + .filter(exists(Arc::clone(&orders)).and(exists(orders)))? .project(vec![col("customer.c_custkey")])? 
.build()?; diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs b/datafusion/optimizer/src/eliminate_cross_join.rs index 6d6f84373a36..729c45426ff2 100644 --- a/datafusion/optimizer/src/eliminate_cross_join.rs +++ b/datafusion/optimizer/src/eliminate_cross_join.rs @@ -86,7 +86,7 @@ impl OptimizerRule for EliminateCrossJoin { plan: LogicalPlan, config: &dyn OptimizerConfig, ) -> Result> { - let plan_schema = plan.schema().clone(); + let plan_schema = Arc::clone(plan.schema()); let mut possible_join_keys = JoinKeySet::new(); let mut all_inputs: Vec = vec![]; @@ -155,7 +155,7 @@ impl OptimizerRule for EliminateCrossJoin { if &plan_schema != left.schema() { left = LogicalPlan::Projection(Projection::new_from_schema( Arc::new(left), - plan_schema.clone(), + Arc::clone(&plan_schema), )); } @@ -420,7 +420,7 @@ mod tests { }; fn assert_optimized_plan_eq(plan: LogicalPlan, expected: Vec<&str>) { - let starting_schema = plan.schema().clone(); + let starting_schema = Arc::clone(plan.schema()); let rule = EliminateCrossJoin::new(); let transformed_plan = rule.rewrite(plan, &OptimizerContext::new()).unwrap(); assert!(transformed_plan.transformed, "failed to optimize plan"); diff --git a/datafusion/optimizer/src/eliminate_filter.rs b/datafusion/optimizer/src/eliminate_filter.rs index 7c873b411d59..2d8d77b89ddc 100644 --- a/datafusion/optimizer/src/eliminate_filter.rs +++ b/datafusion/optimizer/src/eliminate_filter.rs @@ -21,6 +21,7 @@ use datafusion_common::tree_node::Transformed; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::{EmptyRelation, Expr, Filter, LogicalPlan}; +use std::sync::Arc; use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; @@ -68,7 +69,7 @@ impl OptimizerRule for EliminateFilter { Some(false) | None => Ok(Transformed::yes(LogicalPlan::EmptyRelation( EmptyRelation { produce_one_row: false, - schema: input.schema().clone(), + schema: Arc::clone(input.schema()), }, ))), }, diff --git a/datafusion/optimizer/src/eliminate_limit.rs b/datafusion/optimizer/src/eliminate_limit.rs index b0a75fa47c27..165834e75975 100644 --- a/datafusion/optimizer/src/eliminate_limit.rs +++ b/datafusion/optimizer/src/eliminate_limit.rs @@ -21,6 +21,7 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::Transformed; use datafusion_common::Result; use datafusion_expr::logical_plan::{tree_node::unwrap_arc, EmptyRelation, LogicalPlan}; +use std::sync::Arc; /// Optimizer rule to replace `LIMIT 0` or `LIMIT` whose ancestor LIMIT's skip is /// greater than or equal to current's fetch @@ -67,7 +68,7 @@ impl OptimizerRule for EliminateLimit { return Ok(Transformed::yes(LogicalPlan::EmptyRelation( EmptyRelation { produce_one_row: false, - schema: limit.input.schema().clone(), + schema: Arc::clone(limit.input.schema()), }, ))); } diff --git a/datafusion/optimizer/src/eliminate_nested_union.rs b/datafusion/optimizer/src/eliminate_nested_union.rs index 3732f7ed90c8..c8ae937e128a 100644 --- a/datafusion/optimizer/src/eliminate_nested_union.rs +++ b/datafusion/optimizer/src/eliminate_nested_union.rs @@ -79,7 +79,7 @@ impl OptimizerRule for EliminateNestedUnion { Ok(Transformed::yes(LogicalPlan::Distinct(Distinct::All( Arc::new(LogicalPlan::Union(Union { inputs: inputs.into_iter().map(Arc::new).collect_vec(), - schema: schema.clone(), + schema: Arc::clone(&schema), })), )))) } diff --git a/datafusion/optimizer/src/eliminate_one_union.rs b/datafusion/optimizer/src/eliminate_one_union.rs 
index edf6b72d7e17..5e37b8cf7c1f 100644 --- a/datafusion/optimizer/src/eliminate_one_union.rs +++ b/datafusion/optimizer/src/eliminate_one_union.rs @@ -110,7 +110,7 @@ mod tests { &table_scan(Some("table"), &schema(), None)?.build()?, &schema().to_dfschema()?, )?; - let schema = table_plan.schema().clone(); + let schema = Arc::clone(table_plan.schema()); let single_union_plan = LogicalPlan::Union(Union { inputs: vec![Arc::new(table_plan)], schema, diff --git a/datafusion/optimizer/src/eliminate_outer_join.rs b/datafusion/optimizer/src/eliminate_outer_join.rs index 13c483c6dfcc..12534e058152 100644 --- a/datafusion/optimizer/src/eliminate_outer_join.rs +++ b/datafusion/optimizer/src/eliminate_outer_join.rs @@ -118,7 +118,7 @@ impl OptimizerRule for EliminateOuterJoin { join_constraint: join.join_constraint, on: join.on.clone(), filter: join.filter.clone(), - schema: join.schema.clone(), + schema: Arc::clone(&join.schema), null_equals_null: join.null_equals_null, })); Filter::try_new(filter.predicate, new_join) diff --git a/datafusion/optimizer/src/extract_equijoin_predicate.rs b/datafusion/optimizer/src/extract_equijoin_predicate.rs index 87d205139e8e..0dae777ab5bd 100644 --- a/datafusion/optimizer/src/extract_equijoin_predicate.rs +++ b/datafusion/optimizer/src/extract_equijoin_predicate.rs @@ -357,8 +357,8 @@ mod tests { let t1 = test_table_scan_with_name("t1")?; let t2 = test_table_scan_with_name("t2")?; - let t1_schema = t1.schema().clone(); - let t2_schema = t2.schema().clone(); + let t1_schema = Arc::clone(t1.schema()); + let t2_schema = Arc::clone(t2.schema()); // filter: t1.a + CAST(Int64(1), UInt32) = t2.a + CAST(Int64(2), UInt32) as t1.a + 1 = t2.a + 2 let filter = Expr::eq( diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs index a6a9e5cf26ea..332d3e9fe54e 100644 --- a/datafusion/optimizer/src/lib.rs +++ b/datafusion/optimizer/src/lib.rs @@ -14,6 +14,8 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +#![deny(clippy::clone_on_ref_ptr)] //! # DataFusion Optimizer //! 
diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 4684dbd3b043..cae2a7b2cad2 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -205,7 +205,7 @@ fn optimize_projections( }); } LogicalPlan::Window(window) => { - let input_schema = window.input.schema().clone(); + let input_schema = Arc::clone(window.input.schema()); // Split parent requirements to child and window expression sections: let n_input_fields = input_schema.fields().len(); // Offset window expression indices so that they point to valid @@ -881,7 +881,7 @@ mod tests { Ok(Self { exprs, input: Arc::new(inputs.swap_remove(0)), - schema: self.schema.clone(), + schema: Arc::clone(&self.schema), }) } @@ -949,7 +949,7 @@ mod tests { exprs, left_child: Arc::new(inputs.remove(0)), right_child: Arc::new(inputs.remove(0)), - schema: self.schema.clone(), + schema: Arc::clone(&self.schema), }) } @@ -1256,7 +1256,7 @@ mod tests { let table_scan = test_table_scan()?; let custom_plan = LogicalPlan::Extension(Extension { node: Arc::new(NoOpUserDefined::new( - table_scan.schema().clone(), + Arc::clone(table_scan.schema()), Arc::new(table_scan.clone()), )), }); @@ -1281,7 +1281,7 @@ mod tests { let custom_plan = LogicalPlan::Extension(Extension { node: Arc::new( NoOpUserDefined::new( - table_scan.schema().clone(), + Arc::clone(table_scan.schema()), Arc::new(table_scan.clone()), ) .with_exprs(exprs), @@ -1316,7 +1316,7 @@ mod tests { let custom_plan = LogicalPlan::Extension(Extension { node: Arc::new( NoOpUserDefined::new( - table_scan.schema().clone(), + Arc::clone(table_scan.schema()), Arc::new(table_scan.clone()), ) .with_exprs(exprs), diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 14e5ac141eeb..93923a4e1e74 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -205,7 +205,7 @@ impl OptimizerConfig for OptimizerContext { } fn alias_generator(&self) -> Arc { - self.alias_generator.clone() + Arc::clone(&self.alias_generator) } fn options(&self) -> &ConfigOptions { @@ -381,7 +381,7 @@ impl Optimizer { .skip_failed_rules .then(|| new_plan.clone()); - let starting_schema = new_plan.schema().clone(); + let starting_schema = Arc::clone(new_plan.schema()); let result = match rule.apply_order() { // optimizer handles recursion @@ -579,7 +579,7 @@ mod tests { let config = OptimizerContext::new().with_skip_failing_rules(false); let input = Arc::new(test_table_scan()?); - let input_schema = input.schema().clone(); + let input_schema = Arc::clone(input.schema()); let plan = LogicalPlan::Projection(Projection::try_new_with_schema( vec![col("a"), col("b"), col("c")], @@ -760,7 +760,7 @@ mod tests { } Ok(Transformed::yes(LogicalPlan::Projection( - Projection::try_new(exprs, projection.input.clone())?, + Projection::try_new(exprs, Arc::clone(&projection.input))?, ))) } } diff --git a/datafusion/optimizer/src/plan_signature.rs b/datafusion/optimizer/src/plan_signature.rs index d22795797478..73e6b418272a 100644 --- a/datafusion/optimizer/src/plan_signature.rs +++ b/datafusion/optimizer/src/plan_signature.rs @@ -100,7 +100,7 @@ mod tests { let one_node_plan = Arc::new(LogicalPlan::EmptyRelation(datafusion_expr::EmptyRelation { produce_one_row: false, - schema: schema.clone(), + schema: Arc::clone(&schema), })); assert_eq!(1, get_node_number(&one_node_plan).get()); @@ -112,7 +112,7 @@ mod tests { assert_eq!(2, 
get_node_number(&two_node_plan).get()); let five_node_plan = Arc::new(LogicalPlan::Union(datafusion_expr::Union { - inputs: vec![two_node_plan.clone(), two_node_plan], + inputs: vec![Arc::clone(&two_node_plan), two_node_plan], schema, })); diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs index 88bd1b17883b..91044207c4e1 100644 --- a/datafusion/optimizer/src/propagate_empty_relation.rs +++ b/datafusion/optimizer/src/propagate_empty_relation.rs @@ -79,7 +79,7 @@ impl OptimizerRule for PropagateEmptyRelation { return Ok(Transformed::yes(LogicalPlan::EmptyRelation( EmptyRelation { produce_one_row: false, - schema: plan.schema().clone(), + schema: Arc::clone(plan.schema()), }, ))); } @@ -99,43 +99,43 @@ impl OptimizerRule for PropagateEmptyRelation { JoinType::Full if left_empty && right_empty => Ok(Transformed::yes( LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: join.schema.clone(), + schema: Arc::clone(&join.schema), }), )), JoinType::Inner if left_empty || right_empty => Ok(Transformed::yes( LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: join.schema.clone(), + schema: Arc::clone(&join.schema), }), )), JoinType::Left if left_empty => Ok(Transformed::yes( LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: join.schema.clone(), + schema: Arc::clone(&join.schema), }), )), JoinType::Right if right_empty => Ok(Transformed::yes( LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: join.schema.clone(), + schema: Arc::clone(&join.schema), }), )), JoinType::LeftSemi if left_empty || right_empty => Ok( Transformed::yes(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: join.schema.clone(), + schema: Arc::clone(&join.schema), })), ), JoinType::RightSemi if left_empty || right_empty => Ok( Transformed::yes(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: join.schema.clone(), + schema: Arc::clone(&join.schema), })), ), JoinType::LeftAnti if left_empty => Ok(Transformed::yes( LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: join.schema.clone(), + schema: Arc::clone(&join.schema), }), )), JoinType::LeftAnti if right_empty => { @@ -147,7 +147,7 @@ impl OptimizerRule for PropagateEmptyRelation { JoinType::RightAnti if right_empty => Ok(Transformed::yes( LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: join.schema.clone(), + schema: Arc::clone(&join.schema), }), )), _ => Ok(Transformed::no(plan)), @@ -178,7 +178,7 @@ impl OptimizerRule for PropagateEmptyRelation { Ok(Transformed::yes(LogicalPlan::EmptyRelation( EmptyRelation { produce_one_row: false, - schema: plan.schema().clone(), + schema: Arc::clone(plan.schema()), }, ))) } else if new_inputs.len() == 1 { @@ -191,14 +191,14 @@ impl OptimizerRule for PropagateEmptyRelation { Ok(Transformed::yes(LogicalPlan::Projection( Projection::new_from_schema( Arc::new(child), - plan.schema().clone(), + Arc::clone(plan.schema()), ), ))) } } else { Ok(Transformed::yes(LogicalPlan::Union(Union { inputs: new_inputs, - schema: union.schema.clone(), + schema: Arc::clone(&union.schema), }))) } } @@ -232,7 +232,7 @@ fn empty_child(plan: &LogicalPlan) -> Result> { if !empty.produce_one_row { Ok(Some(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: plan.schema().clone(), + schema: Arc::clone(plan.schema()), }))) } else { Ok(None) diff --git 
a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 0a3bae154bd6..20e2ac07dffd 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -652,7 +652,7 @@ impl OptimizerRule for PushDownFilter { return push_down_join(join, None); }; - let plan_schema = plan.schema().clone(); + let plan_schema = Arc::clone(plan.schema()); let LogicalPlan::Filter(mut filter) = plan else { return Ok(Transformed::no(plan)); @@ -1498,7 +1498,7 @@ mod tests { let custom_plan = LogicalPlan::Extension(Extension { node: Arc::new(NoopPlan { input: vec![table_scan.clone()], - schema: table_scan.schema().clone(), + schema: Arc::clone(table_scan.schema()), }), }); let plan = LogicalPlanBuilder::from(custom_plan) @@ -1514,7 +1514,7 @@ mod tests { let custom_plan = LogicalPlan::Extension(Extension { node: Arc::new(NoopPlan { input: vec![table_scan.clone()], - schema: table_scan.schema().clone(), + schema: Arc::clone(table_scan.schema()), }), }); let plan = LogicalPlanBuilder::from(custom_plan) @@ -1531,7 +1531,7 @@ mod tests { let custom_plan = LogicalPlan::Extension(Extension { node: Arc::new(NoopPlan { input: vec![table_scan.clone(), table_scan.clone()], - schema: table_scan.schema().clone(), + schema: Arc::clone(table_scan.schema()), }), }); let plan = LogicalPlanBuilder::from(custom_plan) @@ -1548,7 +1548,7 @@ mod tests { let custom_plan = LogicalPlan::Extension(Extension { node: Arc::new(NoopPlan { input: vec![table_scan.clone(), table_scan.clone()], - schema: table_scan.schema().clone(), + schema: Arc::clone(table_scan.schema()), }), }); let plan = LogicalPlanBuilder::from(custom_plan) diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 0333cc8dde36..35691847fb8e 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -413,7 +413,7 @@ mod tests { let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) .filter( lit(1) - .lt(scalar_subquery(orders.clone())) + .lt(scalar_subquery(Arc::clone(&orders))) .and(lit(1).lt(scalar_subquery(orders))), )? .project(vec![col("customer.c_custkey")])? 
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 36dd85ac96e1..17855e17bef8 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1807,8 +1807,9 @@ mod tests { fn basic_coercion() { let schema = test_schema(); let props = ExecutionProps::new(); - let simplifier = - ExprSimplifier::new(SimplifyContext::new(&props).with_schema(schema.clone())); + let simplifier = ExprSimplifier::new( + SimplifyContext::new(&props).with_schema(Arc::clone(&schema)), + ); // Note expr type is int32 (not int64) // (1i64 + 2i32) < i @@ -3340,15 +3341,15 @@ mod tests { assert_eq!( simplify(in_list( col("c1"), - vec![scalar_subquery(subquery.clone())], + vec![scalar_subquery(Arc::clone(&subquery))], false )), - in_subquery(col("c1"), subquery.clone()) + in_subquery(col("c1"), Arc::clone(&subquery)) ); assert_eq!( simplify(in_list( col("c1"), - vec![scalar_subquery(subquery.clone())], + vec![scalar_subquery(Arc::clone(&subquery))], true )), not_in_subquery(col("c1"), subquery) diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index 7c66d659cbaf..f2b4abdd6cbd 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -279,7 +279,7 @@ impl OptimizerRule for SingleDistinctToGroupBy { let alias_str = format!("alias{}", index); inner_aggr_exprs.push( Expr::AggregateFunction(AggregateFunction::new_udf( - udf.clone(), + Arc::clone(&udf), args, false, None, diff --git a/datafusion/optimizer/src/test/mod.rs b/datafusion/optimizer/src/test/mod.rs index 2c7e8644026e..4dccb42941dd 100644 --- a/datafusion/optimizer/src/test/mod.rs +++ b/datafusion/optimizer/src/test/mod.rs @@ -176,7 +176,7 @@ pub fn assert_optimized_plan_eq( // Apply the rule once let opt_context = OptimizerContext::new().with_max_passes(1); - let optimizer = Optimizer::with_rules(vec![rule.clone()]); + let optimizer = Optimizer::with_rules(vec![Arc::clone(&rule)]); let optimized_plan = optimizer.optimize(plan, &opt_context, observe)?; let formatted_plan = format!("{optimized_plan:?}"); assert_eq!(formatted_plan, expected); diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index de471d59c466..344708252559 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -832,7 +832,7 @@ mod tests { fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { let mut expr_rewriter = UnwrapCastExprRewriter { - schema: schema.clone(), + schema: Arc::clone(schema), }; expr.rewrite(&mut expr_rewriter).data().unwrap() } From 9a8f8b7188fba8dcba52028443b90556aeff7f5f Mon Sep 17 00:00:00 2001 From: Matthew Cramerus <8771538+suremarc@users.noreply.github.com> Date: Wed, 10 Jul 2024 08:14:28 -0500 Subject: [PATCH 11/19] fix: Fix eq properties regression from #10434 (#11363) * discover new orderings when constants are added * more comments * reduce nesting + describe argument * lint? 
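An illustrative sketch of the invariant this fix restores (hypothetical code, not part of the patch; it uses only helpers that appear in the test diff below: `col`, `ConstExpr::from`, `add_equal_conditions`, `add_constants`, and `ordering_satisfy`, inside a test body returning `Result`). Registering the equality and the constant in either order should leave the properties agreeing on which sort orderings are satisfied:

    // assume `schema` has columns "a" and "b" and `base_properties` is an
    // EquivalenceProperties over that schema, as in the tests below
    let a = col("a", &schema)?;
    let b = col("b", &schema)?;

    // order 1: add the equality a = b first, then mark "a" as constant
    let mut p1 = base_properties.clone();
    p1.add_equal_conditions(&a, &b)?;
    let p1 = p1.add_constants(std::iter::once(ConstExpr::from(Arc::clone(&a))));

    // order 2: mark "a" as constant first, then add the equality
    let mut p2 = base_properties
        .clone()
        .add_constants(std::iter::once(ConstExpr::from(Arc::clone(&a))));
    p2.add_equal_conditions(&a, &b)?;

    // both construction orders must now discover the same valid orderings
    let sort = vec![PhysicalSortExpr {
        expr: Arc::clone(&b),
        options: SortOptions::default(),
    }];
    assert_eq!(p1.ordering_satisfy(&sort), p2.ordering_satisfy(&sort));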
--- .../src/equivalence/properties.rs | 183 +++++++++++------- 1 file changed, 113 insertions(+), 70 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index d9d19c0bcf47..8c327fbaf409 100644 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -223,56 +223,11 @@ impl EquivalenceProperties { } } - // Discover new valid orderings in light of the new equality. For a discussion, see: - // https://github.com/apache/datafusion/issues/9812 - let mut new_orderings = vec![]; - for ordering in self.normalized_oeq_class().iter() { - let expressions = if left.eq(&ordering[0].expr) { - // Left expression is leading ordering - Some((ordering[0].options, right)) - } else if right.eq(&ordering[0].expr) { - // Right expression is leading ordering - Some((ordering[0].options, left)) - } else { - None - }; - if let Some((leading_ordering, other_expr)) = expressions { - // Currently, we only handle expressions with a single child. - // TODO: It should be possible to handle expressions orderings like - // f(a, b, c), a, b, c if f is monotonic in all arguments. - // First expression after leading ordering - if let Some(next_expr) = ordering.get(1) { - let children = other_expr.children(); - if children.len() == 1 - && children[0].eq(&next_expr.expr) - && SortProperties::Ordered(leading_ordering) - == other_expr - .get_properties(&[ExprProperties { - sort_properties: SortProperties::Ordered( - leading_ordering, - ), - range: Interval::make_unbounded( - &other_expr.data_type(&self.schema)?, - )?, - }])? - .sort_properties - { - // Assume existing ordering is [a ASC, b ASC] - // When equality a = f(b) is given, If we know that given ordering `[b ASC]`, ordering `[f(b) ASC]` is valid, - // then we can deduce that ordering `[b ASC]` is also valid. - // Hence, ordering `[b ASC]` can be added to the state as valid ordering. - // (e.g. existing ordering where leading ordering is removed) - new_orderings.push(ordering[1..].to_vec()); - } - } - } - } - if !new_orderings.is_empty() { - self.oeq_class.add_new_orderings(new_orderings); - } - // Add equal expressions to the state self.eq_group.add_equal_conditions(left, right); + + // Discover any new orderings + self.discover_new_orderings(left)?; Ok(()) } @@ -304,9 +259,78 @@ impl EquivalenceProperties { self.constants.push(const_expr); } } + + for ordering in self.normalized_oeq_class().iter() { + if let Err(e) = self.discover_new_orderings(&ordering[0].expr) { + log::debug!("error discovering new orderings: {e}"); + } + } + self } + // Discover new valid orderings in light of a new equality. + // Accepts a single argument (`expr`) which is used to determine + // which orderings should be updated. + // When constants or equivalence classes are changed, there may be new orderings + // that can be discovered with the new equivalence properties. 
+ // For a discussion, see: https://github.com/apache/datafusion/issues/9812 + fn discover_new_orderings(&mut self, expr: &Arc) -> Result<()> { + let normalized_expr = self.eq_group().normalize_expr(Arc::clone(expr)); + let eq_class = self + .eq_group + .classes + .iter() + .find_map(|class| { + class + .contains(&normalized_expr) + .then(|| class.clone().into_vec()) + }) + .unwrap_or_else(|| vec![Arc::clone(&normalized_expr)]); + + let mut new_orderings: Vec = vec![]; + for (ordering, next_expr) in self + .normalized_oeq_class() + .iter() + .filter(|ordering| ordering[0].expr.eq(&normalized_expr)) + // First expression after leading ordering + .filter_map(|ordering| Some(ordering).zip(ordering.get(1))) + { + let leading_ordering = ordering[0].options; + // Currently, we only handle expressions with a single child. + // TODO: It should be possible to handle expressions orderings like + // f(a, b, c), a, b, c if f is monotonic in all arguments. + for equivalent_expr in &eq_class { + let children = equivalent_expr.children(); + if children.len() == 1 + && children[0].eq(&next_expr.expr) + && SortProperties::Ordered(leading_ordering) + == equivalent_expr + .get_properties(&[ExprProperties { + sort_properties: SortProperties::Ordered( + leading_ordering, + ), + range: Interval::make_unbounded( + &equivalent_expr.data_type(&self.schema)?, + )?, + }])? + .sort_properties + { + // Assume existing ordering is [a ASC, b ASC] + // When equality a = f(b) is given, If we know that given ordering `[b ASC]`, ordering `[f(b) ASC]` is valid, + // then we can deduce that ordering `[b ASC]` is also valid. + // Hence, ordering `[b ASC]` can be added to the state as valid ordering. + // (e.g. existing ordering where leading ordering is removed) + new_orderings.push(ordering[1..].to_vec()); + break; + } + } + } + + self.oeq_class.add_new_orderings(new_orderings); + Ok(()) + } + /// Updates the ordering equivalence group within assuming that the table /// is re-sorted according to the argument `sort_exprs`. Note that constants /// and equivalence classes are unchanged as they are unaffected by a re-sort. @@ -2454,30 +2478,49 @@ mod tests { ]; for case in cases { - let mut properties = base_properties - .clone() - .add_constants(case.constants.into_iter().map(ConstExpr::from)); - for [left, right] in &case.equal_conditions { - properties.add_equal_conditions(left, right)? - } - - let sort = case - .sort_columns - .iter() - .map(|&name| { - col(name, &schema).map(|col| PhysicalSortExpr { - expr: col, - options: SortOptions::default(), + // Construct the equivalence properties in different orders + // to exercise different code paths + // (The resulting properties _should_ be the same) + for properties in [ + // Equal conditions before constants + { + let mut properties = base_properties.clone(); + for [left, right] in &case.equal_conditions { + properties.add_equal_conditions(left, right)? + } + properties.add_constants( + case.constants.iter().cloned().map(ConstExpr::from), + ) + }, + // Constants before equal conditions + { + let mut properties = base_properties.clone().add_constants( + case.constants.iter().cloned().map(ConstExpr::from), + ); + for [left, right] in &case.equal_conditions { + properties.add_equal_conditions(left, right)? 
+ } + properties + }, + ] { + let sort = case + .sort_columns + .iter() + .map(|&name| { + col(name, &schema).map(|col| PhysicalSortExpr { + expr: col, + options: SortOptions::default(), + }) }) - }) - .collect::>>()?; + .collect::>>()?; - assert_eq!( - properties.ordering_satisfy(&sort), - case.should_satisfy_ordering, - "failed test '{}'", - case.name - ); + assert_eq!( + properties.ordering_satisfy(&sort), + case.should_satisfy_ordering, + "failed test '{}'", + case.name + ); + } } Ok(()) From d99002cf0b39843afb9c224bbb880d2266acefc6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Jul 2024 09:24:55 -0400 Subject: [PATCH 12/19] Update termtree requirement from 0.4.1 to 0.5.0 (#11383) Updates the requirements on [termtree](https://github.com/rust-cli/termtree) to permit the latest version. - [Changelog](https://github.com/rust-cli/termtree/blob/main/CHANGELOG.md) - [Commits](https://github.com/rust-cli/termtree/compare/v0.4.1...v0.5.0) --- updated-dependencies: - dependency-name: termtree dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- datafusion/physical-plan/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 4292f95fe406..f5f756417ebf 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -66,7 +66,7 @@ tokio = { workspace = true } [dev-dependencies] rstest = { workspace = true } rstest_reuse = "0.7.0" -termtree = "0.4.1" +termtree = "0.5.0" tokio = { workspace = true, features = [ "rt-multi-thread", "fs", From b96186fdef1ff410663ec8fce41186c018f8e09a Mon Sep 17 00:00:00 2001 From: Oleks V Date: Wed, 10 Jul 2024 08:09:51 -0700 Subject: [PATCH 13/19] Introduce `resources_err!` error macro (#11374) --- datafusion/common/src/error.rs | 3 +++ datafusion/execution/src/disk_manager.rs | 6 +++--- datafusion/execution/src/memory_pool/pool.rs | 5 +++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index b1fdb652af48..9be662ca283e 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -553,6 +553,9 @@ make_error!(config_err, config_datafusion_err, Configuration); // Exposes a macro to create `DataFusionError::Substrait` with optional backtrace make_error!(substrait_err, substrait_datafusion_err, Substrait); +// Exposes a macro to create `DataFusionError::ResourcesExhausted` with optional backtrace +make_error!(resources_err, resources_datafusion_err, ResourcesExhausted); + // Exposes a macro to create `DataFusionError::SQL` with optional backtrace #[macro_export] macro_rules! sql_datafusion_err { diff --git a/datafusion/execution/src/disk_manager.rs b/datafusion/execution/src/disk_manager.rs index cca25c7c3e88..c98d7e5579f0 100644 --- a/datafusion/execution/src/disk_manager.rs +++ b/datafusion/execution/src/disk_manager.rs @@ -18,7 +18,7 @@ //! Manages files generated during query execution, files are //! hashed among the directories listed in RuntimeConfig::local_dirs. 
-use datafusion_common::{DataFusionError, Result}; +use datafusion_common::{resources_datafusion_err, DataFusionError, Result}; use log::debug; use parking_lot::Mutex; use rand::{thread_rng, Rng}; @@ -119,9 +119,9 @@ impl DiskManager { ) -> Result { let mut guard = self.local_dirs.lock(); let local_dirs = guard.as_mut().ok_or_else(|| { - DataFusionError::ResourcesExhausted(format!( + resources_datafusion_err!( "Memory Exhausted while {request_description} (DiskManager is disabled)" - )) + ) })?; // Create a temporary directory if needed diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs index 4a491630fe20..fd7724f3076c 100644 --- a/datafusion/execution/src/memory_pool/pool.rs +++ b/datafusion/execution/src/memory_pool/pool.rs @@ -16,7 +16,7 @@ // under the License. use crate::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation}; -use datafusion_common::{DataFusionError, Result}; +use datafusion_common::{resources_datafusion_err, DataFusionError, Result}; use log::debug; use parking_lot::Mutex; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -231,12 +231,13 @@ impl MemoryPool for FairSpillPool { } } +#[inline(always)] fn insufficient_capacity_err( reservation: &MemoryReservation, additional: usize, available: usize, ) -> DataFusionError { - DataFusionError::ResourcesExhausted(format!("Failed to allocate additional {} bytes for {} with {} bytes already allocated - maximum available is {}", additional, reservation.registration.consumer.name, reservation.size, available)) + resources_datafusion_err!("Failed to allocate additional {} bytes for {} with {} bytes already allocated - maximum available is {}", additional, reservation.registration.consumer.name, reservation.size, available) } #[cfg(test)] From 585504a31fd7d9a44c97f3f19af42bace08b8cc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=9E=97=E4=BC=9F?= Date: Thu, 11 Jul 2024 00:58:53 +0800 Subject: [PATCH 14/19] Enable clone_on_ref_ptr clippy lint on common (#11384) --- datafusion/common-runtime/src/lib.rs | 2 + datafusion/common/src/dfschema.rs | 12 +-- datafusion/common/src/hash_utils.rs | 19 ++-- datafusion/common/src/lib.rs | 2 + datafusion/common/src/scalar/mod.rs | 97 ++++++++++--------- .../common/src/scalar/struct_builder.rs | 2 +- datafusion/common/src/utils/mod.rs | 5 +- 7 files changed, 77 insertions(+), 62 deletions(-) diff --git a/datafusion/common-runtime/src/lib.rs b/datafusion/common-runtime/src/lib.rs index e8624163f224..8145bb110464 100644 --- a/datafusion/common-runtime/src/lib.rs +++ b/datafusion/common-runtime/src/lib.rs @@ -14,6 +14,8 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
+// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +#![deny(clippy::clone_on_ref_ptr)] pub mod common; diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 3c2cc89fc014..7598cbc4d86a 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -211,7 +211,7 @@ impl DFSchema { schema: &SchemaRef, ) -> Result { let dfschema = Self { - inner: schema.clone(), + inner: Arc::clone(schema), field_qualifiers: qualifiers, functional_dependencies: FunctionalDependencies::empty(), }; @@ -311,7 +311,7 @@ impl DFSchema { }; if !duplicated_field { // self.inner.fields.push(field.clone()); - schema_builder.push(field.clone()); + schema_builder.push(Arc::clone(field)); qualifiers.push(qualifier.cloned()); } } @@ -1276,7 +1276,7 @@ mod tests { let arrow_schema_ref = Arc::new(arrow_schema.clone()); let df_schema = DFSchema { - inner: arrow_schema_ref.clone(), + inner: Arc::clone(&arrow_schema_ref), field_qualifiers: vec![None; arrow_schema_ref.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; @@ -1284,7 +1284,7 @@ mod tests { { let arrow_schema = arrow_schema.clone(); - let arrow_schema_ref = arrow_schema_ref.clone(); + let arrow_schema_ref = Arc::clone(&arrow_schema_ref); assert_eq!(df_schema, arrow_schema.to_dfschema().unwrap()); assert_eq!(df_schema, arrow_schema_ref.to_dfschema().unwrap()); @@ -1292,7 +1292,7 @@ mod tests { { let arrow_schema = arrow_schema.clone(); - let arrow_schema_ref = arrow_schema_ref.clone(); + let arrow_schema_ref = Arc::clone(&arrow_schema_ref); assert_eq!(df_schema_ref, arrow_schema.to_dfschema_ref().unwrap()); assert_eq!(df_schema_ref, arrow_schema_ref.to_dfschema_ref().unwrap()); @@ -1322,7 +1322,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![a_field, b_field])); let df_schema = DFSchema { - inner: schema.clone(), + inner: Arc::clone(&schema), field_qualifiers: vec![None; schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index c972536c4d23..c8adae34f645 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -244,7 +244,7 @@ fn hash_list_array( where OffsetSize: OffsetSizeTrait, { - let values = array.values().clone(); + let values = Arc::clone(array.values()); let offsets = array.value_offsets(); let nulls = array.nulls(); let mut values_hashes = vec![0u64; values.len()]; @@ -274,7 +274,7 @@ fn hash_fixed_list_array( random_state: &RandomState, hashes_buffer: &mut [u64], ) -> Result<()> { - let values = array.values().clone(); + let values = Arc::clone(array.values()); let value_len = array.value_length(); let offset_size = value_len as usize / array.len(); let nulls = array.nulls(); @@ -622,19 +622,19 @@ mod tests { vec![ ( Arc::new(Field::new("bool", DataType::Boolean, false)), - boolarr.clone() as ArrayRef, + Arc::clone(&boolarr) as ArrayRef, ), ( Arc::new(Field::new("i32", DataType::Int32, false)), - i32arr.clone() as ArrayRef, + Arc::clone(&i32arr) as ArrayRef, ), ( Arc::new(Field::new("i32", DataType::Int32, false)), - i32arr.clone() as ArrayRef, + Arc::clone(&i32arr) as ArrayRef, ), ( Arc::new(Field::new("bool", DataType::Boolean, false)), - boolarr.clone() as ArrayRef, + Arc::clone(&boolarr) as ArrayRef, ), ], Buffer::from(&[0b001011]), @@ -710,7 +710,12 @@ mod tests { let random_state = RandomState::with_seeds(0, 0, 0, 0); let mut one_col_hashes = vec![0; strings1.len()]; - 
create_hashes(&[dict_array.clone()], &random_state, &mut one_col_hashes).unwrap(); + create_hashes( + &[Arc::clone(&dict_array) as ArrayRef], + &random_state, + &mut one_col_hashes, + ) + .unwrap(); let mut two_col_hashes = vec![0; strings1.len()]; create_hashes( diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index c275152642f0..8cd64e7d16a2 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -14,6 +14,8 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +#![deny(clippy::clone_on_ref_ptr)] mod column; mod dfschema; diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 26e03a3b9893..c8f21788cbbd 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -1758,8 +1758,11 @@ impl ScalarValue { if let Some(DataType::FixedSizeList(f, l)) = first_non_null_data_type { for array in arrays.iter_mut() { if array.is_null(0) { - *array = - Arc::new(FixedSizeListArray::new_null(f.clone(), l, 1)); + *array = Arc::new(FixedSizeListArray::new_null( + Arc::clone(&f), + l, + 1, + )); } } } @@ -3298,16 +3301,16 @@ impl TryFrom<&DataType> for ScalarValue { ), // `ScalaValue::List` contains single element `ListArray`. DataType::List(field_ref) => ScalarValue::List(Arc::new( - GenericListArray::new_null(field_ref.clone(), 1), + GenericListArray::new_null(Arc::clone(field_ref), 1), )), // `ScalarValue::LargeList` contains single element `LargeListArray`. DataType::LargeList(field_ref) => ScalarValue::LargeList(Arc::new( - GenericListArray::new_null(field_ref.clone(), 1), + GenericListArray::new_null(Arc::clone(field_ref), 1), )), // `ScalaValue::FixedSizeList` contains single element `FixedSizeList`. 
DataType::FixedSizeList(field_ref, fixed_length) => { ScalarValue::FixedSizeList(Arc::new(FixedSizeListArray::new_null( - field_ref.clone(), + Arc::clone(field_ref), *fixed_length, 1, ))) @@ -3746,11 +3749,11 @@ mod tests { let expected = StructArray::from(vec![ ( Arc::new(Field::new("b", DataType::Boolean, false)), - boolean.clone() as ArrayRef, + Arc::clone(&boolean) as ArrayRef, ), ( Arc::new(Field::new("c", DataType::Int32, false)), - int.clone() as ArrayRef, + Arc::clone(&int) as ArrayRef, ), ]); @@ -3792,11 +3795,11 @@ mod tests { let struct_array = StructArray::from(vec![ ( Arc::new(Field::new("b", DataType::Boolean, false)), - boolean.clone() as ArrayRef, + Arc::clone(&boolean) as ArrayRef, ), ( Arc::new(Field::new("c", DataType::Int32, false)), - int.clone() as ArrayRef, + Arc::clone(&int) as ArrayRef, ), ]); let sv = ScalarValue::Struct(Arc::new(struct_array)); @@ -3810,11 +3813,11 @@ mod tests { let struct_array = StructArray::from(vec![ ( Arc::new(Field::new("b", DataType::Boolean, false)), - boolean.clone() as ArrayRef, + Arc::clone(&boolean) as ArrayRef, ), ( Arc::new(Field::new("c", DataType::Int32, false)), - int.clone() as ArrayRef, + Arc::clone(&int) as ArrayRef, ), ]); @@ -3846,7 +3849,7 @@ mod tests { fn test_to_array_of_size_for_fsl() { let values = Int32Array::from_iter([Some(1), None, Some(2)]); let field = Arc::new(Field::new("item", DataType::Int32, true)); - let arr = FixedSizeListArray::new(field.clone(), 3, Arc::new(values), None); + let arr = FixedSizeListArray::new(Arc::clone(&field), 3, Arc::new(values), None); let sv = ScalarValue::FixedSizeList(Arc::new(arr)); let actual_arr = sv .to_array_of_size(2) @@ -3932,13 +3935,13 @@ mod tests { fn test_iter_to_array_fixed_size_list() { let field = Arc::new(Field::new("item", DataType::Int32, true)); let f1 = Arc::new(FixedSizeListArray::new( - field.clone(), + Arc::clone(&field), 3, Arc::new(Int32Array::from(vec![1, 2, 3])), None, )); let f2 = Arc::new(FixedSizeListArray::new( - field.clone(), + Arc::clone(&field), 3, Arc::new(Int32Array::from(vec![4, 5, 6])), None, @@ -3946,7 +3949,7 @@ mod tests { let f_nulls = Arc::new(FixedSizeListArray::new_null(field, 1, 1)); let scalars = vec![ - ScalarValue::FixedSizeList(f_nulls.clone()), + ScalarValue::FixedSizeList(Arc::clone(&f_nulls)), ScalarValue::FixedSizeList(f1), ScalarValue::FixedSizeList(f2), ScalarValue::FixedSizeList(f_nulls), @@ -4780,7 +4783,7 @@ mod tests { let inner_field = Arc::new(Field::new("item", DataType::Int32, true)); // Test for List - let data_type = &DataType::List(inner_field.clone()); + let data_type = &DataType::List(Arc::clone(&inner_field)); let scalar: ScalarValue = data_type.try_into().unwrap(); let expected = ScalarValue::List( new_null_array(data_type, 1) @@ -4792,7 +4795,7 @@ mod tests { assert!(expected.is_null()); // Test for LargeList - let data_type = &DataType::LargeList(inner_field.clone()); + let data_type = &DataType::LargeList(Arc::clone(&inner_field)); let scalar: ScalarValue = data_type.try_into().unwrap(); let expected = ScalarValue::LargeList( new_null_array(data_type, 1) @@ -4804,7 +4807,7 @@ mod tests { assert!(expected.is_null()); // Test for FixedSizeList(5) - let data_type = &DataType::FixedSizeList(inner_field.clone(), 5); + let data_type = &DataType::FixedSizeList(Arc::clone(&inner_field), 5); let scalar: ScalarValue = data_type.try_into().unwrap(); let expected = ScalarValue::FixedSizeList( new_null_array(data_type, 1) @@ -5212,35 +5215,35 @@ mod tests { let field_f = Arc::new(Field::new("f", DataType::Int64, 
false)); let field_d = Arc::new(Field::new( "D", - DataType::Struct(vec![field_e.clone(), field_f.clone()].into()), + DataType::Struct(vec![Arc::clone(&field_e), Arc::clone(&field_f)].into()), false, )); let struct_array = StructArray::from(vec![ ( - field_e.clone(), + Arc::clone(&field_e), Arc::new(Int16Array::from(vec![2])) as ArrayRef, ), ( - field_f.clone(), + Arc::clone(&field_f), Arc::new(Int64Array::from(vec![3])) as ArrayRef, ), ]); let struct_array = StructArray::from(vec![ ( - field_a.clone(), + Arc::clone(&field_a), Arc::new(Int32Array::from(vec![23])) as ArrayRef, ), ( - field_b.clone(), + Arc::clone(&field_b), Arc::new(BooleanArray::from(vec![false])) as ArrayRef, ), ( - field_c.clone(), + Arc::clone(&field_c), Arc::new(StringArray::from(vec!["Hello"])) as ArrayRef, ), - (field_d.clone(), Arc::new(struct_array) as ArrayRef), + (Arc::clone(&field_d), Arc::new(struct_array) as ArrayRef), ]); let scalar = ScalarValue::Struct(Arc::new(struct_array)); @@ -5250,26 +5253,26 @@ mod tests { let expected = Arc::new(StructArray::from(vec![ ( - field_a.clone(), + Arc::clone(&field_a), Arc::new(Int32Array::from(vec![23, 23])) as ArrayRef, ), ( - field_b.clone(), + Arc::clone(&field_b), Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef, ), ( - field_c.clone(), + Arc::clone(&field_c), Arc::new(StringArray::from(vec!["Hello", "Hello"])) as ArrayRef, ), ( - field_d.clone(), + Arc::clone(&field_d), Arc::new(StructArray::from(vec![ ( - field_e.clone(), + Arc::clone(&field_e), Arc::new(Int16Array::from(vec![2, 2])) as ArrayRef, ), ( - field_f.clone(), + Arc::clone(&field_f), Arc::new(Int64Array::from(vec![3, 3])) as ArrayRef, ), ])) as ArrayRef, @@ -5348,26 +5351,26 @@ mod tests { let expected = Arc::new(StructArray::from(vec![ ( - field_a.clone(), + Arc::clone(&field_a), Arc::new(Int32Array::from(vec![23, 7, -1000])) as ArrayRef, ), ( - field_b.clone(), + Arc::clone(&field_b), Arc::new(BooleanArray::from(vec![false, true, true])) as ArrayRef, ), ( - field_c.clone(), + Arc::clone(&field_c), Arc::new(StringArray::from(vec!["Hello", "World", "!!!!!"])) as ArrayRef, ), ( - field_d.clone(), + Arc::clone(&field_d), Arc::new(StructArray::from(vec![ ( - field_e.clone(), + Arc::clone(&field_e), Arc::new(Int16Array::from(vec![2, 4, 6])) as ArrayRef, ), ( - field_f.clone(), + Arc::clone(&field_f), Arc::new(Int64Array::from(vec![3, 5, 7])) as ArrayRef, ), ])) as ArrayRef, @@ -5431,11 +5434,11 @@ mod tests { let array = as_struct_array(&array).unwrap(); let expected = StructArray::from(vec![ ( - field_a.clone(), + Arc::clone(&field_a), Arc::new(StringArray::from(vec!["First", "Second", "Third"])) as ArrayRef, ), ( - field_primitive_list.clone(), + Arc::clone(&field_primitive_list), Arc::new(ListArray::from_iter_primitive::(vec![ Some(vec![Some(1), Some(2), Some(3)]), Some(vec![Some(4), Some(5)]), @@ -6195,18 +6198,18 @@ mod tests { let struct_value = vec![ ( - fields[0].clone(), + Arc::clone(&fields[0]), Arc::new(UInt64Array::from(vec![Some(1)])) as ArrayRef, ), ( - fields[1].clone(), + Arc::clone(&fields[1]), Arc::new(StructArray::from(vec![ ( - fields_b[0].clone(), + Arc::clone(&fields_b[0]), Arc::new(UInt64Array::from(vec![Some(2)])) as ArrayRef, ), ( - fields_b[1].clone(), + Arc::clone(&fields_b[1]), Arc::new(UInt64Array::from(vec![Some(3)])) as ArrayRef, ), ])) as ArrayRef, @@ -6215,19 +6218,19 @@ mod tests { let struct_value_with_nulls = vec![ ( - fields[0].clone(), + Arc::clone(&fields[0]), Arc::new(UInt64Array::from(vec![Some(1)])) as ArrayRef, ), ( - fields[1].clone(), + 
Arc::clone(&fields[1]), Arc::new(StructArray::from(( vec![ ( - fields_b[0].clone(), + Arc::clone(&fields_b[0]), Arc::new(UInt64Array::from(vec![Some(2)])) as ArrayRef, ), ( - fields_b[1].clone(), + Arc::clone(&fields_b[1]), Arc::new(UInt64Array::from(vec![Some(3)])) as ArrayRef, ), ], diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs index b1a34e4a61d0..4a6a8f0289a7 100644 --- a/datafusion/common/src/scalar/struct_builder.rs +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -144,7 +144,7 @@ impl IntoFieldRef for FieldRef { impl IntoFieldRef for &FieldRef { fn into_field_ref(self) -> FieldRef { - self.clone() + Arc::clone(self) } } diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index dd7b80333cf8..8264b4872592 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -245,7 +245,10 @@ pub fn evaluate_partition_ranges( end: num_rows, }] } else { - let cols: Vec<_> = partition_columns.iter().map(|x| x.values.clone()).collect(); + let cols: Vec<_> = partition_columns + .iter() + .map(|x| Arc::clone(&x.values)) + .collect(); partition(&cols)?.ranges() }) } From 6038f4cfac536dbb54ea2761828f7344a23b94f0 Mon Sep 17 00:00:00 2001 From: wiedld Date: Wed, 10 Jul 2024 11:21:01 -0700 Subject: [PATCH 15/19] Track parquet writer encoding memory usage on MemoryPool (#11345) * feat(11344): track memory used for non-parallel writes * feat(11344): track memory usage during parallel writes * test(11344): create bounded stream for testing * test(11344): test ParquetSink memory reservation * feat(11344): track bytes in file writer * refactor(11344): tweak the ordering to add col bytes to rg_reservation, before selecting shrinking for data bytes flushed * refactor: move each col_reservation and rg_reservation to match the parallelized call stack for col vs rg * test(11344): add memory_limit enforcement test for parquet sink * chore: cleanup to remove unnecessary reservation management steps * fix: fix CI test failure due to file extension rename --- .../src/datasource/file_format/parquet.rs | 165 ++++++++++++++++-- datafusion/core/src/test_util/mod.rs | 36 ++++ datafusion/core/tests/memory_limit/mod.rs | 25 +++ 3 files changed, 216 insertions(+), 10 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 27d783cd89b5..694c94928537 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -48,6 +48,7 @@ use datafusion_common::{ DEFAULT_PARQUET_EXTENSION, }; use datafusion_common_runtime::SpawnedTask; +use datafusion_execution::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation}; use datafusion_execution::TaskContext; use datafusion_physical_expr::expressions::{MaxAccumulator, MinAccumulator}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; @@ -749,9 +750,13 @@ impl DataSink for ParquetSink { parquet_props.writer_options().clone(), ) .await?; + let mut reservation = + MemoryConsumer::new(format!("ParquetSink[{}]", path)) + .register(context.memory_pool()); file_write_tasks.spawn(async move { while let Some(batch) = rx.recv().await { writer.write(&batch).await?; + reservation.try_resize(writer.memory_size())?; } let file_metadata = writer .close() @@ -771,6 +776,7 @@ impl DataSink for ParquetSink { let schema = self.get_writer_schema(); let props = parquet_props.clone(); let 
parallel_options_clone = parallel_options.clone(); + let pool = Arc::clone(context.memory_pool()); file_write_tasks.spawn(async move { let file_metadata = output_single_parquet_file_parallelized( writer, @@ -778,6 +784,7 @@ impl DataSink for ParquetSink { schema, props.writer_options(), parallel_options_clone, + pool, ) .await?; Ok((path, file_metadata)) @@ -818,14 +825,16 @@ impl DataSink for ParquetSink { async fn column_serializer_task( mut rx: Receiver, mut writer: ArrowColumnWriter, -) -> Result { + mut reservation: MemoryReservation, +) -> Result<(ArrowColumnWriter, MemoryReservation)> { while let Some(col) = rx.recv().await { writer.write(&col)?; + reservation.try_resize(writer.memory_size())?; } - Ok(writer) + Ok((writer, reservation)) } -type ColumnWriterTask = SpawnedTask>; +type ColumnWriterTask = SpawnedTask>; type ColSender = Sender; /// Spawns a parallel serialization task for each column @@ -835,6 +844,7 @@ fn spawn_column_parallel_row_group_writer( schema: Arc, parquet_props: Arc, max_buffer_size: usize, + pool: &Arc, ) -> Result<(Vec, Vec)> { let schema_desc = arrow_to_parquet_schema(&schema)?; let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?; @@ -848,7 +858,13 @@ fn spawn_column_parallel_row_group_writer( mpsc::channel::(max_buffer_size); col_array_channels.push(send_array); - let task = SpawnedTask::spawn(column_serializer_task(recieve_array, writer)); + let reservation = + MemoryConsumer::new("ParquetSink(ArrowColumnWriter)").register(pool); + let task = SpawnedTask::spawn(column_serializer_task( + recieve_array, + writer, + reservation, + )); col_writer_tasks.push(task); } @@ -864,7 +880,7 @@ struct ParallelParquetWriterOptions { /// This is the return type of calling [ArrowColumnWriter].close() on each column /// i.e. the Vec of encoded columns which can be appended to a row group -type RBStreamSerializeResult = Result<(Vec, usize)>; +type RBStreamSerializeResult = Result<(Vec, MemoryReservation, usize)>; /// Sends the ArrowArrays in passed [RecordBatch] through the channels to their respective /// parallel column serializers. 
@@ -895,16 +911,22 @@ async fn send_arrays_to_col_writers( fn spawn_rg_join_and_finalize_task( column_writer_tasks: Vec, rg_rows: usize, + pool: &Arc, ) -> SpawnedTask { + let mut rg_reservation = + MemoryConsumer::new("ParquetSink(SerializedRowGroupWriter)").register(pool); + SpawnedTask::spawn(async move { let num_cols = column_writer_tasks.len(); let mut finalized_rg = Vec::with_capacity(num_cols); for task in column_writer_tasks.into_iter() { - let writer = task.join_unwind().await?; + let (writer, _col_reservation) = task.join_unwind().await?; + let encoded_size = writer.get_estimated_total_bytes(); + rg_reservation.grow(encoded_size); finalized_rg.push(writer.close()?); } - Ok((finalized_rg, rg_rows)) + Ok((finalized_rg, rg_reservation, rg_rows)) }) } @@ -922,6 +944,7 @@ fn spawn_parquet_parallel_serialization_task( schema: Arc, writer_props: Arc, parallel_options: ParallelParquetWriterOptions, + pool: Arc, ) -> SpawnedTask> { SpawnedTask::spawn(async move { let max_buffer_rb = parallel_options.max_buffered_record_batches_per_stream; @@ -931,6 +954,7 @@ fn spawn_parquet_parallel_serialization_task( schema.clone(), writer_props.clone(), max_buffer_rb, + &pool, )?; let mut current_rg_rows = 0; @@ -957,6 +981,7 @@ fn spawn_parquet_parallel_serialization_task( let finalize_rg_task = spawn_rg_join_and_finalize_task( column_writer_handles, max_row_group_rows, + &pool, ); serialize_tx.send(finalize_rg_task).await.map_err(|_| { @@ -973,6 +998,7 @@ fn spawn_parquet_parallel_serialization_task( schema.clone(), writer_props.clone(), max_buffer_rb, + &pool, )?; } } @@ -981,8 +1007,11 @@ fn spawn_parquet_parallel_serialization_task( drop(col_array_channels); // Handle leftover rows as final rowgroup, which may be smaller than max_row_group_rows if current_rg_rows > 0 { - let finalize_rg_task = - spawn_rg_join_and_finalize_task(column_writer_handles, current_rg_rows); + let finalize_rg_task = spawn_rg_join_and_finalize_task( + column_writer_handles, + current_rg_rows, + &pool, + ); serialize_tx.send(finalize_rg_task).await.map_err(|_| { DataFusionError::Internal( @@ -1002,9 +1031,13 @@ async fn concatenate_parallel_row_groups( schema: Arc, writer_props: Arc, mut object_store_writer: Box, + pool: Arc, ) -> Result { let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES); + let mut file_reservation = + MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool); + let schema_desc = arrow_to_parquet_schema(schema.as_ref())?; let mut parquet_writer = SerializedFileWriter::new( merged_buff.clone(), @@ -1015,15 +1048,20 @@ async fn concatenate_parallel_row_groups( while let Some(task) = serialize_rx.recv().await { let result = task.join_unwind().await; let mut rg_out = parquet_writer.next_row_group()?; - let (serialized_columns, _cnt) = result?; + let (serialized_columns, mut rg_reservation, _cnt) = result?; for chunk in serialized_columns { chunk.append_to_row_group(&mut rg_out)?; + rg_reservation.free(); + let mut buff_to_flush = merged_buff.buffer.try_lock().unwrap(); + file_reservation.try_resize(buff_to_flush.len())?; + if buff_to_flush.len() > BUFFER_FLUSH_BYTES { object_store_writer .write_all(buff_to_flush.as_slice()) .await?; buff_to_flush.clear(); + file_reservation.try_resize(buff_to_flush.len())?; // will set to zero } } rg_out.close()?; @@ -1034,6 +1072,7 @@ async fn concatenate_parallel_row_groups( object_store_writer.write_all(final_buff.as_slice()).await?; object_store_writer.shutdown().await?; + file_reservation.free(); Ok(file_metadata) } @@ -1048,6 +1087,7 @@ async fn 
output_single_parquet_file_parallelized( output_schema: Arc, parquet_props: &WriterProperties, parallel_options: ParallelParquetWriterOptions, + pool: Arc, ) -> Result { let max_rowgroups = parallel_options.max_parallel_row_groups; // Buffer size of this channel limits maximum number of RowGroups being worked on in parallel @@ -1061,12 +1101,14 @@ async fn output_single_parquet_file_parallelized( output_schema.clone(), arc_props.clone(), parallel_options, + Arc::clone(&pool), ); let file_metadata = concatenate_parallel_row_groups( serialize_rx, output_schema.clone(), arc_props.clone(), object_store_writer, + pool, ) .await?; @@ -1158,8 +1200,10 @@ mod tests { use super::super::test_util::scan_format; use crate::datasource::listing::{ListingTableUrl, PartitionedFile}; use crate::physical_plan::collect; + use crate::test_util::bounded_stream; use std::fmt::{Display, Formatter}; use std::sync::atomic::{AtomicUsize, Ordering}; + use std::time::Duration; use super::*; @@ -2177,4 +2221,105 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn parquet_sink_write_memory_reservation() -> Result<()> { + async fn test_memory_reservation(global: ParquetOptions) -> Result<()> { + let field_a = Field::new("a", DataType::Utf8, false); + let field_b = Field::new("b", DataType::Utf8, false); + let schema = Arc::new(Schema::new(vec![field_a, field_b])); + let object_store_url = ObjectStoreUrl::local_filesystem(); + + let file_sink_config = FileSinkConfig { + object_store_url: object_store_url.clone(), + file_groups: vec![PartitionedFile::new("/tmp".to_string(), 1)], + table_paths: vec![ListingTableUrl::parse("file:///")?], + output_schema: schema.clone(), + table_partition_cols: vec![], + overwrite: true, + keep_partition_by_columns: false, + }; + let parquet_sink = Arc::new(ParquetSink::new( + file_sink_config, + TableParquetOptions { + key_value_metadata: std::collections::HashMap::from([ + ("my-data".to_string(), Some("stuff".to_string())), + ("my-data-bool-key".to_string(), None), + ]), + global, + ..Default::default() + }, + )); + + // create data + let col_a: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"])); + let col_b: ArrayRef = Arc::new(StringArray::from(vec!["baz", "baz"])); + let batch = + RecordBatch::try_from_iter(vec![("a", col_a), ("b", col_b)]).unwrap(); + + // create task context + let task_context = build_ctx(object_store_url.as_ref()); + assert_eq!( + task_context.memory_pool().reserved(), + 0, + "no bytes are reserved yet" + ); + + let mut write_task = parquet_sink.write_all( + Box::pin(RecordBatchStreamAdapter::new( + schema, + bounded_stream(batch, 1000), + )), + &task_context, + ); + + // incrementally poll and check for memory reservation + let mut reserved_bytes = 0; + while futures::poll!(&mut write_task).is_pending() { + reserved_bytes += task_context.memory_pool().reserved(); + tokio::time::sleep(Duration::from_micros(1)).await; + } + assert!( + reserved_bytes > 0, + "should have bytes reserved during write" + ); + assert_eq!( + task_context.memory_pool().reserved(), + 0, + "no leaking byte reservation" + ); + + Ok(()) + } + + let write_opts = ParquetOptions { + allow_single_file_parallelism: false, + ..Default::default() + }; + test_memory_reservation(write_opts) + .await + .expect("should track for non-parallel writes"); + + let row_parallel_write_opts = ParquetOptions { + allow_single_file_parallelism: true, + maximum_parallel_row_group_writers: 10, + maximum_buffered_record_batches_per_stream: 1, + ..Default::default() + }; + 
test_memory_reservation(row_parallel_write_opts) + .await + .expect("should track for row-parallel writes"); + + let col_parallel_write_opts = ParquetOptions { + allow_single_file_parallelism: true, + maximum_parallel_row_group_writers: 1, + maximum_buffered_record_batches_per_stream: 2, + ..Default::default() + }; + test_memory_reservation(col_parallel_write_opts) + .await + .expect("should track for column-parallel writes"); + + Ok(()) + } } diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index 059fa8fc6da7..ba0509f3f51a 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -366,3 +366,39 @@ pub fn register_unbounded_file_with_ordering( ctx.register_table(table_name, Arc::new(StreamTable::new(Arc::new(config))))?; Ok(()) } + +struct BoundedStream { + limit: usize, + count: usize, + batch: RecordBatch, +} + +impl Stream for BoundedStream { + type Item = Result<RecordBatch>; + + fn poll_next( + mut self: Pin<&mut Self>, + _cx: &mut Context<'_>, + ) -> Poll<Option<Self::Item>> { + if self.count >= self.limit { + return Poll::Ready(None); + } + self.count += 1; + Poll::Ready(Some(Ok(self.batch.clone()))) + } +} + +impl RecordBatchStream for BoundedStream { + fn schema(&self) -> SchemaRef { + self.batch.schema() + } +} + +/// Creates a bounded stream for testing purposes. +pub fn bounded_stream(batch: RecordBatch, limit: usize) -> SendableRecordBatchStream { + Box::pin(BoundedStream { + count: 0, + limit, + batch, + }) +} diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index f61ee5d9ab98..f7402357d1c7 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -31,6 +31,7 @@ use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; use futures::StreamExt; use std::any::Any; use std::sync::{Arc, OnceLock}; +use tokio::fs::File; use datafusion::datasource::streaming::StreamingTable; use datafusion::datasource::{MemTable, TableProvider}; @@ -323,6 +324,30 @@ async fn oom_recursive_cte() { .await } +#[tokio::test] +async fn oom_parquet_sink() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.into_path().join("test.parquet"); + let _ = File::create(path.clone()).await.unwrap(); + + TestCase::new() + .with_query(format!( + " + COPY (select * from t) + TO '{}' + STORED AS PARQUET OPTIONS (compression 'uncompressed'); + ", + path.to_string_lossy() + )) + .with_expected_errors(vec![ + // TODO: update error handling in ParquetSink + "Unable to send array to writer!", + ]) + .with_memory_limit(200_000) + .run() + .await +} + /// Run the query with the specified memory limit, /// and verifies the expected errors are returned #[derive(Clone, Debug)] From 32cb3c5a54bd0297d473792c8a3b0e7fd51c2e3b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 10 Jul 2024 14:21:44 -0400 Subject: [PATCH 16/19] Minor: remove clones and unnecessary Arcs in `from_substrait_rex` (#11337) --- .../substrait/src/logical_plan/consumer.rs | 146 +++++++----------- 1 file changed, 59 insertions(+), 87 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 89a6dde51e42..a4f724202475 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -411,11 +411,11 @@ pub async fn from_substrait_rel( from_substrait_rex(ctx, e, input.clone().schema(), extensions) .await?; // if the expression is WindowFunction, wrap in a Window relation -
if let Expr::WindowFunction(_) = x.as_ref() { + if let Expr::WindowFunction(_) = &x { // Adding the same expression here and in the project below // works because the project's builder uses columnize_expr(..) // to transform it into a column reference - input = input.window(vec![x.as_ref().clone()])? + input = input.window(vec![x.clone()])? } // Ensure the expression has a unique display name, so that project's // validate_unique_names doesn't fail @@ -426,12 +426,12 @@ pub async fn from_substrait_rel( new_name = format!("{}__temp__{}", name, i); i += 1; } - names.insert(new_name.clone()); if new_name != name { - exprs.push(x.as_ref().clone().alias(new_name.clone())); + exprs.push(x.alias(new_name.clone())); } else { - exprs.push(x.as_ref().clone()); + exprs.push(x); } + names.insert(new_name); } input.project(exprs)?.build() } else { @@ -447,7 +447,7 @@ pub async fn from_substrait_rel( let expr = from_substrait_rex(ctx, condition, input.schema(), extensions) .await?; - input.filter(expr.as_ref().clone())?.build() + input.filter(expr)?.build() } else { not_impl_err!("Filter without an condition is not valid") } @@ -499,7 +499,7 @@ pub async fn from_substrait_rel( let x = from_substrait_rex(ctx, e, input.schema(), extensions) .await?; - group_expr.push(x.as_ref().clone()); + group_expr.push(x); } } _ => { @@ -514,7 +514,7 @@ pub async fn from_substrait_rel( extensions, ) .await?; - grouping_set.push(x.as_ref().clone()); + grouping_set.push(x); } grouping_sets.push(grouping_set); } @@ -532,9 +532,7 @@ pub async fn from_substrait_rel( let filter = match &m.filter { Some(fil) => Some(Box::new( from_substrait_rex(ctx, fil, input.schema(), extensions) - .await? - .as_ref() - .clone(), + .await?, )), None => None, }; @@ -931,7 +929,7 @@ pub async fn from_substrait_sorts( }; let (asc, nulls_first) = asc_nullfirst.unwrap(); sorts.push(Expr::Sort(Sort { - expr: Box::new(expr.as_ref().clone()), + expr: Box::new(expr), asc, nulls_first, })); @@ -949,7 +947,7 @@ pub async fn from_substrait_rex_vec( let mut expressions: Vec = vec![]; for expr in exprs { let expression = from_substrait_rex(ctx, expr, input_schema, extensions).await?; - expressions.push(expression.as_ref().clone()); + expressions.push(expression); } Ok(expressions) } @@ -969,7 +967,7 @@ pub async fn from_substrait_func_args( } _ => not_impl_err!("Function argument non-Value type not supported"), }; - args.push(arg_expr?.as_ref().clone()); + args.push(arg_expr?); } Ok(args) } @@ -1028,17 +1026,15 @@ pub async fn from_substrait_rex( e: &Expression, input_schema: &DFSchema, extensions: &HashMap, -) -> Result> { +) -> Result { match &e.rex_type { Some(RexType::SingularOrList(s)) => { let substrait_expr = s.value.as_ref().unwrap(); let substrait_list = s.options.as_ref(); - Ok(Arc::new(Expr::InList(InList { + Ok(Expr::InList(InList { expr: Box::new( from_substrait_rex(ctx, substrait_expr, input_schema, extensions) - .await? - .as_ref() - .clone(), + .await?, ), list: from_substrait_rex_vec( ctx, @@ -1048,11 +1044,11 @@ pub async fn from_substrait_rex( ) .await?, negated: false, - }))) + })) + } + Some(RexType::Selection(field_ref)) => { + Ok(from_substrait_field_reference(field_ref, input_schema)?) 
} - Some(RexType::Selection(field_ref)) => Ok(Arc::new( - from_substrait_field_reference(field_ref, input_schema)?, - )), Some(RexType::IfThen(if_then)) => { // Parse `ifs` // If the first element does not have a `then` part, then we can assume it's a base expression @@ -1069,9 +1065,7 @@ pub async fn from_substrait_rex( input_schema, extensions, ) - .await? - .as_ref() - .clone(), + .await?, )); continue; } @@ -1084,9 +1078,7 @@ pub async fn from_substrait_rex( input_schema, extensions, ) - .await? - .as_ref() - .clone(), + .await?, ), Box::new( from_substrait_rex( @@ -1095,27 +1087,22 @@ pub async fn from_substrait_rex( input_schema, extensions, ) - .await? - .as_ref() - .clone(), + .await?, ), )); } // Parse `else` let else_expr = match &if_then.r#else { Some(e) => Some(Box::new( - from_substrait_rex(ctx, e, input_schema, extensions) - .await? - .as_ref() - .clone(), + from_substrait_rex(ctx, e, input_schema, extensions).await?, )), None => None, }; - Ok(Arc::new(Expr::Case(Case { + Ok(Expr::Case(Case { expr, when_then_expr, else_expr, - }))) + })) } Some(RexType::ScalarFunction(f)) => { let Some(fn_name) = extensions.get(&f.function_reference) else { @@ -1133,8 +1120,9 @@ pub async fn from_substrait_rex( // try to first match the requested function into registered udfs, then built-in ops // and finally built-in expressions if let Some(func) = ctx.state().scalar_functions().get(fn_name) { - Ok(Arc::new(Expr::ScalarFunction( - expr::ScalarFunction::new_udf(func.to_owned(), args), + Ok(Expr::ScalarFunction(expr::ScalarFunction::new_udf( + func.to_owned(), + args, ))) } else if let Some(op) = name_to_op(fn_name) { if f.arguments.len() < 2 { @@ -1147,17 +1135,14 @@ pub async fn from_substrait_rex( // In those cases we iterate through all the arguments, applying the binary expression against them all let combined_expr = args .into_iter() - .fold(None, |combined_expr: Option>, arg: Expr| { + .fold(None, |combined_expr: Option, arg: Expr| { Some(match combined_expr { - Some(expr) => Arc::new(Expr::BinaryExpr(BinaryExpr { - left: Box::new( - Arc::try_unwrap(expr) - .unwrap_or_else(|arc: Arc| (*arc).clone()), - ), // Avoid cloning if possible + Some(expr) => Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr), op, right: Box::new(arg), - })), - None => Arc::new(arg), + }), + None => arg, }) }) .unwrap(); @@ -1171,10 +1156,10 @@ pub async fn from_substrait_rex( } Some(RexType::Literal(lit)) => { let scalar_value = from_substrait_literal_without_names(lit)?; - Ok(Arc::new(Expr::Literal(scalar_value))) + Ok(Expr::Literal(scalar_value)) } Some(RexType::Cast(cast)) => match cast.as_ref().r#type.as_ref() { - Some(output_type) => Ok(Arc::new(Expr::Cast(Cast::new( + Some(output_type) => Ok(Expr::Cast(Cast::new( Box::new( from_substrait_rex( ctx, @@ -1182,12 +1167,10 @@ pub async fn from_substrait_rex( input_schema, extensions, ) - .await? 
- .as_ref() - .clone(), + .await?, ), from_substrait_type_without_names(output_type)?, - )))), + ))), None => substrait_err!("Cast expression without output type is not allowed"), }, Some(RexType::WindowFunction(window)) => { @@ -1232,7 +1215,7 @@ pub async fn from_substrait_rex( } } }; - Ok(Arc::new(Expr::WindowFunction(expr::WindowFunction { + Ok(Expr::WindowFunction(expr::WindowFunction { fun, args: from_substrait_func_args( ctx, @@ -1255,7 +1238,7 @@ pub async fn from_substrait_rex( from_substrait_bound(&window.upper_bound, false)?, ), null_treatment: None, - }))) + })) } Some(RexType::Subquery(subquery)) => match &subquery.as_ref().subquery_type { Some(subquery_type) => match subquery_type { @@ -1270,7 +1253,7 @@ pub async fn from_substrait_rex( from_substrait_rel(ctx, haystack_expr, extensions) .await?; let outer_refs = haystack_expr.all_out_ref_exprs(); - Ok(Arc::new(Expr::InSubquery(InSubquery { + Ok(Expr::InSubquery(InSubquery { expr: Box::new( from_substrait_rex( ctx, @@ -1278,16 +1261,14 @@ pub async fn from_substrait_rex( input_schema, extensions, ) - .await? - .as_ref() - .clone(), + .await?, ), subquery: Subquery { subquery: Arc::new(haystack_expr), outer_ref_columns: outer_refs, }, negated: false, - }))) + })) } else { substrait_err!("InPredicate Subquery type must have a Haystack expression") } @@ -1301,10 +1282,10 @@ pub async fn from_substrait_rex( ) .await?; let outer_ref_columns = plan.all_out_ref_exprs(); - Ok(Arc::new(Expr::ScalarSubquery(Subquery { + Ok(Expr::ScalarSubquery(Subquery { subquery: Arc::new(plan), outer_ref_columns, - }))) + })) } SubqueryType::SetPredicate(predicate) => { match predicate.predicate_op() { @@ -1318,13 +1299,13 @@ pub async fn from_substrait_rex( ) .await?; let outer_ref_columns = plan.all_out_ref_exprs(); - Ok(Arc::new(Expr::Exists(Exists::new( + Ok(Expr::Exists(Exists::new( Subquery { subquery: Arc::new(plan), outer_ref_columns, }, false, - )))) + ))) } other_type => substrait_err!( "unimplemented type {:?} for set predicate", @@ -1337,7 +1318,7 @@ pub async fn from_substrait_rex( } }, None => { - substrait_err!("Subquery experssion without SubqueryType is not allowed") + substrait_err!("Subquery expression without SubqueryType is not allowed") } }, _ => not_impl_err!("unsupported rex_type"), @@ -2001,7 +1982,7 @@ impl BuiltinExprBuilder { f: &ScalarFunction, input_schema: &DFSchema, extensions: &HashMap, - ) -> Result> { + ) -> Result { match self.expr_name.as_str() { "like" => { Self::build_like_expr(ctx, false, f, input_schema, extensions).await @@ -2026,17 +2007,15 @@ impl BuiltinExprBuilder { f: &ScalarFunction, input_schema: &DFSchema, extensions: &HashMap, - ) -> Result> { + ) -> Result { if f.arguments.len() != 1 { return substrait_err!("Expect one argument for {fn_name} expr"); } let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else { return substrait_err!("Invalid arguments type for {fn_name} expr"); }; - let arg = from_substrait_rex(ctx, expr_substrait, input_schema, extensions) - .await? 
- .as_ref() - .clone(); + let arg = + from_substrait_rex(ctx, expr_substrait, input_schema, extensions).await?; let arg = Box::new(arg); let expr = match fn_name { @@ -2053,7 +2032,7 @@ impl BuiltinExprBuilder { _ => return not_impl_err!("Unsupported builtin expression: {}", fn_name), }; - Ok(Arc::new(expr)) + Ok(expr) } async fn build_like_expr( @@ -2062,7 +2041,7 @@ impl BuiltinExprBuilder { f: &ScalarFunction, input_schema: &DFSchema, extensions: &HashMap, - ) -> Result> { + ) -> Result { let fn_name = if case_insensitive { "ILIKE" } else { "LIKE" }; if f.arguments.len() != 2 && f.arguments.len() != 3 { return substrait_err!("Expect two or three arguments for `{fn_name}` expr"); @@ -2071,18 +2050,13 @@ impl BuiltinExprBuilder { let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else { return substrait_err!("Invalid arguments type for `{fn_name}` expr"); }; - let expr = from_substrait_rex(ctx, expr_substrait, input_schema, extensions) - .await? - .as_ref() - .clone(); + let expr = + from_substrait_rex(ctx, expr_substrait, input_schema, extensions).await?; let Some(ArgType::Value(pattern_substrait)) = &f.arguments[1].arg_type else { return substrait_err!("Invalid arguments type for `{fn_name}` expr"); }; let pattern = - from_substrait_rex(ctx, pattern_substrait, input_schema, extensions) - .await? - .as_ref() - .clone(); + from_substrait_rex(ctx, pattern_substrait, input_schema, extensions).await?; // Default case: escape character is Literal(Utf8(None)) let escape_char = if f.arguments.len() == 3 { @@ -2093,9 +2067,7 @@ impl BuiltinExprBuilder { let escape_char_expr = from_substrait_rex(ctx, escape_char_substrait, input_schema, extensions) - .await? - .as_ref() - .clone(); + .await?; match escape_char_expr { Expr::Literal(ScalarValue::Utf8(escape_char_string)) => { @@ -2112,12 +2084,12 @@ impl BuiltinExprBuilder { None }; - Ok(Arc::new(Expr::Like(Like { + Ok(Expr::Like(Like { negated: false, expr: Box::new(expr), pattern: Box::new(pattern), escape_char, case_insensitive, - }))) + })) } } From cc7484e0b73fe0b36e5f76741399c95e5e7ff1c7 Mon Sep 17 00:00:00 2001 From: June <61218022+itsjunetime@users.noreply.github.com> Date: Wed, 10 Jul 2024 13:11:48 -0600 Subject: [PATCH 17/19] Minor: Change no-statement error message to be clearer (#11394) * Change no-statement error message to be clearer and add tests for said change * Run fmt to pass CI --- .../core/src/execution/session_state.rs | 2 +- datafusion/core/tests/sql/sql_api.rs | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index c123ebb22ecb..60745076c242 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -555,7 +555,7 @@ impl SessionState { } let statement = statements.pop_front().ok_or_else(|| { DataFusionError::NotImplemented( - "The context requires a statement!".to_string(), + "No SQL statements were provided in the query string".to_string(), ) })?; Ok(statement) diff --git a/datafusion/core/tests/sql/sql_api.rs b/datafusion/core/tests/sql/sql_api.rs index 4a6424fc24b6..e7c40d2c8aa8 100644 --- a/datafusion/core/tests/sql/sql_api.rs +++ b/datafusion/core/tests/sql/sql_api.rs @@ -113,6 +113,40 @@ async fn unsupported_statement_returns_error() { ctx.sql_with_options(sql, options).await.unwrap(); } +#[tokio::test] +async fn empty_statement_returns_error() { + let ctx = SessionContext::new(); + ctx.sql("CREATE TABLE test (x 
int)").await.unwrap(); + + let state = ctx.state(); + + // Give it an empty string which contains no statements + let plan_res = state.create_logical_plan("").await; + assert_eq!( + plan_res.unwrap_err().strip_backtrace(), + "This feature is not implemented: No SQL statements were provided in the query string" + ); +} + +#[tokio::test] +async fn multiple_statements_returns_error() { + let ctx = SessionContext::new(); + ctx.sql("CREATE TABLE test (x int)").await.unwrap(); + + let state = ctx.state(); + + // Give it a string that contains multiple statements + let plan_res = state + .create_logical_plan( + "INSERT INTO test (x) VALUES (1); INSERT INTO test (x) VALUES (2)", + ) + .await; + assert_eq!( + plan_res.unwrap_err().strip_backtrace(), + "This feature is not implemented: The context currently only supports a single SQL statement" + ); +} + #[tokio::test] async fn ddl_can_not_be_planned_by_session_state() { let ctx = SessionContext::new(); From d3f63728d222cc5cf30cf03a12ec9a0b41399b18 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Thu, 11 Jul 2024 07:32:03 +0800 Subject: [PATCH 18/19] Change `array_agg` to return `null` on no input rather than empty list (#11299) * change array agg semantic for empty result Signed-off-by: jayzhan211 * return null Signed-off-by: jayzhan211 * fix test Signed-off-by: jayzhan211 * fix order sensitive Signed-off-by: jayzhan211 * fix test Signed-off-by: jayzhan211 * add more test Signed-off-by: jayzhan211 * fix null Signed-off-by: jayzhan211 * fix multi-phase case Signed-off-by: jayzhan211 * add comment Signed-off-by: jayzhan211 * cleanup Signed-off-by: jayzhan211 * fix clone Signed-off-by: jayzhan211 --------- Signed-off-by: jayzhan211 --- datafusion/common/src/scalar/mod.rs | 10 ++ datafusion/core/tests/dataframe/mod.rs | 2 +- datafusion/core/tests/sql/aggregates.rs | 2 +- datafusion/expr/src/aggregate_function.rs | 2 +- .../physical-expr/src/aggregate/array_agg.rs | 17 +- .../src/aggregate/array_agg_distinct.rs | 11 +- .../src/aggregate/array_agg_ordered.rs | 12 +- .../physical-expr/src/aggregate/build_in.rs | 4 +- .../sqllogictest/test_files/aggregate.slt | 155 +++++++++++++----- 9 files changed, 161 insertions(+), 54 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index c8f21788cbbd..6c03e8698e80 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -1984,6 +1984,16 @@ impl ScalarValue { Self::new_list(values, data_type, true) } + /// Create ListArray with Null with specific data type + /// + /// - new_null_list(i32, nullable, 1): `ListArray[NULL]` + pub fn new_null_list(data_type: DataType, nullable: bool, null_len: usize) -> Self { + let data_type = DataType::List(Field::new_list_field(data_type, nullable).into()); + Self::List(Arc::new(ListArray::from(ArrayData::new_null( + &data_type, null_len, + )))) + } + /// Converts `IntoIterator` where each element has type corresponding to /// `data_type`, to a [`ListArray`]. 
     ///
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index 2d1904d9e166..f1d57c44293b 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -1388,7 +1388,7 @@ async fn unnest_with_redundant_columns() -> Result<()> {
     let expected = vec![
         "Projection: shapes.shape_id [shape_id:UInt32]",
         "  Unnest: lists[shape_id2] structs[] [shape_id:UInt32, shape_id2:UInt32;N]",
-        "    Aggregate: groupBy=[[shapes.shape_id]], aggr=[[ARRAY_AGG(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: \"item\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} })]",
+        "    Aggregate: groupBy=[[shapes.shape_id]], aggr=[[ARRAY_AGG(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: \"item\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} });N]",
         "    TableScan: shapes projection=[shape_id] [shape_id:UInt32]",
     ];
 
diff --git a/datafusion/core/tests/sql/aggregates.rs b/datafusion/core/tests/sql/aggregates.rs
index e503b74992c3..86032dc9bc96 100644
--- a/datafusion/core/tests/sql/aggregates.rs
+++ b/datafusion/core/tests/sql/aggregates.rs
@@ -37,7 +37,7 @@ async fn csv_query_array_agg_distinct() -> Result<()> {
         Schema::new(vec![Field::new_list(
             "ARRAY_AGG(DISTINCT aggregate_test_100.c2)",
             Field::new("item", DataType::UInt32, false),
-            false
+            true
         ),])
     );
 
diff --git a/datafusion/expr/src/aggregate_function.rs b/datafusion/expr/src/aggregate_function.rs
index 23e98714dfa4..3cae78eaed9b 100644
--- a/datafusion/expr/src/aggregate_function.rs
+++ b/datafusion/expr/src/aggregate_function.rs
@@ -118,7 +118,7 @@ impl AggregateFunction {
     pub fn nullable(&self) -> Result<bool> {
         match self {
             AggregateFunction::Max | AggregateFunction::Min => Ok(true),
-            AggregateFunction::ArrayAgg => Ok(false),
+            AggregateFunction::ArrayAgg => Ok(true),
         }
     }
 }
diff --git a/datafusion/physical-expr/src/aggregate/array_agg.rs b/datafusion/physical-expr/src/aggregate/array_agg.rs
index 634a0a017903..38a973802933 100644
--- a/datafusion/physical-expr/src/aggregate/array_agg.rs
+++ b/datafusion/physical-expr/src/aggregate/array_agg.rs
@@ -71,7 +71,7 @@ impl AggregateExpr for ArrayAgg {
             &self.name,
             // This should be the same as return type of AggregateFunction::ArrayAgg
             Field::new("item", self.input_data_type.clone(), self.nullable),
-            false,
+            true,
         ))
     }
 
@@ -86,7 +86,7 @@ impl AggregateExpr for ArrayAgg {
         Ok(vec![Field::new_list(
             format_state_name(&self.name, "array_agg"),
             Field::new("item", self.input_data_type.clone(), self.nullable),
-            false,
+            true,
         )])
     }
 
@@ -137,8 +137,11 @@ impl Accumulator for ArrayAggAccumulator {
             return Ok(());
         }
         assert!(values.len() == 1, "array_agg can only take 1 param!");
+
         let val = Arc::clone(&values[0]);
-        self.values.push(val);
+        if val.len() > 0 {
+            self.values.push(val);
+        }
         Ok(())
     }
 
@@ -162,13 +165,15 @@ impl Accumulator for ArrayAggAccumulator {
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
         // Transform Vec<ArrayRef> to ListArr
-
         let element_arrays: Vec<&dyn Array> =
             self.values.iter().map(|a| a.as_ref()).collect();
 
         if element_arrays.is_empty() {
-            let arr = ScalarValue::new_list(&[], &self.datatype, self.nullable);
-            return Ok(ScalarValue::List(arr));
+            return Ok(ScalarValue::new_null_list(
+                self.datatype.clone(),
+                self.nullable,
+                1,
+            ));
         }
 
         let concated_array = arrow::compute::concat(&element_arrays)?;
diff --git a/datafusion/physical-expr/src/aggregate/array_agg_distinct.rs b/datafusion/physical-expr/src/aggregate/array_agg_distinct.rs
index a59d85e84a20..368d11d7421a 100644
--- a/datafusion/physical-expr/src/aggregate/array_agg_distinct.rs
+++ b/datafusion/physical-expr/src/aggregate/array_agg_distinct.rs
@@ -75,7 +75,7 @@ impl AggregateExpr for DistinctArrayAgg {
             &self.name,
             // This should be the same as return type of AggregateFunction::ArrayAgg
             Field::new("item", self.input_data_type.clone(), self.nullable),
-            false,
+            true,
         ))
     }
 
@@ -90,7 +90,7 @@ impl AggregateExpr for DistinctArrayAgg {
         Ok(vec![Field::new_list(
             format_state_name(&self.name, "distinct_array_agg"),
             Field::new("item", self.input_data_type.clone(), self.nullable),
-            false,
+            true,
         )])
     }
 
@@ -165,6 +165,13 @@ impl Accumulator for DistinctArrayAggAccumulator {
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
         let values: Vec<ScalarValue> = self.values.iter().cloned().collect();
+        if values.is_empty() {
+            return Ok(ScalarValue::new_null_list(
+                self.datatype.clone(),
+                self.nullable,
+                1,
+            ));
+        }
         let arr = ScalarValue::new_list(&values, &self.datatype, self.nullable);
         Ok(ScalarValue::List(arr))
     }
diff --git a/datafusion/physical-expr/src/aggregate/array_agg_ordered.rs b/datafusion/physical-expr/src/aggregate/array_agg_ordered.rs
index a64d97637c3b..d44811192f66 100644
--- a/datafusion/physical-expr/src/aggregate/array_agg_ordered.rs
+++ b/datafusion/physical-expr/src/aggregate/array_agg_ordered.rs
@@ -92,7 +92,7 @@ impl AggregateExpr for OrderSensitiveArrayAgg {
             &self.name,
             // This should be the same as return type of AggregateFunction::ArrayAgg
             Field::new("item", self.input_data_type.clone(), self.nullable),
-            false,
+            true,
         ))
     }
 
@@ -111,7 +111,7 @@ impl AggregateExpr for OrderSensitiveArrayAgg {
         let mut fields = vec![Field::new_list(
             format_state_name(&self.name, "array_agg"),
             Field::new("item", self.input_data_type.clone(), self.nullable),
-            false, // This should be the same as field()
+            true, // This should be the same as field()
         )];
         let orderings = ordering_fields(&self.ordering_req, &self.order_by_data_types);
         fields.push(Field::new_list(
@@ -309,6 +309,14 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator {
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
+        if self.values.is_empty() {
+            return Ok(ScalarValue::new_null_list(
+                self.datatypes[0].clone(),
+                self.nullable,
+                1,
+            ));
+        }
+
         let values = self.values.clone();
         let array = if self.reverse {
             ScalarValue::new_list_from_iter(
diff --git a/datafusion/physical-expr/src/aggregate/build_in.rs b/datafusion/physical-expr/src/aggregate/build_in.rs
index d4cd3d51d174..68c9b4859f1f 100644
--- a/datafusion/physical-expr/src/aggregate/build_in.rs
+++ b/datafusion/physical-expr/src/aggregate/build_in.rs
@@ -147,7 +147,7 @@ mod tests {
                     Field::new_list(
                         "c1",
                         Field::new("item", data_type.clone(), true),
-                        false,
+                        true,
                     ),
                     result_agg_phy_exprs.field().unwrap()
                 );
@@ -167,7 +167,7 @@ mod tests {
                     Field::new_list(
                         "c1",
                         Field::new("item", data_type.clone(), true),
-                        false,
+                        true,
                     ),
                     result_agg_phy_exprs.field().unwrap()
                 );
diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
index e891093c8156..7dd1ea82b327 100644
--- a/datafusion/sqllogictest/test_files/aggregate.slt
+++ b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -1694,7 +1694,7 @@ SELECT array_agg(c13) FROM (SELECT * FROM aggregate_test_100 ORDER BY c13 LIMIT
 query ?
 SELECT array_agg(c13) FROM (SELECT * FROM aggregate_test_100 LIMIT 0) test
 ----
-[]
+NULL
 
 # csv_query_array_agg_one
 query ?
@@ -1753,31 +1753,12 @@ NULL 4 29 1.260869565217 123 -117 23
 NULL 5 -194 -13.857142857143 118 -101 14
 NULL NULL 781 7.81 125 -117 100
 
-# TODO: array_agg_distinct output is non-deterministic -- rewrite with array_sort(list_sort)
-# unnest is also not available, so manually unnesting via CROSS JOIN
-# additional count(1) forces array_agg_distinct instead of array_agg over aggregated by c2 data
-#
+# select with count to force array_agg_distinct function, since a single distinct expression is converted to group by by the optimizer
 # csv_query_array_agg_distinct
-query III
-WITH indices AS (
-  SELECT 1 AS idx UNION ALL
-  SELECT 2 AS idx UNION ALL
-  SELECT 3 AS idx UNION ALL
-  SELECT 4 AS idx UNION ALL
-  SELECT 5 AS idx
-)
-SELECT data.arr[indices.idx] as element, array_length(data.arr) as array_len, dummy
-FROM (
-  SELECT array_agg(distinct c2) as arr, count(1) as dummy FROM aggregate_test_100
-) data
-  CROSS JOIN indices
-ORDER BY 1
-----
-1 5 100
-2 5 100
-3 5 100
-4 5 100
-5 5 100
+query ?I
+SELECT array_sort(array_agg(distinct c2)), count(1) FROM aggregate_test_100
+----
+[1, 2, 3, 4, 5] 100
 
 # aggregate_time_min_and_max
 query TT
@@ -2732,6 +2713,16 @@ SELECT COUNT(DISTINCT c1) FROM test
 
 # TODO: aggregate_with_alias
 
+# test_approx_percentile_cont_decimal_support
+query TI
+SELECT c1, approx_percentile_cont(c2, cast(0.85 as decimal(10,2))) apc FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
+----
+a 4
+b 5
+c 4
+d 4
+e 4
+
 # array_agg_zero
 query ?
 SELECT ARRAY_AGG([])
@@ -2744,28 +2735,114 @@ SELECT ARRAY_AGG([1])
 ----
 [[1]]
 
-# test_approx_percentile_cont_decimal_support
-query TI
-SELECT c1, approx_percentile_cont(c2, cast(0.85 as decimal(10,2))) apc FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
-----
-a 4
-b 5
-c 4
-d 4
-e 4
+# test array_agg with no row qualified
+statement ok
+create table t(a int, b float, c bigint) as values (1, 1.2, 2);
+
+# returns NULL, follows DuckDB's behaviour
+query ?
+select array_agg(a) from t where a > 2;
+----
+NULL
 
+query ?
+select array_agg(b) from t where b > 3.1;
+----
+NULL
 
-# array_agg_zero
 query ?
-SELECT ARRAY_AGG([]);
+select array_agg(c) from t where c > 3;
 ----
-[[]]
+NULL
 
-# array_agg_one
+query ?I
+select array_agg(c), count(1) from t where c > 3;
+----
+NULL 0
+
+# returns 0 rows if group by is applied, follows DuckDB's behaviour
 query ?
-SELECT ARRAY_AGG([1]);
+select array_agg(a) from t where a > 3 group by a;
 ----
-[[1]]
+
+query ?I
+select array_agg(a), count(1) from t where a > 3 group by a;
+----
+
+# returns NULL, follows DuckDB's behaviour
+query ?
+select array_agg(distinct a) from t where a > 3;
+----
+NULL
+
+query ?I
+select array_agg(distinct a), count(1) from t where a > 3;
+----
+NULL 0
+
+# returns 0 rows if group by is applied, follows DuckDB's behaviour
+query ?
+select array_agg(distinct a) from t where a > 3 group by a;
+----
+
+query ?I
+select array_agg(distinct a), count(1) from t where a > 3 group by a;
+----
+
+# test order sensitive array agg
+query ?
+select array_agg(a order by a) from t where a > 3;
+----
+NULL
+
+query ?
+select array_agg(a order by a) from t where a > 3 group by a;
+----
+
+query ?I
+select array_agg(a order by a), count(1) from t where a > 3 group by a;
+----
+
+statement ok
+drop table t;
+
+# test with no values
+statement ok
+create table t(a int, b float, c bigint);
+
+query ?
+select array_agg(a) from t;
+----
+NULL
+
+query ?
+select array_agg(b) from t;
+----
+NULL
+
+query ?
+select array_agg(c) from t;
+----
+NULL
+
+query ?I
+select array_agg(distinct a), count(1) from t;
+----
+NULL 0
+
+query ?I
+select array_agg(distinct b), count(1) from t;
+----
+NULL 0
+
+query ?I
+select array_agg(distinct b), count(1) from t;
+----
+NULL 0
+
+statement ok
+drop table t;
+
 # array_agg_i32
 statement ok

From 7a23ea9bce32dc8ae195caa8ca052673031c06c9 Mon Sep 17 00:00:00 2001
From: Jonah Gao <jonahgao@msn.com>
Date: Thu, 11 Jul 2024 09:38:15 +0800
Subject: [PATCH 19/19] Minor: return "not supported" for `COUNT DISTINCT`
 with multiple arguments (#11391)

* Minor: return "not supported" for COUNT DISTINCT with multiple arguments

* update condition

---
 datafusion/functions-aggregate/src/count.rs      | 6 +++++-
 datafusion/sqllogictest/test_files/aggregate.slt | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs
index bd0155df0271..0a667d35dce5 100644
--- a/datafusion/functions-aggregate/src/count.rs
+++ b/datafusion/functions-aggregate/src/count.rs
@@ -37,7 +37,7 @@ use arrow::{
     buffer::BooleanBuffer,
 };
 use datafusion_common::{
-    downcast_value, internal_err, DataFusionError, Result, ScalarValue,
+    downcast_value, internal_err, not_impl_err, DataFusionError, Result, ScalarValue,
 };
 use datafusion_expr::function::StateFieldsArgs;
 use datafusion_expr::{
@@ -138,6 +138,10 @@ impl AggregateUDFImpl for Count {
             return Ok(Box::new(CountAccumulator::new()));
         }
 
+        if acc_args.input_exprs.len() > 1 {
+            return not_impl_err!("COUNT DISTINCT with multiple arguments");
+        }
+
         let data_type = acc_args.input_type;
         Ok(match data_type {
             // try and use a specialized accumulator if possible, otherwise fall back to generic accumulator
diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
index 7dd1ea82b327..6fafc0a74110 100644
--- a/datafusion/sqllogictest/test_files/aggregate.slt
+++ b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -2019,6 +2019,10 @@ SELECT count(c1, c2) FROM test
 ----
 3
 
+# count(distinct) with multiple arguments
+query error DataFusion error: This feature is not implemented: COUNT DISTINCT with multiple arguments
+SELECT count(distinct c1, c2) FROM test
+
 # count_null
 query III
 SELECT count(null), count(null, null), count(distinct null) FROM test
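
The last two patches change user-visible query behavior, so the combined effect is worth seeing end to end. The standalone sketch below is not part of the patch series: it assumes a DataFusion build containing patches 18 and 19, and the table name `t` and its schema are illustrative only.

use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    ctx.sql("CREATE TABLE t(a INT) AS VALUES (1), (2)").await?;

    // Patch 18: array_agg over zero qualifying rows now evaluates to NULL
    // rather than an empty list, matching DuckDB's behaviour.
    let batches = ctx
        .sql("SELECT array_agg(a) FROM t WHERE a > 100")
        .await?
        .collect()
        .await?;
    assert_eq!(batches[0].num_rows(), 1); // a single row holding NULL

    // Patch 19: COUNT DISTINCT with more than one argument now surfaces a
    // clear "not implemented" error, as exercised by the sqllogictest above.
    let res = async {
        ctx.sql("SELECT count(DISTINCT a, a) FROM t")
            .await?
            .collect()
            .await
    }
    .await;
    assert!(res.is_err());
    Ok(())
}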