diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 059f86ce110f..d9bebbdc0800 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -1997,7 +1997,7 @@ mod tests { assert_contains!( &display, - "pruning_predicate=CASE WHEN c1_null_count@2 = c1_row_count@3 THEN false ELSE c1_min@0 != bar OR bar != c1_max@1 END" + "pruning_predicate=c1_null_count@2 != c1_row_count@3 AND (c1_min@0 != bar OR bar != c1_max@1)" ); assert_contains!(&display, r#"predicate=c1@0 != bar"#); diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index eb03b337779c..368a4974eabe 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -290,7 +290,12 @@ pub trait PruningStatistics { /// predicate can never possibly be true). The container can be pruned (skipped) /// entirely. /// -/// Note that in order to be correct, `PruningPredicate` must return false +/// While `PruningPredicate` will never return a `NULL` value, the +/// rewritten predicate (as returned by `build_predicate_expression` and used internally +/// by `PruningPredicate`) may evaluate to `NULL` when some of the min/max values +/// or null / row counts are not known. +/// +/// In order to be correct, `PruningPredicate` must return false /// **only** if it can determine that for all rows in the container, the /// predicate could never evaluate to `true` (always evaluates to either `NULL` /// or `false`). @@ -330,12 +335,12 @@ pub trait PruningStatistics { /// /// Original Predicate | Rewritten Predicate /// ------------------ | -------------------- -/// `x = 5` | `CASE WHEN x_null_count = x_row_count THEN false ELSE x_min <= 5 AND 5 <= x_max END` -/// `x < 5` | `CASE WHEN x_null_count = x_row_count THEN false ELSE x_max < 5 END` -/// `x = 5 AND y = 10` | `CASE WHEN x_null_count = x_row_count THEN false ELSE x_min <= 5 AND 5 <= x_max END AND CASE WHEN y_null_count = y_row_count THEN false ELSE y_min <= 10 AND 10 <= y_max END` +/// `x = 5` | `x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max)` +/// `x < 5` | `x_null_count != x_row_count AND (x_max < 5)` +/// `x = 5 AND y = 10` | `x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max) AND y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max)` /// `x IS NULL` | `x_null_count > 0` /// `x IS NOT NULL` | `x_null_count != row_count` -/// `CAST(x as int) = 5` | `CASE WHEN x_null_count = x_row_count THEN false ELSE CAST(x_min as int) <= 5 AND 5 <= CAST(x_max as int) END` +/// `CAST(x as int) = 5` | `x_null_count != x_row_count AND (CAST(x_min as int) <= 5 AND 5 <= CAST(x_max as int))` /// /// ## Predicate Evaluation /// The PruningPredicate works in two passes /// @@ -355,15 +360,9 @@ pub trait PruningStatistics { /// Given the predicate, `x = 5 AND y = 10`, the rewritten predicate would look like: /// /// ```sql -/// CASE -/// WHEN x_null_count = x_row_count THEN false -/// ELSE x_min <= 5 AND 5 <= x_max -/// END +/// x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max) /// AND -/// CASE -/// WHEN y_null_count = y_row_count THEN false -/// ELSE y_min <= 10 AND 10 <= y_max -/// END +/// y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max) /// ``` /// /// If we know that for a given container, `x` is between `1 and 100` and we know that /// @@ -384,16 +383,22 @@ pub trait PruningStatistics {
/// When these statistics values are substituted in to the rewritten predicate and /// simplified, the result is `false`: /// -/// * `CASE WHEN null = null THEN false ELSE 1 <= 5 AND 5 <= 100 END AND CASE WHEN null = null THEN false ELSE 4 <= 10 AND 10 <= 7 END` -/// * `null = null` is `null` which is not true, so the `CASE` expression will use the `ELSE` clause -/// * `1 <= 5 AND 5 <= 100 AND 4 <= 10 AND 10 <= 7` -/// * `true AND true AND true AND false` +/// * `null != null AND (1 <= 5 AND 5 <= 100) AND null != null AND (4 <= 10 AND 10 <= 7)` +/// * `null != null` is `null`, which is not `true`, so the `AND` moves on to the next clause +/// * `null AND (1 <= 5 AND 5 <= 100) AND null AND (4 <= 10 AND 10 <= 7)` +/// * evaluating the clauses further we get: +/// * `null AND true AND null AND false` +/// * `null AND false` /// * `false` /// /// Returning `false` means the container can be pruned, which matches the /// intuition that `x = 5 AND y = 10` can’t be true for any row if all values of `y` /// are `7` or less. /// +/// Note that if we had ended up with `null AND true AND null AND true` the result +/// would have been `null`. +/// `null` is treated the same as `true`, because we can't prove that the predicate is `false`. +/// /// If, for some other container, we knew `y` was between the values `4` and /// `15`, then the rewritten predicate evaluates to `true` (verifying this is /// left as an exercise to the reader -- are you still here?), and the container @@ -408,15 +413,9 @@ pub trait PruningStatistics { /// look like the same as example 1: /// /// ```sql -/// CASE -/// WHEN x_null_count = x_row_count THEN false -/// ELSE x_min <= 5 AND 5 <= x_max -/// END +/// x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max) /// AND -/// CASE -/// WHEN y_null_count = y_row_count THEN false -/// ELSE y_min <= 10 AND 10 <= y_max -/// END +/// y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max) /// ``` /// /// If we know that for another given container, `x_min` is NULL and `x_max` is /// @@ -438,14 +437,13 @@ pub trait PruningStatistics { /// When these statistics values are substituted in to the rewritten predicate and /// simplified, the result is `false`: /// -/// * `CASE WHEN 100 = 100 THEN false ELSE null <= 5 AND 5 <= null END AND CASE WHEN null = null THEN false ELSE 4 <= 10 AND 10 <= 7 END` -/// * Since `100 = 100` is `true`, the `CASE` expression will use the `THEN` clause, i.e. `false` -/// * The other `CASE` expression will use the `ELSE` clause, i.e. `4 <= 10 AND 10 <= 7` -/// * `false AND true` +/// * `100 != 100 AND (null <= 5 AND 5 <= null) AND null != null AND (4 <= 10 AND 10 <= 7)` +/// * `false AND null AND null AND false` +/// * `false AND false` /// * `false` /// /// Returning `false` means the container can be pruned, which matches the -/// intuition that `x = 5 AND y = 10` can’t be true for all values in `x` +/// intuition that `x = 5 AND y = 10` can’t be true because all values in `x` /// are known to be NULL. /// /// # Related Work @@ -1606,13 +1604,15 @@ fn build_statistics_expr( ); } }; - let statistics_expr = wrap_case_expr(statistics_expr, expr_builder)?; + let statistics_expr = wrap_null_count_check_expr(statistics_expr, expr_builder)?; Ok(statistics_expr) } -/// Wrap the statistics expression in a case expression. -/// This is necessary to handle the case where the column is known -/// to be all nulls. +/// Wrap the statistics expression in a check that skips the expression if the column is all nulls.
+/// This is important not only as an optimization but also because statistics may not be +/// accurate for columns that are all nulls. +/// For example, for an `int` column `x` with all nulls, the min/max/null_count statistics +/// might be set to 0 and evaluating `x = 0` would incorrectly include the column. /// /// For example: /// @@ -1621,33 +1621,29 @@ fn build_statistics_expr( /// will become /// /// ```sql -/// CASE -/// WHEN x_null_count = x_row_count THEN false -/// ELSE x_min <= 10 AND 10 <= x_max -/// END +/// NOT (x_null_count = x_row_count) AND (x_min <= 10 AND 10 <= x_max) /// ```` /// /// If the column is known to be all nulls, then the expression /// `x_null_count = x_row_count` will be true, which will cause the -/// case expression to return false. Therefore, prune out the container. -fn wrap_case_expr( +/// boolean expression to return false. Therefore, prune out the container. +fn wrap_null_count_check_expr( statistics_expr: Arc<dyn PhysicalExpr>, expr_builder: &mut PruningExpressionBuilder, ) -> Result<Arc<dyn PhysicalExpr>> { // x_null_count = x_row_count - let when_null_count_eq_row_count = Arc::new(phys_expr::BinaryExpr::new( + let not_when_null_count_eq_row_count = Arc::new(phys_expr::BinaryExpr::new( expr_builder.null_count_column_expr()?, - Operator::Eq, + Operator::NotEq, expr_builder.row_count_column_expr()?, )); - let then = Arc::new(phys_expr::Literal::new(ScalarValue::Boolean(Some(false)))); - - // CASE WHEN x_null_count = x_row_count THEN false ELSE <statistics_expr> END - Ok(Arc::new(phys_expr::CaseExpr::try_new( - None, - vec![(when_null_count_eq_row_count, then)], - Some(statistics_expr), - )?)) + + // (x_null_count != x_row_count) AND (<statistics_expr>) + Ok(Arc::new(phys_expr::BinaryExpr::new( + not_when_null_count_eq_row_count, + Operator::And, + statistics_expr, + ))) } #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -2056,6 +2052,110 @@ mod tests { } } + #[test] + fn prune_all_rows_null_counts() { + // if null_count = row_count then we should prune the container for i = 0 + // regardless of the statistics + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let statistics = TestStatistics::new().with( + "i", + ContainerStats::new_i32( + vec![Some(0)], // min + vec![Some(0)], // max + ) + .with_null_counts(vec![Some(1)]) + .with_row_counts(vec![Some(1)]), + ); + let expected_ret = &[false]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + + // this should be true even if the container stats are missing + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let container_stats = ContainerStats { + min: Some(Arc::new(Int32Array::from(vec![None]))), + max: Some(Arc::new(Int32Array::from(vec![None]))), + null_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), + row_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), + ..ContainerStats::default() + }; + let statistics = TestStatistics::new().with("i", container_stats); + let expected_ret = &[false]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + + // If the null counts themselves are missing we should be able to fall back to the stats + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let container_stats = ContainerStats { + min: Some(Arc::new(Int32Array::from(vec![Some(0)]))), + max: Some(Arc::new(Int32Array::from(vec![Some(0)]))), + null_counts: Some(Arc::new(UInt64Array::from(vec![None]))), + row_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), + ..ContainerStats::default() + }; + let statistics
= TestStatistics::new().with("i", container_stats); + let expected_ret = &[true]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + let expected_ret = &[false]; + prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); + + // Same for the row counts + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let container_stats = ContainerStats { + min: Some(Arc::new(Int32Array::from(vec![Some(0)]))), + max: Some(Arc::new(Int32Array::from(vec![Some(0)]))), + null_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), + row_counts: Some(Arc::new(UInt64Array::from(vec![None]))), + ..ContainerStats::default() + }; + let statistics = TestStatistics::new().with("i", container_stats); + let expected_ret = &[true]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + let expected_ret = &[false]; + prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); + } + + #[test] + fn prune_missing_statistics() { + // If the min or max stats are missing we should not prune + // (unless we know all rows are null, see `prune_all_rows_null_counts`) + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let container_stats = ContainerStats { + min: Some(Arc::new(Int32Array::from(vec![None, Some(0)]))), + max: Some(Arc::new(Int32Array::from(vec![Some(0), None]))), + null_counts: Some(Arc::new(UInt64Array::from(vec![Some(0), Some(0)]))), + row_counts: Some(Arc::new(UInt64Array::from(vec![Some(1), Some(1)]))), + ..ContainerStats::default() + }; + let statistics = TestStatistics::new().with("i", container_stats); + let expected_ret = &[true, true]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + let expected_ret = &[false, true]; + prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); + let expected_ret = &[true, false]; + prune_with_expr(col("i").lt(lit(0)), &schema, &statistics, expected_ret); + } + + #[test] + fn prune_null_stats() { + // if null_count = row_count then we should prune the container for i = 0 + // regardless of the statistics + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + + let statistics = TestStatistics::new().with( + "i", + ContainerStats::new_i32( + vec![Some(0)], // min + vec![Some(0)], // max + ) + .with_null_counts(vec![Some(1)]) + .with_row_counts(vec![Some(1)]), + ); + + let expected_ret = &[false]; + + // i = 0 + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + } + #[test] fn test_build_statistics_record_batch() { // Request a record batch with of s1_min, s2_max, s3_max, s3_min @@ -2237,7 +2337,8 @@ mod tests { #[test] fn row_group_predicate_eq() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "CASE WHEN c1_null_count@2 = c1_row_count@3 THEN false ELSE c1_min@0 <= 1 AND 1 <= c1_max@1 END"; + let expected_expr = + "c1_null_count@2 != c1_row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1"; // test column on the left let expr = col("c1").eq(lit(1)); @@ -2257,7 +2358,8 @@ mod tests { #[test] fn row_group_predicate_not_eq() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "CASE WHEN c1_null_count@2 = c1_row_count@3 THEN false ELSE c1_min@0 != 1 OR 1 != c1_max@1 END"; + let expected_expr = + "c1_null_count@2 != c1_row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1)"; // test column on the left let expr = 
col("c1").not_eq(lit(1)); @@ -2277,8 +2379,7 @@ mod tests { #[test] fn row_group_predicate_gt() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = - "CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_max@0 > 1 END"; + let expected_expr = "c1_null_count@1 != c1_row_count@2 AND c1_max@0 > 1"; // test column on the left let expr = col("c1").gt(lit(1)); @@ -2298,7 +2399,7 @@ mod tests { #[test] fn row_group_predicate_gt_eq() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_max@0 >= 1 END"; + let expected_expr = "c1_null_count@1 != c1_row_count@2 AND c1_max@0 >= 1"; // test column on the left let expr = col("c1").gt_eq(lit(1)); @@ -2317,8 +2418,7 @@ mod tests { #[test] fn row_group_predicate_lt() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = - "CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_min@0 < 1 END"; + let expected_expr = "c1_null_count@1 != c1_row_count@2 AND c1_min@0 < 1"; // test column on the left let expr = col("c1").lt(lit(1)); @@ -2338,7 +2438,7 @@ mod tests { #[test] fn row_group_predicate_lt_eq() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_min@0 <= 1 END"; + let expected_expr = "c1_null_count@1 != c1_row_count@2 AND c1_min@0 <= 1"; // test column on the left let expr = col("c1").lt_eq(lit(1)); @@ -2363,8 +2463,7 @@ mod tests { ]); // test AND operator joining supported c1 < 1 expression and unsupported c2 > c3 expression let expr = col("c1").lt(lit(1)).and(col("c2").lt(col("c3"))); - let expected_expr = - "CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_min@0 < 1 END"; + let expected_expr = "c1_null_count@1 != c1_row_count@2 AND c1_min@0 < 1"; let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); @@ -2430,7 +2529,7 @@ mod tests { #[test] fn row_group_predicate_lt_bool() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); - let expected_expr = "CASE WHEN c1_null_count@1 = c1_row_count@2 THEN false ELSE c1_min@0 < true END"; + let expected_expr = "c1_null_count@1 != c1_row_count@2 AND c1_min@0 < true"; // DF doesn't support arithmetic on boolean columns so // this predicate will error when evaluated @@ -2453,20 +2552,11 @@ mod tests { let expr = col("c1") .lt(lit(1)) .and(col("c2").eq(lit(2)).or(col("c2").eq(lit(3)))); - let expected_expr = "\ - CASE \ - WHEN c1_null_count@1 = c1_row_count@2 THEN false \ - ELSE c1_min@0 < 1 \ - END \ - AND (\ - CASE \ - WHEN c2_null_count@5 = c2_row_count@6 THEN false \ - ELSE c2_min@3 <= 2 AND 2 <= c2_max@4 \ - END \ - OR CASE \ - WHEN c2_null_count@5 = c2_row_count@6 THEN false \ - ELSE c2_min@3 <= 3 AND 3 <= c2_max@4 \ - END\ + let expected_expr = "c1_null_count@1 != c1_row_count@2 \ + AND c1_min@0 < 1 AND (\ + c2_null_count@5 != c2_row_count@6 \ + AND c2_min@3 <= 2 AND 2 <= c2_max@4 OR \ + c2_null_count@5 != c2_row_count@6 AND c2_min@3 <= 3 AND 3 <= c2_max@4\ )"; let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut required_columns); @@ -2558,18 +2648,7 @@ mod tests { vec![lit(1), lit(2), lit(3)], false, )); - let expected_expr = "CASE \ - WHEN c1_null_count@2 = 
c1_row_count@3 THEN false \ - ELSE c1_min@0 <= 1 AND 1 <= c1_max@1 \ - END \ - OR CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE c1_min@0 <= 2 AND 2 <= c1_max@1 \ - END \ - OR CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE c1_min@0 <= 3 AND 3 <= c1_max@1 \ - END"; + let expected_expr = "c1_null_count@2 != c1_row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1 OR c1_null_count@2 != c1_row_count@3 AND c1_min@0 <= 2 AND 2 <= c1_max@1 OR c1_null_count@2 != c1_row_count@3 AND c1_min@0 <= 3 AND 3 <= c1_max@1"; let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); @@ -2605,19 +2684,7 @@ mod tests { vec![lit(1), lit(2), lit(3)], true, )); - let expected_expr = "\ - CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE c1_min@0 != 1 OR 1 != c1_max@1 \ - END \ - AND CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE c1_min@0 != 2 OR 2 != c1_max@1 \ - END \ - AND CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE c1_min@0 != 3 OR 3 != c1_max@1 \ - END"; + let expected_expr = "c1_null_count@2 != c1_row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1) AND c1_null_count@2 != c1_row_count@3 AND (c1_min@0 != 2 OR 2 != c1_max@1) AND c1_null_count@2 != c1_row_count@3 AND (c1_min@0 != 3 OR 3 != c1_max@1)"; let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); @@ -2663,24 +2730,7 @@ mod tests { // test c1 in(1, 2) and c2 BETWEEN 4 AND 5 let expr3 = expr1.and(expr2); - let expected_expr = "\ - (\ - CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE c1_min@0 <= 1 AND 1 <= c1_max@1 \ - END \ - OR CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE c1_min@0 <= 2 AND 2 <= c1_max@1 \ - END\ - ) AND CASE \ - WHEN c2_null_count@5 = c2_row_count@6 THEN false \ - ELSE c2_max@4 >= 4 \ - END \ - AND CASE \ - WHEN c2_null_count@5 = c2_row_count@6 THEN false \ - ELSE c2_min@7 <= 5 \ - END"; + let expected_expr = "(c1_null_count@2 != c1_row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1 OR c1_null_count@2 != c1_row_count@3 AND c1_min@0 <= 2 AND 2 <= c1_max@1) AND c2_null_count@5 != c2_row_count@6 AND c2_max@4 >= 4 AND c2_null_count@5 != c2_row_count@6 AND c2_min@7 <= 5"; let predicate_expr = test_build_predicate_expression(&expr3, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); @@ -2707,10 +2757,7 @@ mod tests { #[test] fn row_group_predicate_cast() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64) \ - END"; + let expected_expr = "c1_null_count@2 != c1_row_count@3 AND CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64)"; // test cast(c1 as int64) = 1 // test column on the left @@ -2725,10 +2772,8 @@ mod tests { test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); - let expected_expr = "CASE \ - WHEN c1_null_count@1 = c1_row_count@2 THEN false \ - ELSE TRY_CAST(c1_max@0 AS Int64) > 1 \ - END"; + let expected_expr = + "c1_null_count@1 != c1_row_count@2 AND TRY_CAST(c1_max@0 AS Int64) > 1"; // test column on the left let expr = @@ -2760,18 +2805,7 @@ mod tests { ], false, )); - let expected_expr = "CASE \ 
- WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64) \ - END \ - OR CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE CAST(c1_min@0 AS Int64) <= 2 AND 2 <= CAST(c1_max@1 AS Int64) \ - END \ - OR CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE CAST(c1_min@0 AS Int64) <= 3 AND 3 <= CAST(c1_max@1 AS Int64) \ - END"; + let expected_expr = "c1_null_count@2 != c1_row_count@3 AND CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64) OR c1_null_count@2 != c1_row_count@3 AND CAST(c1_min@0 AS Int64) <= 2 AND 2 <= CAST(c1_max@1 AS Int64) OR c1_null_count@2 != c1_row_count@3 AND CAST(c1_min@0 AS Int64) <= 3 AND 3 <= CAST(c1_max@1 AS Int64)"; let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); @@ -2785,18 +2819,7 @@ mod tests { ], true, )); - let expected_expr = "CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE CAST(c1_min@0 AS Int64) != 1 OR 1 != CAST(c1_max@1 AS Int64) \ - END \ - AND CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE CAST(c1_min@0 AS Int64) != 2 OR 2 != CAST(c1_max@1 AS Int64) \ - END \ - AND CASE \ - WHEN c1_null_count@2 = c1_row_count@3 THEN false \ - ELSE CAST(c1_min@0 AS Int64) != 3 OR 3 != CAST(c1_max@1 AS Int64) \ - END"; + let expected_expr = "c1_null_count@2 != c1_row_count@3 AND (CAST(c1_min@0 AS Int64) != 1 OR 1 != CAST(c1_max@1 AS Int64)) AND c1_null_count@2 != c1_row_count@3 AND (CAST(c1_min@0 AS Int64) != 2 OR 2 != CAST(c1_max@1 AS Int64)) AND c1_null_count@2 != c1_row_count@3 AND (CAST(c1_min@0 AS Int64) != 3 OR 3 != CAST(c1_max@1 AS Int64))"; let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index 24ffb963bbe2..806886b07170 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -85,7 +85,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST] 02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], predicate=b@1 > 2, pruning_predicate=CASE WHEN b_null_count@1 = b_row_count@2 THEN false ELSE b_max@0 > 2 END, required_guarantees=[] +03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], predicate=b@1 > 2, pruning_predicate=b_null_count@1 != b_row_count@2 AND b_max@0 > 2, required_guarantees=[] # When filter pushdown *is* enabled, ParquetExec can filter exactly, @@ -113,7 +113,7 @@ physical_plan 03)----CoalesceBatchesExec: target_batch_size=8192 04)------FilterExec: b@1 > 2, projection=[a@0] 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 -06)----------ParquetExec: file_groups={2 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], predicate=b@1 > 2, pruning_predicate=CASE WHEN b_null_count@1 = b_row_count@2 THEN false ELSE b_max@0 > 2 END, required_guarantees=[] +06)----------ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], predicate=b@1 > 2, pruning_predicate=b_null_count@1 != b_row_count@2 AND b_max@0 > 2, required_guarantees=[] # also test querying on columns that are not in all the files query T @@ -131,7 +131,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST] 02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], predicate=b@1 > 2 AND a@0 IS NOT NULL, pruning_predicate=CASE WHEN b_null_count@1 = b_row_count@2 THEN false ELSE b_max@0 > 2 END AND a_null_count@4 != a_row_count@3, required_guarantees=[] +03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], predicate=b@1 > 2 AND a@0 IS NOT NULL, pruning_predicate=b_null_count@1 != b_row_count@2 AND b_max@0 > 2 AND a_null_count@4 != a_row_count@3, required_guarantees=[] query I @@ -148,7 +148,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [b@0 ASC NULLS LAST] 02)--SortExec: expr=[b@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[b], predicate=a@0 = bar, pruning_predicate=CASE WHEN a_null_count@2 = a_row_count@3 THEN false ELSE a_min@0 <= bar AND bar <= a_max@1 END, required_guarantees=[a in (bar)] +03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[b], predicate=a@0 = bar, pruning_predicate=a_null_count@2 != a_row_count@3 AND a_min@0 <= bar AND bar <= a_max@1, required_guarantees=[a in (bar)] ## cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 858e42106221..a1db84b87850 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -61,7 +61,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----ParquetExec: file_groups={4 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != column1_row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # disable round robin repartitioning statement ok @@ -77,7 +77,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != column1_row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # enable round robin repartitioning again statement ok @@ -102,7 +102,7 @@ physical_plan 02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=8192 04)------FilterExec: column1@0 != 42 -05)--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:174..342, 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..180], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:180..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +05)--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:174..342, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..180], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:180..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != column1_row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] ## Read the files as though they are ordered @@ -138,7 +138,7 @@ physical_plan 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: column1@0 != 42 -04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..171], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..175], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:175..351], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:171..342]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..171], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..175], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:175..351], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:171..342]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != column1_row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # Cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 4a2d9e1d6864..a19d3f2cbecb 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -5046,3 +5046,84 @@ statement ok DROP TABLE t1; ## end test handle NULL of lead + +# row_number on bool column +query II +select b, row_number() over (order by a) from (select TRUE as a, 1 as b); +---- +1 1 + +# 
test window functions on boolean columns +statement count 0 +create table t1 (id int, bool_col boolean) as values + (1, true), + (2, false), + (3, true), + (4, false); + +query I +select row_number() over (order by bool_col) from t1; +---- +1 +2 +3 +4 + +query I +select rank() over (order by bool_col) from t1; +---- +1 +1 +3 +3 + +query I +select dense_rank() over (order by bool_col) from t1; +---- +1 +1 +2 +2 + +query R +select cume_dist() over (order by bool_col) from t1; +---- +0.5 +0.5 +1 +1 + +query R +select percent_rank() over (order by bool_col) from t1; +---- +0 +0 +0.666666666667 +0.666666666667 + +query I +select ntile(2) over (order by bool_col) from t1; +---- +1 +1 +2 +2 + +query IIIRRI +select + row_number() over (order by bool_col) as row_num, + rank() over (order by bool_col) as rank, + dense_rank() over (order by bool_col) as dense_rank, + cume_dist() over (order by bool_col) as cume_dist, + percent_rank() over (order by bool_col) as percent_rank, + ntile(2) over (order by bool_col) as ntile +from t1 +order by id; +---- +3 3 2 1 0.666666666667 2 +1 1 1 0.5 0 1 +4 3 2 1 0.666666666667 2 +2 1 1 0.5 0 1 + +statement ok +drop table t1;
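
Note (illustrative, not part of the patch): the doc-comment walk-throughs above depend on SQL three-valued (Kleene) logic — `false AND NULL` is `false`, `true AND NULL` is `NULL` — and `PruningPredicate` only prunes a container when the rewritten predicate is definitively `false`. The following self-contained Rust sketch replays Example 2 from the `pruning.rs` docs (all of `x` is null, `y`'s null/row counts are unknown); the helper names `kleene_and`, `lte`, and `neq` are invented for the example and are not DataFusion APIs.

```rust
/// Kleene (SQL) three-valued AND: `None` plays the role of SQL NULL ("unknown").
fn kleene_and(a: Option<bool>, b: Option<bool>) -> Option<bool> {
    match (a, b) {
        (Some(false), _) | (_, Some(false)) => Some(false),
        (Some(true), Some(true)) => Some(true),
        _ => None,
    }
}

/// NULL-propagating comparisons, mirroring SQL semantics.
fn lte(a: Option<i64>, b: Option<i64>) -> Option<bool> {
    Some(a? <= b?)
}
fn neq(a: Option<i64>, b: Option<i64>) -> Option<bool> {
    Some(a? != b?)
}

fn main() {
    // Example 2: x is all nulls (null_count == row_count == 100, min/max unknown),
    // y has min = 4 and max = 7 but unknown null/row counts.
    let (x_null_count, x_row_count) = (Some(100), Some(100));
    let (x_min, x_max) = (None, None);
    let (y_null_count, y_row_count) = (None, None);
    let (y_min, y_max) = (Some(4), Some(7));

    // x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max)
    let x_term = kleene_and(
        neq(x_null_count, x_row_count),
        kleene_and(lte(x_min, Some(5)), lte(Some(5), x_max)),
    );
    // y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max)
    let y_term = kleene_and(
        neq(y_null_count, y_row_count),
        kleene_and(lte(y_min, Some(10)), lte(Some(10), y_max)),
    );

    let rewritten = kleene_and(x_term, y_term);
    assert_eq!(x_term, Some(false)); // 100 != 100 is false, so the whole x term is false
    assert_eq!(y_term, Some(false)); // 10 <= 7 is false, so the whole y term is false
    assert_eq!(rewritten, Some(false));

    // Only a definite `false` allows pruning; `true` and NULL both keep the container.
    let keep_container = rewritten.unwrap_or(true);
    assert!(!keep_container); // this container can be pruned
}
```

Because the result is a definite `false`, the container can be pruned; if the statistics had instead left both terms `NULL`, the overall result would have been `NULL` and the container would be kept.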
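
Note (illustrative, not part of the patch): the updated sqllogictest plans show rewritten predicates such as `b_null_count@1 != b_row_count@2 AND b_max@0 > 2`, evaluated once per container. The sketch below assumes only the `arrow` crate; the two input arrays are hand-built stand-ins for the per-container results of the null-count check and the `b_max > 2` check (they are not produced by DataFusion here), and show how Arrow's Kleene `and_kleene` kernel combines them and why a `NULL` result must be treated as "keep the container".

```rust
use arrow::array::BooleanArray;
use arrow::compute::kernels::boolean::and_kleene;
use arrow::error::ArrowError;

fn main() -> Result<(), ArrowError> {
    // Hypothetical per-container results of `b_null_count != b_row_count`:
    // container 0: some rows non-null, 1: all rows null, 2 and 3: counts unknown (NULL).
    let not_all_null = BooleanArray::from(vec![Some(true), Some(false), None, None]);
    // Hypothetical per-container results of `b_max > 2`.
    let max_check = BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(false)]);

    // Kleene AND: NULL AND false is false, NULL AND true is NULL,
    // which is what makes the `!=`-based rewrite safe.
    let combined = and_kleene(&not_all_null, &max_check)?;
    let results: Vec<Option<bool>> = combined.iter().collect();
    assert_eq!(results, vec![Some(true), Some(false), None, Some(false)]);

    // A container is kept unless the result is definitively false,
    // so containers 0 and 2 are kept and containers 1 and 3 are pruned.
    let keep: Vec<bool> = results.iter().map(|v| v.unwrap_or(true)).collect();
    assert_eq!(keep, vec![true, false, true, false]);
    Ok(())
}
```

This mirrors the behavior exercised by the new `prune_all_rows_null_counts` and `prune_missing_statistics` tests: missing null/row counts or min/max values yield `NULL` (keep), while an all-null column yields `false` (prune).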