From 7d6cd13d55d7b63c74a2cb058a55293c2dcd9797 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 11 Aug 2024 07:11:45 -0700 Subject: [PATCH] fix: throw error on sub-day generate_series increments (#11907) * fix: throw error on sub-day generate_series increments * refactor: avoid `loop` * Add a few more tests * Update datafusion/functions-nested/src/range.rs Co-authored-by: Andrew Lamb * refactor: tweak from feedback * fix: fix dup rows --------- Co-authored-by: Andrew Lamb --- datafusion/functions-nested/src/range.rs | 76 ++++++++++++-------- datafusion/sqllogictest/test_files/array.slt | 18 ++++- 2 files changed, 61 insertions(+), 33 deletions(-) diff --git a/datafusion/functions-nested/src/range.rs b/datafusion/functions-nested/src/range.rs index 269eaa560230..5b7315719631 100644 --- a/datafusion/functions-nested/src/range.rs +++ b/datafusion/functions-nested/src/range.rs @@ -18,13 +18,11 @@ //! [`ScalarUDFImpl`] definitions for range and gen_series functions. use crate::utils::make_scalar_function; -use arrow::array::{Array, ArrayRef, Int64Array, ListArray}; +use arrow::array::{Array, ArrayRef, Date32Builder, Int64Array, ListArray, ListBuilder}; use arrow::datatypes::{DataType, Field}; use arrow_array::types::{Date32Type, IntervalMonthDayNanoType}; -use arrow_array::{Date32Array, NullArray}; -use arrow_buffer::{ - BooleanBufferBuilder, IntervalMonthDayNano, NullBuffer, OffsetBuffer, -}; +use arrow_array::NullArray; +use arrow_buffer::{BooleanBufferBuilder, NullBuffer, OffsetBuffer}; use arrow_schema::DataType::{Date32, Int64, Interval, List}; use arrow_schema::IntervalUnit::MonthDayNano; use datafusion_common::cast::{as_date32_array, as_int64_array, as_interval_mdn_array}; @@ -33,6 +31,7 @@ use datafusion_expr::{ ColumnarValue, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; use std::any::Any; +use std::iter::from_fn; use std::sync::Arc; make_udf_expr_and_func!( @@ -166,8 +165,11 @@ impl ScalarUDFImpl for GenSeries { match args[0].data_type() { Int64 => make_scalar_function(|args| gen_range_inner(args, true))(args), Date32 => make_scalar_function(|args| gen_range_date(args, true))(args), - _ => { - exec_err!("unsupported type for range") + dt => { + exec_err!( + "unsupported type for range. Expected Int64 or Date32, got: {}", + dt + ) } } } @@ -311,39 +313,53 @@ fn gen_range_date(args: &[ArrayRef], include_upper: bool) -> Result { Some(as_interval_mdn_array(&args[2])?), ); - let mut values = vec![]; - let mut offsets = vec![0]; + // values are date32s + let values_builder = Date32Builder::new(); + let mut list_builder = ListBuilder::new(values_builder); + for (idx, stop) in stop_array.iter().enumerate() { let mut stop = stop.unwrap_or(0); - let start = start_array.as_ref().map(|x| x.value(idx)).unwrap_or(0); - let step = step_array.as_ref().map(|arr| arr.value(idx)).unwrap_or( - IntervalMonthDayNano { - months: 0, - days: 0, - nanoseconds: 1, - }, - ); + + let start = if let Some(start_array_values) = start_array { + start_array_values.value(idx) + } else { + list_builder.append_null(); + continue; + }; + + let step = if let Some(step) = step_array { + step.value(idx) + } else { + list_builder.append_null(); + continue; + }; + let (months, days, _) = IntervalMonthDayNanoType::to_parts(step); + + if months == 0 && days == 0 { + return exec_err!("Cannot generate date range less than 1 day."); + } + let neg = months < 0 || days < 0; if !include_upper { stop = Date32Type::subtract_month_day_nano(stop, step); } let mut new_date = start; - loop { - if neg && new_date < stop || !neg && new_date > stop { - break; + + let values = from_fn(|| { + if (neg && new_date < stop) || (!neg && new_date > stop) { + None + } else { + let current_date = new_date; + new_date = Date32Type::add_month_day_nano(new_date, step); + Some(Some(current_date)) } - values.push(new_date); - new_date = Date32Type::add_month_day_nano(new_date, step); - } - offsets.push(values.len() as i32); + }); + + list_builder.append_value(values); } - let arr = Arc::new(ListArray::try_new( - Arc::new(Field::new("item", Date32, true)), - OffsetBuffer::new(offsets.into()), - Arc::new(Date32Array::from(values)), - None, - )?); + let arr = Arc::new(list_builder.finish()); + Ok(arr) } diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 9e34db8f8dc2..b97ecced57e3 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -1971,7 +1971,7 @@ select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), co # `from` may be larger than `to` and `stride` is positive query ???? select array_slice(a, -1, 2, 1), array_slice(a, -1, 2), - array_slice(a, 3, 2, 1), array_slice(a, 3, 2) + array_slice(a, 3, 2, 1), array_slice(a, 3, 2) from (values ([1.0, 2.0, 3.0, 3.0]), ([4.0, 5.0, 3.0]), ([6.0])) t(a); ---- [] [] [] [] @@ -5711,7 +5711,7 @@ select # Test range for other edge cases query ???????? -select +select range(9223372036854775807, 9223372036854775807, -1) as c1, range(9223372036854775807, 9223372036854775806, -1) as c2, range(9223372036854775807, 9223372036854775807, 1) as c3, @@ -5787,6 +5787,9 @@ select range(DATE '1993-03-01', DATE '1989-04-01', INTERVAL '1' YEAR) ---- [] +query error DataFusion error: Execution error: Cannot generate date range less than 1 day\. +select range(DATE '1993-03-01', DATE '1993-03-01', INTERVAL '1' HOUR) + query ????????? select generate_series(5), generate_series(2, 5), @@ -5801,6 +5804,9 @@ select generate_series(5), ---- [0, 1, 2, 3, 4, 5] [2, 3, 4, 5] [2, 5, 8] [1, 2, 3, 4, 5] [5, 4, 3, 2, 1] [10, 7, 4] [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01, 1993-03-01] [1993-02-01, 1993-01-31, 1993-01-30, 1993-01-29, 1993-01-28, 1993-01-27, 1993-01-26, 1993-01-25, 1993-01-24, 1993-01-23, 1993-01-22, 1993-01-21, 1993-01-20, 1993-01-19, 1993-01-18, 1993-01-17, 1993-01-16, 1993-01-15, 1993-01-14, 1993-01-13, 1993-01-12, 1993-01-11, 1993-01-10, 1993-01-09, 1993-01-08, 1993-01-07, 1993-01-06, 1993-01-05, 1993-01-04, 1993-01-03, 1993-01-02, 1993-01-01] [1989-04-01, 1990-04-01, 1991-04-01, 1992-04-01] +query error DataFusion error: Execution error: unsupported type for range. Expected Int64 or Date32, got: Timestamp\(Nanosecond, None\) +select generate_series('2021-01-01'::timestamp, '2021-01-02'::timestamp, INTERVAL '1' HOUR); + ## should return NULL query ? select generate_series(DATE '1992-09-01', NULL, INTERVAL '1' YEAR); @@ -5832,6 +5838,12 @@ select generate_series(DATE '1993-03-01', DATE '1989-04-01', INTERVAL '1' YEAR) ---- [] +query error DataFusion error: Execution error: Cannot generate date range less than 1 day. +select generate_series(DATE '2000-01-01', DATE '2000-01-03', INTERVAL '1' HOUR) + +query error DataFusion error: Execution error: Cannot generate date range less than 1 day. +select generate_series(DATE '2000-01-01', DATE '2000-01-03', INTERVAL '-1' HOUR) + # Test generate_series with zero step query error DataFusion error: Execution error: step can't be 0 for function generate_series\(start \[, stop, step\]\) select generate_series(1, 1, 0); @@ -5849,7 +5861,7 @@ select # Test generate_series for other edge cases query ???? -select +select generate_series(9223372036854775807, 9223372036854775807, -1) as c1, generate_series(9223372036854775807, 9223372036854775807, 1) as c2, generate_series(-9223372036854775808, -9223372036854775808, -1) as c3,