Skip to content

Commit

Permalink
fix: throw error on sub-day generate_series increments (apache#11907)
Browse files Browse the repository at this point in the history
* fix: throw error on sub-day generate_series increments

* refactor: avoid `loop`

* Add a few more tests

* Update datafusion/functions-nested/src/range.rs

Co-authored-by: Andrew Lamb <[email protected]>

* refactor: tweak from feedback

* fix: fix dup rows

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
tshauck and alamb authored Aug 11, 2024
1 parent 64a9280 commit 7d6cd13
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 33 deletions.
76 changes: 46 additions & 30 deletions datafusion/functions-nested/src/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@
//! [`ScalarUDFImpl`] definitions for range and gen_series functions.
use crate::utils::make_scalar_function;
use arrow::array::{Array, ArrayRef, Int64Array, ListArray};
use arrow::array::{Array, ArrayRef, Date32Builder, Int64Array, ListArray, ListBuilder};
use arrow::datatypes::{DataType, Field};
use arrow_array::types::{Date32Type, IntervalMonthDayNanoType};
use arrow_array::{Date32Array, NullArray};
use arrow_buffer::{
BooleanBufferBuilder, IntervalMonthDayNano, NullBuffer, OffsetBuffer,
};
use arrow_array::NullArray;
use arrow_buffer::{BooleanBufferBuilder, NullBuffer, OffsetBuffer};
use arrow_schema::DataType::{Date32, Int64, Interval, List};
use arrow_schema::IntervalUnit::MonthDayNano;
use datafusion_common::cast::{as_date32_array, as_int64_array, as_interval_mdn_array};
Expand All @@ -33,6 +31,7 @@ use datafusion_expr::{
ColumnarValue, ScalarUDFImpl, Signature, TypeSignature, Volatility,
};
use std::any::Any;
use std::iter::from_fn;
use std::sync::Arc;

make_udf_expr_and_func!(
Expand Down Expand Up @@ -166,8 +165,11 @@ impl ScalarUDFImpl for GenSeries {
match args[0].data_type() {
Int64 => make_scalar_function(|args| gen_range_inner(args, true))(args),
Date32 => make_scalar_function(|args| gen_range_date(args, true))(args),
_ => {
exec_err!("unsupported type for range")
dt => {
exec_err!(
"unsupported type for range. Expected Int64 or Date32, got: {}",
dt
)
}
}
}
Expand Down Expand Up @@ -311,39 +313,53 @@ fn gen_range_date(args: &[ArrayRef], include_upper: bool) -> Result<ArrayRef> {
Some(as_interval_mdn_array(&args[2])?),
);

let mut values = vec![];
let mut offsets = vec![0];
// values are date32s
let values_builder = Date32Builder::new();
let mut list_builder = ListBuilder::new(values_builder);

for (idx, stop) in stop_array.iter().enumerate() {
let mut stop = stop.unwrap_or(0);
let start = start_array.as_ref().map(|x| x.value(idx)).unwrap_or(0);
let step = step_array.as_ref().map(|arr| arr.value(idx)).unwrap_or(
IntervalMonthDayNano {
months: 0,
days: 0,
nanoseconds: 1,
},
);

let start = if let Some(start_array_values) = start_array {
start_array_values.value(idx)
} else {
list_builder.append_null();
continue;
};

let step = if let Some(step) = step_array {
step.value(idx)
} else {
list_builder.append_null();
continue;
};

let (months, days, _) = IntervalMonthDayNanoType::to_parts(step);

if months == 0 && days == 0 {
return exec_err!("Cannot generate date range less than 1 day.");
}

let neg = months < 0 || days < 0;
if !include_upper {
stop = Date32Type::subtract_month_day_nano(stop, step);
}
let mut new_date = start;
loop {
if neg && new_date < stop || !neg && new_date > stop {
break;

let values = from_fn(|| {
if (neg && new_date < stop) || (!neg && new_date > stop) {
None
} else {
let current_date = new_date;
new_date = Date32Type::add_month_day_nano(new_date, step);
Some(Some(current_date))
}
values.push(new_date);
new_date = Date32Type::add_month_day_nano(new_date, step);
}
offsets.push(values.len() as i32);
});

list_builder.append_value(values);
}

let arr = Arc::new(ListArray::try_new(
Arc::new(Field::new("item", Date32, true)),
OffsetBuffer::new(offsets.into()),
Arc::new(Date32Array::from(values)),
None,
)?);
let arr = Arc::new(list_builder.finish());

Ok(arr)
}
18 changes: 15 additions & 3 deletions datafusion/sqllogictest/test_files/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1971,7 +1971,7 @@ select array_slice(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), co
# `from` may be larger than `to` and `stride` is positive
query ????
select array_slice(a, -1, 2, 1), array_slice(a, -1, 2),
array_slice(a, 3, 2, 1), array_slice(a, 3, 2)
array_slice(a, 3, 2, 1), array_slice(a, 3, 2)
from (values ([1.0, 2.0, 3.0, 3.0]), ([4.0, 5.0, 3.0]), ([6.0])) t(a);
----
[] [] [] []
Expand Down Expand Up @@ -5711,7 +5711,7 @@ select

# Test range for other edge cases
query ????????
select
select
range(9223372036854775807, 9223372036854775807, -1) as c1,
range(9223372036854775807, 9223372036854775806, -1) as c2,
range(9223372036854775807, 9223372036854775807, 1) as c3,
Expand Down Expand Up @@ -5787,6 +5787,9 @@ select range(DATE '1993-03-01', DATE '1989-04-01', INTERVAL '1' YEAR)
----
[]

query error DataFusion error: Execution error: Cannot generate date range less than 1 day\.
select range(DATE '1993-03-01', DATE '1993-03-01', INTERVAL '1' HOUR)

query ?????????
select generate_series(5),
generate_series(2, 5),
Expand All @@ -5801,6 +5804,9 @@ select generate_series(5),
----
[0, 1, 2, 3, 4, 5] [2, 3, 4, 5] [2, 5, 8] [1, 2, 3, 4, 5] [5, 4, 3, 2, 1] [10, 7, 4] [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01, 1993-03-01] [1993-02-01, 1993-01-31, 1993-01-30, 1993-01-29, 1993-01-28, 1993-01-27, 1993-01-26, 1993-01-25, 1993-01-24, 1993-01-23, 1993-01-22, 1993-01-21, 1993-01-20, 1993-01-19, 1993-01-18, 1993-01-17, 1993-01-16, 1993-01-15, 1993-01-14, 1993-01-13, 1993-01-12, 1993-01-11, 1993-01-10, 1993-01-09, 1993-01-08, 1993-01-07, 1993-01-06, 1993-01-05, 1993-01-04, 1993-01-03, 1993-01-02, 1993-01-01] [1989-04-01, 1990-04-01, 1991-04-01, 1992-04-01]

query error DataFusion error: Execution error: unsupported type for range. Expected Int64 or Date32, got: Timestamp\(Nanosecond, None\)
select generate_series('2021-01-01'::timestamp, '2021-01-02'::timestamp, INTERVAL '1' HOUR);

## should return NULL
query ?
select generate_series(DATE '1992-09-01', NULL, INTERVAL '1' YEAR);
Expand Down Expand Up @@ -5832,6 +5838,12 @@ select generate_series(DATE '1993-03-01', DATE '1989-04-01', INTERVAL '1' YEAR)
----
[]

query error DataFusion error: Execution error: Cannot generate date range less than 1 day.
select generate_series(DATE '2000-01-01', DATE '2000-01-03', INTERVAL '1' HOUR)

query error DataFusion error: Execution error: Cannot generate date range less than 1 day.
select generate_series(DATE '2000-01-01', DATE '2000-01-03', INTERVAL '-1' HOUR)

# Test generate_series with zero step
query error DataFusion error: Execution error: step can't be 0 for function generate_series\(start \[, stop, step\]\)
select generate_series(1, 1, 0);
Expand All @@ -5849,7 +5861,7 @@ select

# Test generate_series for other edge cases
query ????
select
select
generate_series(9223372036854775807, 9223372036854775807, -1) as c1,
generate_series(9223372036854775807, 9223372036854775807, 1) as c2,
generate_series(-9223372036854775808, -9223372036854775808, -1) as c3,
Expand Down

0 comments on commit 7d6cd13

Please sign in to comment.