diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f7acc896cb25..3b79e573c096 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -88,6 +88,9 @@ jobs: - name: Check function packages (array_expressions) run: cargo check --no-default-features --features=array_expressions -p datafusion + - name: Check function packages (datetime_expressions) + run: cargo check --no-default-features --features=datetime_expressions -p datafusion + - name: Check Cargo.lock for datafusion-cli run: | # If this test fails, try running `cargo update` in the `datafusion-cli` directory diff --git a/README.md b/README.md index 44e06e1b6a92..634aa426bdff 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ Default features: - `array_expressions`: functions for working with arrays such as `array_to_string` - `compression`: reading files compressed with `xz2`, `bzip2`, `flate2`, and `zstd` - `crypto_expressions`: cryptographic functions such as `md5` and `sha256` +- `datetime_expressions`: date and time functions such as `to_timestamp` - `encoding_expressions`: `encode` and `decode` functions - `parquet`: support for reading the [Apache Parquet] format - `regex_expressions`: regular expression functions, such as `regexp_match` diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 916333f25a60..2379a30ce10f 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1231,11 +1231,14 @@ name = "datafusion-functions" version = "36.0.0" dependencies = [ "arrow", + "arrow-array", "base64", + "chrono", "datafusion-common", "datafusion-execution", "datafusion-expr", "hex", + "itertools", "log", ] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 147e4329509a..ad506762f0d0 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -38,6 +38,7 @@ clap = { version = "3", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "36.0.0", features = [ 
"avro", "crypto_expressions", + "datetime_expressions", "encoding_expressions", "parquet", "regex_expressions", diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 8079c3e14117..9739b44aafa0 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -15,8 +15,12 @@ // specific language governing permissions and limitations // under the License. +use std::collections::HashMap; +use std::sync::Arc; + use arrow::array::{BooleanArray, Int32Array}; use arrow::record_batch::RecordBatch; + use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion::common::{DFField, DFSchema}; use datafusion::error::Result; @@ -30,8 +34,6 @@ use datafusion_common::{ScalarValue, ToDFSchema}; use datafusion_expr::expr::BinaryExpr; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::{ColumnarValue, ExprSchemable, Operator}; -use std::collections::HashMap; -use std::sync::Arc; /// This example demonstrates the DataFusion [`Expr`] API. /// @@ -113,10 +115,7 @@ fn evaluate_demo() -> Result<()> { fn simplify_demo() -> Result<()> { // For example, lets say you have has created an expression such // ts = to_timestamp("2020-09-08T12:00:00+00:00") - let expr = col("ts").eq(call_fn( - "to_timestamp", - vec![lit("2020-09-08T12:00:00+00:00")], - )?); + let expr = col("ts").eq(to_timestamp(vec![lit("2020-09-08T12:00:00+00:00")])); // Naively evaluating such an expression against a large number of // rows would involve re-converting "2020-09-08T12:00:00+00:00" to a diff --git a/datafusion/common/src/format.rs b/datafusion/common/src/format.rs index d5421c36cd73..484a7f2388f5 100644 --- a/datafusion/common/src/format.rs +++ b/datafusion/common/src/format.rs @@ -15,8 +15,15 @@ // specific language governing permissions and limitations // under the License. 
+use arrow::compute::CastOptions; use arrow::util::display::{DurationFormat, FormatOptions}; /// The default [`FormatOptions`] to use within DataFusion pub const DEFAULT_FORMAT_OPTIONS: FormatOptions<'static> = FormatOptions::new().with_duration_format(DurationFormat::Pretty); + +/// The default [`CastOptions`] to use within DataFusion +pub const DEFAULT_CAST_OPTIONS: CastOptions<'static> = CastOptions { + safe: false, + format_options: DEFAULT_FORMAT_OPTIONS, +}; diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 019b4e58728c..662d95a9323c 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -43,9 +43,11 @@ avro = ["apache-avro", "num-traits", "datafusion-common/avro"] backtrace = ["datafusion-common/backtrace"] compression = ["xz2", "bzip2", "flate2", "zstd", "async-compression", "tokio-util"] crypto_expressions = ["datafusion-physical-expr/crypto_expressions", "datafusion-optimizer/crypto_expressions"] +datetime_expressions = ["datafusion-functions/datetime_expressions"] default = [ "array_expressions", "crypto_expressions", + "datetime_expressions", "encoding_expressions", "regex_expressions", "unicode_expressions", diff --git a/datafusion/core/tests/simplification.rs b/datafusion/core/tests/simplification.rs index b6d856b2d9a0..5fe64ca5bf04 100644 --- a/datafusion/core/tests/simplification.rs +++ b/datafusion/core/tests/simplification.rs @@ -18,10 +18,22 @@ //! This program demonstrates the DataFusion expression simplification API. 
use arrow::datatypes::{DataType, Field, Schema}; +use arrow_array::{ArrayRef, Int32Array}; +use chrono::{DateTime, TimeZone, Utc}; use datafusion::common::DFSchema; use datafusion::{error::Result, execution::context::ExecutionProps, prelude::*}; -use datafusion_expr::{Expr, ExprSchemable}; -use datafusion_optimizer::simplify_expressions::{ExprSimplifier, SimplifyInfo}; +use datafusion_common::cast::as_int32_array; +use datafusion_common::ScalarValue; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::{ + expr, table_scan, BuiltinScalarFunction, Cast, ColumnarValue, Expr, ExprSchemable, + LogicalPlan, LogicalPlanBuilder, ScalarUDF, Volatility, +}; +use datafusion_optimizer::simplify_expressions::{ + ExprSimplifier, SimplifyExpressions, SimplifyInfo, +}; +use datafusion_optimizer::{OptimizerContext, OptimizerRule}; +use std::sync::Arc; /// In order to simplify expressions, DataFusion must have information /// about the expressions. @@ -79,6 +91,110 @@ fn schema() -> DFSchema { .unwrap() } +fn test_table_scan() -> LogicalPlan { + let schema = Schema::new(vec![ + Field::new("a", DataType::Boolean, false), + Field::new("b", DataType::Boolean, false), + Field::new("c", DataType::Boolean, false), + Field::new("d", DataType::UInt32, false), + Field::new("e", DataType::UInt32, true), + ]); + table_scan(Some("test"), &schema, None) + .expect("creating scan") + .build() + .expect("building plan") +} + +fn get_optimized_plan_formatted(plan: &LogicalPlan, date_time: &DateTime) -> String { + let config = OptimizerContext::new().with_query_execution_start_time(*date_time); + let rule = SimplifyExpressions::new(); + + let optimized_plan = rule + .try_optimize(plan, &config) + .unwrap() + .expect("failed to optimize plan"); + format!("{optimized_plan:?}") +} + +// ------------------------------ +// --- ConstEvaluator tests ----- +// ------------------------------ +fn test_evaluate_with_start_time( + input_expr: Expr, + expected_expr: Expr, + date_time: &DateTime, +) 
{ + let execution_props = + ExecutionProps::new().with_query_execution_start_time(*date_time); + + let info: MyInfo = MyInfo { + schema: schema(), + execution_props, + }; + let simplifier = ExprSimplifier::new(info); + let simplified_expr = simplifier + .simplify(input_expr.clone()) + .expect("successfully evaluated"); + + assert_eq!( + simplified_expr, expected_expr, + "Mismatch evaluating {input_expr}\n Expected:{expected_expr}\n Got:{simplified_expr}" + ); +} + +fn test_evaluate(input_expr: Expr, expected_expr: Expr) { + test_evaluate_with_start_time(input_expr, expected_expr, &Utc::now()) +} + +// Make a UDF that adds its two values together, with the specified volatility +fn make_udf_add(volatility: Volatility) -> Arc { + let input_types = vec![DataType::Int32, DataType::Int32]; + let return_type = Arc::new(DataType::Int32); + + let fun = Arc::new(|args: &[ColumnarValue]| { + let args = ColumnarValue::values_to_arrays(args)?; + + let arg0 = as_int32_array(&args[0])?; + let arg1 = as_int32_array(&args[1])?; + + // 2. 
perform the computation + let array = arg0 + .iter() + .zip(arg1.iter()) + .map(|args| { + if let (Some(arg0), Some(arg1)) = args { + Some(arg0 + arg1) + } else { + // one or both args were Null + None + } + }) + .collect::(); + + Ok(ColumnarValue::from(Arc::new(array) as ArrayRef)) + }); + + Arc::new(create_udf( + "udf_add", + input_types, + return_type, + volatility, + fun, + )) +} + +fn now_expr() -> Expr { + call_fn("now", vec![]).unwrap() +} + +fn cast_to_int64_expr(expr: Expr) -> Expr { + Expr::Cast(Cast::new(expr.into(), DataType::Int64)) +} + +fn to_timestamp_expr(arg: impl Into) -> Expr { + to_timestamp(vec![lit(arg.into())]) +} + #[test] fn basic() { let info: MyInfo = schema().into(); @@ -108,3 +224,190 @@ fn fold_and_simplify() { let simplified = simplifier.simplify(expr).unwrap(); assert_eq!(simplified, lit(true)) } + +#[test] +/// Ensure that timestamp expressions are folded so they aren't invoked on each row +fn to_timestamp_expr_folded() -> Result<()> { + let table_scan = test_table_scan(); + let proj = vec![to_timestamp_expr("2020-09-08T12:00:00+00:00")]; + + let plan = LogicalPlanBuilder::from(table_scan) + .project(proj)? + .build()?; + + let expected = "Projection: TimestampNanosecond(1599566400000000000, None) AS to_timestamp(Utf8(\"2020-09-08T12:00:00+00:00\"))\ + \n TableScan: test" + .to_string(); + let actual = get_optimized_plan_formatted(&plan, &Utc::now()); + assert_eq!(expected, actual); + Ok(()) +} + +#[test] +fn now_less_than_timestamp() -> Result<()> { + let table_scan = test_table_scan(); + + let ts_string = "2020-09-08T12:05:00+00:00"; + let time = Utc.timestamp_nanos(1599566400000000000i64); + + // cast(now() as int) < cast(to_timestamp(...) as int) + 50000_i64 + let plan = LogicalPlanBuilder::from(table_scan) + .filter( + cast_to_int64_expr(now_expr()) + .lt(cast_to_int64_expr(to_timestamp_expr(ts_string)) + lit(50000_i64)), + )? 
+ .build()?; + + // Note that constant folder runs and folds the entire + // expression down to a single constant (true) + let expected = "Filter: Boolean(true)\ + \n TableScan: test"; + let actual = get_optimized_plan_formatted(&plan, &time); + + assert_eq!(expected, actual); + Ok(()) +} + +#[test] +fn select_date_plus_interval() -> Result<()> { + let table_scan = test_table_scan(); + + let ts_string = "2020-09-08T12:05:00+00:00"; + let time = Utc.timestamp_nanos(1599566400000000000i64); + + // cast(to_timestamp(...) as Date32) + IntervalDayTime(123 << 32) + let schema = table_scan.schema(); + + let date_plus_interval_expr = to_timestamp_expr(ts_string) + .cast_to(&DataType::Date32, schema)? + + Expr::Literal(ScalarValue::IntervalDayTime(Some(123i64 << 32))); + + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .project(vec![date_plus_interval_expr])? + .build()?; + + // Note that constant folder runs and folds the entire + // expression down to a single constant (a Date32 literal) + let expected = r#"Projection: Date32("18636") AS to_timestamp(Utf8("2020-09-08T12:05:00+00:00")) + IntervalDayTime("528280977408") + TableScan: test"#; + let actual = get_optimized_plan_formatted(&plan, &time); + + assert_eq!(expected, actual); + Ok(()) +} + +#[test] +fn test_const_evaluator() { + // true --> true + test_evaluate(lit(true), lit(true)); + // true or true --> true + test_evaluate(lit(true).or(lit(true)), lit(true)); + // true or false --> true + test_evaluate(lit(true).or(lit(false)), lit(true)); + + // "foo" == "foo" --> true + test_evaluate(lit("foo").eq(lit("foo")), lit(true)); + // "foo" != "foo" --> false + test_evaluate(lit("foo").not_eq(lit("foo")), lit(false)); + + // c = 1 --> c = 1 + test_evaluate(col("c").eq(lit(1)), col("c").eq(lit(1))); + // c = 1 + 2 --> c = 3 + test_evaluate(col("c").eq(lit(1) + lit(2)), col("c").eq(lit(3))); + // (foo != foo) OR (c = 1) --> c = 1 + test_evaluate( + (lit("foo").not_eq(lit("foo"))).or(col("c").eq(lit(1))), +
col("c").eq(lit(1)), + ); +} + +#[test] +fn test_const_evaluator_scalar_functions() { + // concat("foo", "bar") --> "foobar" + let expr = call_fn("concat", vec![lit("foo"), lit("bar")]).unwrap(); + test_evaluate(expr, lit("foobar")); + + // ensure arguments are also constant folded + // concat("foo", concat("bar", "baz")) --> "foobarbaz" + let concat1 = call_fn("concat", vec![lit("bar"), lit("baz")]).unwrap(); + let expr = call_fn("concat", vec![lit("foo"), concat1]).unwrap(); + test_evaluate(expr, lit("foobarbaz")); + + // Check non string arguments + // to_timestamp("2020-09-08T12:00:00+00:00") --> timestamp_nanos(1599566400000000000i64) + let expr = to_timestamp(vec![lit("2020-09-08T12:00:00+00:00")]); + test_evaluate(expr, lit_timestamp_nano(1599566400000000000i64)); + + // check that non foldable arguments are not folded + // to_timestamp(a) --> to_timestamp(a) [no rewrite possible] + let expr = to_timestamp(vec![col("a")]); + test_evaluate(expr.clone(), expr); + + // volatile functions should not be evaluated + // rand() + (1 + 2) --> rand() + 3 + let fun = BuiltinScalarFunction::Random; + assert_eq!(fun.volatility(), Volatility::Volatile); + let rand = Expr::ScalarFunction(ScalarFunction::new(fun, vec![])); + let expr = rand.clone() + (lit(1) + lit(2)); + let expected = rand + lit(3); + test_evaluate(expr, expected); + + // parenthesization matters: can't rewrite + // (rand() + 1) + 2 --> (rand() + 1) + 2 + let fun = BuiltinScalarFunction::Random; + let rand = Expr::ScalarFunction(ScalarFunction::new(fun, vec![])); + let expr = (rand + lit(1)) + lit(2); + test_evaluate(expr.clone(), expr); +} + +#[test] +fn test_const_evaluator_now() { + let ts_nanos = 1599566400000000000i64; + let time = chrono::Utc.timestamp_nanos(ts_nanos); + let ts_string = "2020-09-08T12:05:00+00:00"; + // now() --> ts + test_evaluate_with_start_time(now_expr(), lit_timestamp_nano(ts_nanos), &time); + + // CAST(now() as int64) + 100_i64 --> ts + 100_i64 + let expr =
cast_to_int64_expr(now_expr()) + lit(100_i64); + test_evaluate_with_start_time(expr, lit(ts_nanos + 100), &time); + + // CAST(now() as int64) < cast(to_timestamp(...) as int64) + 50000_i64 ---> true + let expr = cast_to_int64_expr(now_expr()) + .lt(cast_to_int64_expr(to_timestamp_expr(ts_string)) + lit(50000i64)); + test_evaluate_with_start_time(expr, lit(true), &time); +} + +#[test] +fn test_evaluator_udfs() { + let args = vec![lit(1) + lit(2), lit(30) + lit(40)]; + let folded_args = vec![lit(3), lit(70)]; + + // immutable UDF should get folded + // udf_add(1+2, 30+40) --> 73 + let expr = Expr::ScalarFunction(expr::ScalarFunction::new_udf( + make_udf_add(Volatility::Immutable), + args.clone(), + )); + test_evaluate(expr, lit(73)); + + // stable UDF should be entirely folded + // udf_add(1+2, 30+40) --> 73 + let fun = make_udf_add(Volatility::Stable); + let expr = Expr::ScalarFunction(expr::ScalarFunction::new_udf( + Arc::clone(&fun), + args.clone(), + )); + test_evaluate(expr, lit(73)); + + // volatile UDF should have args folded + // udf_add(1+2, 30+40) --> udf_add(3, 70) + let fun = make_udf_add(Volatility::Volatile); + let expr = + Expr::ScalarFunction(expr::ScalarFunction::new_udf(Arc::clone(&fun), args)); + let expected_expr = Expr::ScalarFunction(expr::ScalarFunction::new_udf( + Arc::clone(&fun), + folded_args, + )); + test_evaluate(expr, expected_expr); +} diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index d10045ff9f60..91e3acd0f7bb 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -263,20 +263,8 @@ pub enum BuiltinScalarFunction { Substr, /// to_hex ToHex, - /// to_timestamp - ToTimestamp, - /// to_timestamp_millis - ToTimestampMillis, - /// to_timestamp_micros - ToTimestampMicros, - /// to_timestamp_nanos - ToTimestampNanos, - /// to_timestamp_seconds - ToTimestampSeconds, /// from_unixtime FromUnixtime, - /// to_date - ToDate, ///now Now, 
///current_date @@ -463,18 +451,12 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Substr => Volatility::Immutable, BuiltinScalarFunction::ToHex => Volatility::Immutable, BuiltinScalarFunction::ToChar => Volatility::Immutable, - BuiltinScalarFunction::ToTimestamp => Volatility::Immutable, - BuiltinScalarFunction::ToTimestampMillis => Volatility::Immutable, - BuiltinScalarFunction::ToTimestampMicros => Volatility::Immutable, - BuiltinScalarFunction::ToTimestampNanos => Volatility::Immutable, - BuiltinScalarFunction::ToTimestampSeconds => Volatility::Immutable, BuiltinScalarFunction::MakeDate => Volatility::Immutable, BuiltinScalarFunction::Translate => Volatility::Immutable, BuiltinScalarFunction::Trim => Volatility::Immutable, BuiltinScalarFunction::Upper => Volatility::Immutable, BuiltinScalarFunction::Struct => Volatility::Immutable, BuiltinScalarFunction::FromUnixtime => Volatility::Immutable, - BuiltinScalarFunction::ToDate => Volatility::Immutable, BuiltinScalarFunction::ArrowTypeof => Volatility::Immutable, BuiltinScalarFunction::OverLay => Volatility::Immutable, BuiltinScalarFunction::Levenshtein => Volatility::Immutable, @@ -568,7 +550,7 @@ impl BuiltinScalarFunction { _ => { return plan_err!( "The {self} function can only accept list as the args." 
- ) + ); } } } @@ -778,13 +760,7 @@ impl BuiltinScalarFunction { utf8_to_int_type(&input_expr_types[0], "find_in_set") } BuiltinScalarFunction::ToChar => Ok(Utf8), - BuiltinScalarFunction::ToTimestamp - | BuiltinScalarFunction::ToTimestampNanos => Ok(Timestamp(Nanosecond, None)), - BuiltinScalarFunction::ToTimestampMillis => Ok(Timestamp(Millisecond, None)), - BuiltinScalarFunction::ToTimestampMicros => Ok(Timestamp(Microsecond, None)), - BuiltinScalarFunction::ToTimestampSeconds => Ok(Timestamp(Second, None)), BuiltinScalarFunction::FromUnixtime => Ok(Timestamp(Second, None)), - BuiltinScalarFunction::ToDate => Ok(Date32), BuiltinScalarFunction::Now => { Ok(Timestamp(Nanosecond, Some("+00:00".into()))) } @@ -1052,17 +1028,9 @@ impl BuiltinScalarFunction { ], self.volatility(), ), - BuiltinScalarFunction::ToTimestamp - | BuiltinScalarFunction::ToTimestampSeconds - | BuiltinScalarFunction::ToTimestampMillis - | BuiltinScalarFunction::ToTimestampMicros - | BuiltinScalarFunction::ToTimestampNanos => { - Signature::variadic_any(self.volatility()) - } BuiltinScalarFunction::FromUnixtime => { Signature::uniform(1, vec![Int64], self.volatility()) } - BuiltinScalarFunction::ToDate => Signature::variadic_any(self.volatility()), BuiltinScalarFunction::Digest => Signature::one_of( vec![ Exact(vec![Utf8, Utf8]), @@ -1489,13 +1457,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::DateTrunc => &["date_trunc", "datetrunc"], BuiltinScalarFunction::DatePart => &["date_part", "datepart"], BuiltinScalarFunction::ToChar => &["to_char", "date_format"], - BuiltinScalarFunction::ToTimestamp => &["to_timestamp"], - BuiltinScalarFunction::ToTimestampMillis => &["to_timestamp_millis"], - BuiltinScalarFunction::ToTimestampMicros => &["to_timestamp_micros"], - BuiltinScalarFunction::ToTimestampSeconds => &["to_timestamp_seconds"], - BuiltinScalarFunction::ToTimestampNanos => &["to_timestamp_nanos"], BuiltinScalarFunction::FromUnixtime => &["from_unixtime"], - 
BuiltinScalarFunction::ToDate => &["to_date"], // hashing functions BuiltinScalarFunction::Digest => &["digest"], diff --git a/datafusion/expr/src/columnar_value.rs b/datafusion/expr/src/columnar_value.rs index c845c81cb708..831edc078d6a 100644 --- a/datafusion/expr/src/columnar_value.rs +++ b/datafusion/expr/src/columnar_value.rs @@ -19,7 +19,9 @@ use arrow::array::ArrayRef; use arrow::array::NullArray; -use arrow::datatypes::DataType; +use arrow::compute::{kernels, CastOptions}; +use arrow::datatypes::{DataType, TimeUnit}; +use datafusion_common::format::DEFAULT_CAST_OPTIONS; use datafusion_common::{internal_err, Result, ScalarValue}; use std::sync::Arc; @@ -122,6 +124,42 @@ impl ColumnarValue { Ok(args) } + + /// Casts this [ColumnarValue] to the specified `DataType` + pub fn cast_to( + &self, + cast_type: &DataType, + cast_options: Option<&CastOptions<'static>>, + ) -> Result { + let cast_options = cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS); + match self { + ColumnarValue::Array(array) => Ok(ColumnarValue::Array( + kernels::cast::cast_with_options(array, cast_type, &cast_options)?, + )), + ColumnarValue::Scalar(scalar) => { + let scalar_array = + if cast_type == &DataType::Timestamp(TimeUnit::Nanosecond, None) { + if let ScalarValue::Float64(Some(float_ts)) = scalar { + ScalarValue::Int64(Some( + (float_ts * 1_000_000_000_f64).trunc() as i64, + )) + .to_array()? + } else { + scalar.to_array()? + } + } else { + scalar.to_array()?
+ }; + let cast_array = kernels::cast::cast_with_options( + &scalar_array, + cast_type, + &cast_options, + )?; + let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; + Ok(ColumnarValue::Scalar(cast_scalar)) + } + } + } } #[cfg(test)] diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index cb68cae1161d..157b8b0989df 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -881,36 +881,6 @@ scalar_expr!( datetime format, "converts a date, time, timestamp or duration to a string based on the provided format" ); -nary_scalar_expr!( - ToDate, - to_date, - "converts string to date according to the given format" -); -nary_scalar_expr!( - ToTimestamp, - to_timestamp, - "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`" -); -nary_scalar_expr!( - ToTimestampMillis, - to_timestamp_millis, - "converts a string and optional formats to a `Timestamp(Milliseconds, None)`" -); -nary_scalar_expr!( - ToTimestampMicros, - to_timestamp_micros, - "converts a string and optional formats to a `Timestamp(Microseconds, None)`" -); -nary_scalar_expr!( - ToTimestampNanos, - to_timestamp_nanos, - "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`" -); -nary_scalar_expr!( - ToTimestampSeconds, - to_timestamp_seconds, - "converts a string and optional formats to a `Timestamp(Seconds, None)`" -); scalar_expr!( FromUnixtime, from_unixtime, diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index c7edb52fc73e..e890c9623ca3 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -31,8 +31,10 @@ rust-version = { workspace = true } [features] # enable core functions core_expressions = [] +# enable datetime functions +datetime_expressions = [] # Enable encoding by default so the doctests work. In general don't automatically enable all packages. 
-default = ["core_expressions", "encoding_expressions", "math_expressions", "regex_expressions"] +default = ["core_expressions", "datetime_expressions", "encoding_expressions", "math_expressions", "regex_expressions"] # enable encode/decode functions encoding_expressions = ["base64", "hex"] # enable math functions @@ -47,9 +49,22 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } +arrow-array = { workspace = true } base64 = { version = "0.21", optional = true } -datafusion-common = { workspace = true, default-features = true } +chrono = { workspace = true } +datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } hex = { version = "0.4", optional = true } +itertools = { workspace = true } log = { workspace = true } + +[dev-dependencies] +criterion = "0.5" +rand = { workspace = true } +rstest = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt", "sync"] } + +[[bench]] +harness = false +name = "to_timestamp" diff --git a/datafusion/functions/benches/to_timestamp.rs b/datafusion/functions/benches/to_timestamp.rs new file mode 100644 index 000000000000..c83824526442 --- /dev/null +++ b/datafusion/functions/benches/to_timestamp.rs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +use datafusion_expr::lit; +use datafusion_functions::expr_fn::to_timestamp; + +fn criterion_benchmark(c: &mut Criterion) { + c.bench_function("to_timestamp_no_formats", |b| { + let inputs = vec![ + lit("1997-01-31T09:26:56.123Z"), + lit("1997-01-31T09:26:56.123-05:00"), + lit("1997-01-31 09:26:56.123-05:00"), + lit("2023-01-01 04:05:06.789 -08"), + lit("1997-01-31T09:26:56.123"), + lit("1997-01-31 09:26:56.123"), + lit("1997-01-31 09:26:56"), + lit("1997-01-31 13:26:56"), + lit("1997-01-31 13:26:56+04:00"), + lit("1997-01-31"), + ]; + b.iter(|| { + for i in inputs.iter() { + black_box(to_timestamp(vec![i.clone()])); + } + }); + }); + + c.bench_function("to_timestamp_with_formats", |b| { + let mut inputs = vec![]; + let mut format1 = vec![]; + let mut format2 = vec![]; + let mut format3 = vec![]; + + inputs.push(lit("1997-01-31T09:26:56.123Z")); + format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%dT%H:%M:%S%.f%Z")); + + inputs.push(lit("1997-01-31T09:26:56.123-05:00")); + format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%dT%H:%M:%S%.f%z")); + + inputs.push(lit("1997-01-31 09:26:56.123-05:00")); + format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%d %H:%M:%S%.f%Z")); + + inputs.push(lit("2023-01-01 04:05:06.789 -08")); + format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%d %H:%M:%S%.f %#z")); + + inputs.push(lit("1997-01-31T09:26:56.123")); + format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%dT%H:%M:%S%.f")); + + inputs.push(lit("1997-01-31 09:26:56.123")); + format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%d %H:%M:%S%.f")); + + inputs.push(lit("1997-01-31 09:26:56")); + 
format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%d %H:%M:%S")); + + inputs.push(lit("1997-01-31 092656")); + format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%d %H%M%S")); + + inputs.push(lit("1997-01-31 092656+04:00")); + format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%d %H%M%S%:z")); + + inputs.push(lit("Sun Jul 8 00:34:60 2001")); + format1.push(lit("%+")); + format2.push(lit("%c")); + format3.push(lit("%Y-%m-%d 00:00:00")); + + b.iter(|| { + inputs.iter().enumerate().for_each(|(idx, i)| { + black_box(to_timestamp(vec![ + i.clone(), + format1.get(idx).unwrap().clone(), + format2.get(idx).unwrap().clone(), + format3.get(idx).unwrap().clone(), + ])); + }) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs new file mode 100644 index 000000000000..829f13b5a0a6 --- /dev/null +++ b/datafusion/functions/src/datetime/common.rs @@ -0,0 +1,389 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; +use arrow::datatypes::DataType; +use arrow_array::{ + Array, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray, +}; +use chrono::LocalResult::Single; +use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; +use itertools::Either; + +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::{exec_err, DataFusionError, Result, ScalarType, ScalarValue}; +use datafusion_expr::ColumnarValue; + +/// Error message if nanosecond conversion request beyond supported interval +const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; + +/// Calls string_to_timestamp_nanos and converts the error type +pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result { + string_to_timestamp_nanos(s).map_err(|e| e.into()) +} + +pub(crate) fn validate_data_types( + args: &[ColumnarValue], + name: &str, +) -> Option> { + for (idx, a) in args.iter().skip(1).enumerate() { + match a.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + // all good + } + _ => { + return Some(exec_err!( + "{name} function unsupported data type at index {}: {}", + idx + 1, + a.data_type() + )); + } + } + } + + None +} + +/// Accepts a string and parses it using the [`chrono::format::strftime`] specifiers +/// relative to the provided `timezone` +/// +/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled +/// +/// * `2023-01-01 040506 America/Los_Angeles` +/// +/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error +/// will be returned +/// +/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html +/// [IANA timezones]: https://www.iana.org/time-zones +pub(crate) fn string_to_datetime_formatted( + timezone: &T, + s: &str, + format: &str, +) -> 
Result, DataFusionError> { + let err = |err_ctx: &str| { + DataFusionError::Execution(format!( + "Error parsing timestamp from '{s}' using format '{format}': {err_ctx}" + )) + }; + + // attempt to parse the string assuming it has a timezone + let dt = DateTime::parse_from_str(s, format); + + if let Err(e) = &dt { + // no timezone or other failure, try without a timezone + let ndt = NaiveDateTime::parse_from_str(s, format); + if let Err(e) = &ndt { + return Err(err(&e.to_string())); + } + + if let Single(e) = &timezone.from_local_datetime(&ndt.unwrap()) { + Ok(e.to_owned()) + } else { + Err(err(&e.to_string())) + } + } else { + Ok(dt.unwrap().with_timezone(timezone)) + } +} + +/// Accepts a string with a `chrono` format and converts it to a +/// nanosecond precision timestamp. +/// +/// See [`chrono::format::strftime`] for the full set of supported formats. +/// +/// Implements the `to_timestamp` function to convert a string to a +/// timestamp, following the model of Spark SQL’s `to_timestamp`. +/// +/// Internally, this function uses the `chrono` library for the +/// datetime parsing. +/// +/// ## Timestamp Precision +/// +/// Function uses the maximum precision timestamps supported by +/// Arrow (nanoseconds stored as a 64-bit integer). This +/// means the range of dates that timestamps can represent is ~1677 AD +/// to 2262 AD +/// +/// ## Timezone / Offset Handling +/// +/// Numerical values of timestamps are stored relative to UTC. +/// +/// Any timezone or offset in the formatting string is handled according to the rules +/// defined by `chrono`. +/// +/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html +/// +#[inline] +pub(crate) fn string_to_timestamp_nanos_formatted( + s: &str, + format: &str, +) -> Result { + string_to_datetime_formatted(&Utc, s, format)?
+ .naive_utc() + .timestamp_nanos_opt() + .ok_or_else(|| { + DataFusionError::Execution(ERR_NANOSECONDS_NOT_SUPPORTED.to_string()) + }) +} + +pub(crate) fn handle<'a, O, F, S>( + args: &'a [ColumnarValue], + op: F, + name: &str, +) -> Result +where + O: ArrowPrimitiveType, + S: ScalarType, + F: Fn(&'a str) -> Result, +{ + match &args[0] { + ColumnarValue::Array(a) => match a.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => Ok(ColumnarValue::Array(Arc::new( + unary_string_to_primitive_function::(&[a.as_ref()], op, name)?, + ))), + other => exec_err!("Unsupported data type {other:?} for function {name}"), + }, + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => { + let result = a.as_ref().map(|x| (op)(x)).transpose()?; + Ok(ColumnarValue::Scalar(S::scalar(result))) + } + other => exec_err!("Unsupported data type {other:?} for function {name}"), + }, + } +} + +// given a function that maps a `&str`, `&str` to an arrow native type, +// returns a `ColumnarValue` where the function is applied to either an `ArrayRef` or `ScalarValue` +// depending on the `args`'s variant. 
+pub(crate) fn handle_multiple<'a, O, F, S, M>( + args: &'a [ColumnarValue], + op: F, + op2: M, + name: &str, +) -> Result +where + O: ArrowPrimitiveType, + S: ScalarType, + F: Fn(&'a str, &'a str) -> Result, + M: Fn(O::Native) -> O::Native, +{ + match &args[0] { + ColumnarValue::Array(a) => match a.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + // validate the column types + for (pos, arg) in args.iter().enumerate() { + match arg { + ColumnarValue::Array(arg) => match arg.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + // all good + } + other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"), + }, + ColumnarValue::Scalar(arg) => { + match arg.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + // all good + } + other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"), + } + } + } + } + + Ok(ColumnarValue::Array(Arc::new( + strings_to_primitive_function::(args, op, op2, name)?, + ))) + } + other => { + exec_err!("Unsupported data type {other:?} for function {name}") + } + }, + // if the first argument is a scalar utf8 all arguments are expected to be scalar utf8 + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => { + let mut val: Option> = None; + let mut err: Option = None; + + match a { + Some(a) => { + // enumerate all the values finding the first one that returns an Ok result + for (pos, v) in args.iter().enumerate().skip(1) { + if let ColumnarValue::Scalar(s) = v { + if let ScalarValue::Utf8(x) | ScalarValue::LargeUtf8(x) = + s + { + if let Some(s) = x { + match op(a.as_str(), s.as_str()) { + Ok(r) => { + val = Some(Ok(ColumnarValue::Scalar( + S::scalar(Some(op2(r))), + ))); + break; + } + Err(e) => { + err = Some(e); + } + } + } + } else { + return exec_err!("Unsupported data type {s:?} for function {name}, arg # {pos}"); + } + } else { + return exec_err!("Unsupported data type {v:?} for function 
{name}, arg # {pos}"); + } + } + } + None => (), + } + + if let Some(v) = val { + v + } else { + Err(err.unwrap()) + } + } + other => { + exec_err!("Unsupported data type {other:?} for function {name}") + } + }, + } +} + +/// given a function `op` that maps `&str`, `&str` to the first successful Result +/// of an arrow native type, returns a `PrimitiveArray` after the application of the +/// function to `args` and the subsequent application of the `op2` function to any +/// successful result. This function calls the `op` function with the first and second +/// argument and if not successful continues with first and third, first and fourth, +/// etc until the result was successful or no more arguments are present. +/// # Errors +/// This function errors iff: +/// * the number of arguments is not > 1 or +/// * the array arguments are not castable to a `GenericStringArray` or +/// * the function `op` errors for all input +pub(crate) fn strings_to_primitive_function<'a, T, O, F, F2>( + args: &'a [ColumnarValue], + op: F, + op2: F2, + name: &str, +) -> Result> +where + O: ArrowPrimitiveType, + T: OffsetSizeTrait, + F: Fn(&'a str, &'a str) -> Result, + F2: Fn(O::Native) -> O::Native, +{ + if args.len() < 2 { + return exec_err!( + "{:?} args were supplied but {} takes 2 or more arguments", + args.len(), + name + ); + } + + // this will throw the error if any of the array args are not castable to GenericStringArray + let data = args + .iter() + .map(|a| match a { + ColumnarValue::Array(a) => { + Ok(Either::Left(as_generic_string_array::(a.as_ref())?)) + } + ColumnarValue::Scalar(s) => match s { + ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => Ok(Either::Right(a)), + other => exec_err!( + "Unexpected scalar type encountered '{other}' for function '{name}'" + ), + }, + }) + .collect::, &Option>>>>()?; + + let first_arg = &data.first().unwrap().left().unwrap(); + + first_arg + .iter() + .enumerate() + .map(|(pos, x)| { + let mut val = None; + + if let Some(x) = x { + 
let param_args = data.iter().skip(1); + + // go through the args and find the first successful result. Only the last + // failure will be returned if no successful result was received. + for param_arg in param_args { + // param_arg is an array, use the corresponding index into the array as the arg + // we're currently parsing + let p = *param_arg; + let r = if p.is_left() { + let p = p.left().unwrap(); + op(x, p.value(pos)) + } + // args is a scalar, use it directly + else if let Some(p) = p.right().unwrap() { + op(x, p.as_str()) + } else { + continue; + }; + + if r.is_ok() { + val = Some(Ok(op2(r.unwrap()))); + break; + } else { + val = Some(r); + } + } + }; + + val.transpose() + }) + .collect() +} + +/// given a function `op` that maps a `&str` to a Result of an arrow native type, +/// returns a `PrimitiveArray` after the application +/// of the function to `args[0]`. +/// # Errors +/// This function errors iff: +/// * the number of arguments is not 1 or +/// * the first argument is not castable to a `GenericStringArray` or +/// * the function `op` errors +fn unary_string_to_primitive_function<'a, T, O, F>( + args: &[&'a dyn Array], + op: F, + name: &str, +) -> Result> +where + O: ArrowPrimitiveType, + T: OffsetSizeTrait, + F: Fn(&'a str) -> Result, +{ + if args.len() != 1 { + return exec_err!( + "{:?} args were supplied but {} takes exactly one argument", + args.len(), + name + ); + } + + let array = as_generic_string_array::(args[0])?; + + // first map is the iterator, second is for the `Option<_>` + array.iter().map(|x| x.map(&op).transpose()).collect() +} diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs new file mode 100644 index 000000000000..233e8b2cdbb4 --- /dev/null +++ b/datafusion/functions/src/datetime/mod.rs @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! date & time DataFusion functions + +use std::sync::Arc; + +use datafusion_expr::ScalarUDF; + +mod common; +mod to_date; +mod to_timestamp; + +// create UDFs +make_udf_function!(to_date::ToDateFunc, TO_DATE, to_date); +make_udf_function!(to_timestamp::ToTimestampFunc, TO_TIMESTAMP, to_timestamp); +make_udf_function!( + to_timestamp::ToTimestampSecondsFunc, + TO_TIMESTAMP_SECONDS, + to_timestamp_seconds +); +make_udf_function!( + to_timestamp::ToTimestampMillisFunc, + TO_TIMESTAMP_MILLIS, + to_timestamp_millis +); +make_udf_function!( + to_timestamp::ToTimestampMicrosFunc, + TO_TIMESTAMP_MICROS, + to_timestamp_micros +); +make_udf_function!( + to_timestamp::ToTimestampNanosFunc, + TO_TIMESTAMP_NANOS, + to_timestamp_nanos +); + +// we cannot currently use the export_functions macro since it doesn't handle +// functions with varargs currently + +pub mod expr_fn { + use datafusion_expr::Expr; + + /// ```ignore + /// # use std::sync::Arc; + /// + /// # use datafusion_common::Result; + /// + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// # use arrow::array::StringArray; + /// # use arrow::datatypes::{DataType, Field, Schema}; + /// # use arrow::record_batch::RecordBatch; + /// # use datafusion_expr::col; + /// # use datafusion::prelude::*; + /// # use 
datafusion_functions::expr_fn::to_date; + /// + /// // define a schema. + /// let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); + /// + /// // define data. + /// let batch = RecordBatch::try_new( + /// schema, + /// vec![Arc::new(StringArray::from(vec![ + /// "2020-09-08T13:42:29Z", + /// "2020-09-08T13:42:29.190855-05:00", + /// "2020-08-09 12:13:29", + /// "2020-01-02", + /// ]))], + /// )?; + /// + /// // declare a new context. In spark API, this corresponds to a new spark SQLsession + /// let ctx = SessionContext::new(); + /// + /// // declare a table in memory. In spark API, this corresponds to createDataFrame(...). + /// ctx.register_batch("t", batch)?; + /// let df = ctx.table("t").await?; + /// + /// // use to_date function to convert col 'a' to Date32 type using the default parsing + /// let df = df.with_column("a", to_date(vec![col("a")]))?; + /// + /// let df = df.select_columns(&["a"])?; + /// + /// // print the results + /// df.show().await?; + /// + /// # Ok(()) + /// # } + /// ``` + pub fn to_date(args: Vec) -> Expr { + super::to_date().call(args) + } + + #[doc = "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`"] + pub fn to_timestamp(args: Vec) -> Expr { + super::to_timestamp().call(args) + } + + #[doc = "converts a string and optional formats to a `Timestamp(Seconds, None)`"] + pub fn to_timestamp_seconds(args: Vec) -> Expr { + super::to_timestamp_seconds().call(args) + } + + #[doc = "converts a string and optional formats to a `Timestamp(Milliseconds, None)`"] + pub fn to_timestamp_millis(args: Vec) -> Expr { + super::to_timestamp_millis().call(args) + } + + #[doc = "converts a string and optional formats to a `Timestamp(Microseconds, None)`"] + pub fn to_timestamp_micros(args: Vec) -> Expr { + super::to_timestamp_micros().call(args) + } + + #[doc = "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`"] + pub fn to_timestamp_nanos(args: Vec) -> Expr { + 
super::to_timestamp_nanos().call(args) + } +} + +/// Return a list of all functions in this package +pub fn functions() -> Vec> { + vec![ + to_date(), + to_timestamp(), + to_timestamp_seconds(), + to_timestamp_millis(), + to_timestamp_micros(), + to_timestamp_nanos(), + ] +} diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs new file mode 100644 index 000000000000..e1b842680ca4 --- /dev/null +++ b/datafusion/functions/src/datetime/to_date.rs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; + +use arrow::datatypes::DataType; +use arrow::datatypes::DataType::Date32; +use arrow_array::types::Date32Type; + +use crate::datetime::common::*; +use datafusion_common::{exec_err, internal_datafusion_err, Result}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct ToDateFunc { + signature: Signature, +} + +impl ToDateFunc { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + } + } + + fn to_date(&self, args: &[ColumnarValue]) -> Result { + match args.len() { + 1 => handle::( + args, + |s| { + string_to_timestamp_nanos_shim(s) + .map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000)) + .and_then(|v| { + v.try_into().map_err(|_| { + internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed") + }) + }) + }, + "to_date", + ), + n if n >= 2 => handle_multiple::( + args, + |s, format| { + string_to_timestamp_nanos_formatted(s, format) + .map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000)) + .and_then(|v| { + v.try_into().map_err(|_| { + internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed") + }) + }) + }, + |n| n, + "to_date", + ), + _ => exec_err!("Unsupported 0 argument count for function to_date"), + } + } +} + +impl ScalarUDFImpl for ToDateFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "to_date" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Date32) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.is_empty() { + return exec_err!("to_date function requires 1 or more arguments, got 0"); + } + + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_data_types(args, "to_date") { + return value; + } + } + + match args[0].data_type() { + DataType::Int32 + | DataType::Int64 + | 
DataType::Null + | DataType::Float64 + | DataType::Date32 + | DataType::Date64 => args[0].cast_to(&DataType::Date32, None), + DataType::Utf8 => self.to_date(args), + other => { + exec_err!("Unsupported data type {:?} for function to_date", other) + } + } + } +} diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs new file mode 100644 index 000000000000..adba84af72ae --- /dev/null +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ -0,0 +1,811 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; + +use arrow::datatypes::DataType::Timestamp; +use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; +use arrow::datatypes::{ + ArrowTimestampType, DataType, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, +}; + +use datafusion_common::{exec_err, Result, ScalarType}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::datetime::common::*; + +#[derive(Debug)] +pub(super) struct ToTimestampFunc { + signature: Signature, +} + +#[derive(Debug)] +pub(super) struct ToTimestampSecondsFunc { + signature: Signature, +} + +#[derive(Debug)] +pub(super) struct ToTimestampMillisFunc { + signature: Signature, +} + +#[derive(Debug)] +pub(super) struct ToTimestampMicrosFunc { + signature: Signature, +} + +#[derive(Debug)] +pub(super) struct ToTimestampNanosFunc { + signature: Signature, +} + +impl ToTimestampFunc { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + } + } +} + +impl ToTimestampSecondsFunc { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + } + } +} + +impl ToTimestampMillisFunc { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + } + } +} + +impl ToTimestampMicrosFunc { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + } + } +} + +impl ToTimestampNanosFunc { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + } + } +} + +/// to_timestamp SQL function +/// +/// Note: `to_timestamp` returns `Timestamp(Nanosecond)` though its arguments are interpreted as **seconds**. +/// The supported range for integer input is between `-9223372037` and `9223372036`. +/// Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. 
+/// Please use `to_timestamp_seconds` for the input outside of supported bounds. +impl ScalarUDFImpl for ToTimestampFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "to_timestamp" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Timestamp(Nanosecond, None)) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.is_empty() { + return exec_err!( + "to_timestamp function requires 1 or more arguments, got {}", + args.len() + ); + } + + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_data_types(args, "to_timestamp") { + return value; + } + } + + match args[0].data_type() { + DataType::Int32 | DataType::Int64 => args[0] + .cast_to(&Timestamp(Second, None), None)? + .cast_to(&Timestamp(Nanosecond, None), None), + DataType::Null | DataType::Float64 | Timestamp(_, None) => { + args[0].cast_to(&Timestamp(Nanosecond, None), None) + } + DataType::Utf8 => { + to_timestamp_impl::(args, "to_timestamp") + } + other => { + exec_err!( + "Unsupported data type {:?} for function to_timestamp", + other + ) + } + } + } +} + +impl ScalarUDFImpl for ToTimestampSecondsFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "to_timestamp_seconds" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Timestamp(Second, None)) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.is_empty() { + return exec_err!( + "to_timestamp_seconds function requires 1 or more arguments, got {}", + args.len() + ); + } + + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_data_types(args, "to_timestamp_seconds") { + return value; + } + } + + match args[0].data_type() { + DataType::Null | DataType::Int32 | DataType::Int64 | Timestamp(_, 
None) => { + args[0].cast_to(&Timestamp(Second, None), None) + } + DataType::Utf8 => { + to_timestamp_impl::(args, "to_timestamp_seconds") + } + other => { + exec_err!( + "Unsupported data type {:?} for function to_timestamp_seconds", + other + ) + } + } + } +} + +impl ScalarUDFImpl for ToTimestampMillisFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "to_timestamp_millis" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Timestamp(Millisecond, None)) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.is_empty() { + return exec_err!( + "to_timestamp_millis function requires 1 or more arguments, got {}", + args.len() + ); + } + + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_data_types(args, "to_timestamp_millis") { + return value; + } + } + + match args[0].data_type() { + DataType::Null | DataType::Int32 | DataType::Int64 | Timestamp(_, None) => { + args[0].cast_to(&Timestamp(Millisecond, None), None) + } + DataType::Utf8 => { + to_timestamp_impl::(args, "to_timestamp_millis") + } + other => { + exec_err!( + "Unsupported data type {:?} for function to_timestamp_millis", + other + ) + } + } + } +} + +impl ScalarUDFImpl for ToTimestampMicrosFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "to_timestamp_micros" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Timestamp(Microsecond, None)) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.is_empty() { + return exec_err!( + "to_timestamp_micros function requires 1 or more arguments, got {}", + args.len() + ); + } + + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_data_types(args, "to_timestamp_micros") { + return value; + } + 
} + + match args[0].data_type() { + DataType::Null | DataType::Int32 | DataType::Int64 | Timestamp(_, None) => { + args[0].cast_to(&Timestamp(Microsecond, None), None) + } + DataType::Utf8 => { + to_timestamp_impl::(args, "to_timestamp_micros") + } + other => { + exec_err!( + "Unsupported data type {:?} for function to_timestamp_micros", + other + ) + } + } + } +} + +impl ScalarUDFImpl for ToTimestampNanosFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "to_timestamp_nanos" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Timestamp(Nanosecond, None)) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.is_empty() { + return exec_err!( + "to_timestamp_nanos function requires 1 or more arguments, got {}", + args.len() + ); + } + + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_data_types(args, "to_timestamp_nanos") { + return value; + } + } + + match args[0].data_type() { + DataType::Null | DataType::Int32 | DataType::Int64 | Timestamp(_, None) => { + args[0].cast_to(&Timestamp(Nanosecond, None), None) + } + DataType::Utf8 => { + to_timestamp_impl::(args, "to_timestamp_nanos") + } + other => { + exec_err!( + "Unsupported data type {:?} for function to_timestamp_nanos", + other + ) + } + } + } +} + +fn to_timestamp_impl>( + args: &[ColumnarValue], + name: &str, +) -> Result { + let factor = match T::UNIT { + Second => 1_000_000_000, + Millisecond => 1_000_000, + Microsecond => 1_000, + Nanosecond => 1, + }; + + match args.len() { + 1 => handle::( + args, + |s| string_to_timestamp_nanos_shim(s).map(|n| n / factor), + name, + ), + n if n >= 2 => handle_multiple::( + args, + string_to_timestamp_nanos_formatted, + |n| n / factor, + name, + ), + _ => exec_err!("Unsupported 0 argument count for function {name}"), + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use 
arrow::array::{ArrayRef, Int64Array, StringBuilder}; + use arrow::datatypes::TimeUnit; + use arrow_array::types::Int64Type; + use arrow_array::{ + Array, PrimitiveArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, + }; + use chrono::Utc; + + use datafusion_common::{assert_contains, DataFusionError, ScalarValue}; + use datafusion_expr::ScalarFunctionImplementation; + + use crate::datetime::common::string_to_datetime_formatted; + + use super::*; + + fn to_timestamp(args: &[ColumnarValue]) -> Result { + to_timestamp_impl::(args, "to_timestamp") + } + + /// to_timestamp_millis SQL function + fn to_timestamp_millis(args: &[ColumnarValue]) -> Result { + to_timestamp_impl::(args, "to_timestamp_millis") + } + + /// to_timestamp_micros SQL function + fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { + to_timestamp_impl::(args, "to_timestamp_micros") + } + + /// to_timestamp_nanos SQL function + fn to_timestamp_nanos(args: &[ColumnarValue]) -> Result { + to_timestamp_impl::(args, "to_timestamp_nanos") + } + + /// to_timestamp_seconds SQL function + fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result { + to_timestamp_impl::(args, "to_timestamp_seconds") + } + + #[test] + fn to_timestamp_arrays_and_nulls() -> Result<()> { + // ensure that arrow array implementation is wired up and handles nulls correctly + + let mut string_builder = StringBuilder::with_capacity(2, 1024); + let mut ts_builder = TimestampNanosecondArray::builder(2); + + string_builder.append_value("2020-09-08T13:42:29.190855"); + ts_builder.append_value(1599572549190855000); + + string_builder.append_null(); + ts_builder.append_null(); + let expected_timestamps = &ts_builder.finish() as &dyn Array; + + let string_array = + ColumnarValue::Array(Arc::new(string_builder.finish()) as ArrayRef); + let parsed_timestamps = to_timestamp(&[string_array]) + .expect("that to_timestamp parsed values without error"); + if let 
ColumnarValue::Array(parsed_array) = parsed_timestamps { + assert_eq!(parsed_array.len(), 2); + assert_eq!(expected_timestamps, parsed_array.as_ref()); + } else { + panic!("Expected a columnar array") + } + Ok(()) + } + + #[test] + fn to_timestamp_with_formats_arrays_and_nulls() -> Result<()> { + // ensure that arrow array implementation is wired up and handles nulls correctly + + let mut date_string_builder = StringBuilder::with_capacity(2, 1024); + let mut format1_builder = StringBuilder::with_capacity(2, 1024); + let mut format2_builder = StringBuilder::with_capacity(2, 1024); + let mut format3_builder = StringBuilder::with_capacity(2, 1024); + let mut ts_builder = TimestampNanosecondArray::builder(2); + + date_string_builder.append_null(); + format1_builder.append_null(); + format2_builder.append_null(); + format3_builder.append_null(); + ts_builder.append_null(); + + date_string_builder.append_value("2020-09-08T13:42:29.19085Z"); + format1_builder.append_value("%s"); + format2_builder.append_value("%c"); + format3_builder.append_value("%+"); + ts_builder.append_value(1599572549190850000); + + let expected_timestamps = &ts_builder.finish() as &dyn Array; + + let string_array = [ + ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), + ]; + let parsed_timestamps = to_timestamp(&string_array) + .expect("that to_timestamp with format args parsed values without error"); + if let ColumnarValue::Array(parsed_array) = parsed_timestamps { + assert_eq!(parsed_array.len(), 2); + assert_eq!(expected_timestamps, parsed_array.as_ref()); + } else { + panic!("Expected a columnar array") + } + Ok(()) + } + + #[test] + fn to_timestamp_invalid_input_type() -> Result<()> { + // pass the wrong type of input array to to_timestamp and test + // that 
we get an error. + + let mut builder = Int64Array::builder(1); + builder.append_value(1); + let int64array = ColumnarValue::Array(Arc::new(builder.finish())); + + let expected_err = + "Execution error: Unsupported data type Int64 for function to_timestamp"; + match to_timestamp(&[int64array]) { + Ok(_) => panic!("Expected error but got success"), + Err(e) => { + assert!( + e.to_string().contains(expected_err), + "Can not find expected error '{expected_err}'. Actual error '{e}'" + ); + } + } + Ok(()) + } + + #[test] + fn to_timestamp_with_formats_invalid_input_type() -> Result<()> { + // pass the wrong type of input array to to_timestamp and test + // that we get an error. + + let mut builder = Int64Array::builder(1); + builder.append_value(1); + let int64array = [ + ColumnarValue::Array(Arc::new(builder.finish())), + ColumnarValue::Array(Arc::new(builder.finish())), + ]; + + let expected_err = + "Execution error: Unsupported data type Int64 for function to_timestamp"; + match to_timestamp(&int64array) { + Ok(_) => panic!("Expected error but got success"), + Err(e) => { + assert!( + e.to_string().contains(expected_err), + "Can not find expected error '{expected_err}'. Actual error '{e}'" + ); + } + } + Ok(()) + } + + #[test] + fn to_timestamp_with_unparseable_data() -> Result<()> { + let mut date_string_builder = StringBuilder::with_capacity(2, 1024); + + date_string_builder.append_null(); + + date_string_builder.append_value("2020-09-08 - 13:42:29.19085Z"); + + let string_array = + ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef); + + let expected_err = + "Arrow error: Parser error: Error parsing timestamp from '2020-09-08 - 13:42:29.19085Z': error parsing time"; + match to_timestamp(&[string_array]) { + Ok(_) => panic!("Expected error but got success"), + Err(e) => { + assert!( + e.to_string().contains(expected_err), + "Can not find expected error '{expected_err}'. 
Actual error '{e}'" + ); + } + } + Ok(()) + } + + #[test] + fn to_timestamp_with_no_matching_formats() -> Result<()> { + let mut date_string_builder = StringBuilder::with_capacity(2, 1024); + let mut format1_builder = StringBuilder::with_capacity(2, 1024); + let mut format2_builder = StringBuilder::with_capacity(2, 1024); + let mut format3_builder = StringBuilder::with_capacity(2, 1024); + + date_string_builder.append_null(); + format1_builder.append_null(); + format2_builder.append_null(); + format3_builder.append_null(); + + date_string_builder.append_value("2020-09-08T13:42:29.19085Z"); + format1_builder.append_value("%s"); + format2_builder.append_value("%c"); + format3_builder.append_value("%H:%M:%S"); + + let string_array = [ + ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), + ]; + + let expected_err = + "Execution error: Error parsing timestamp from '2020-09-08T13:42:29.19085Z' using format '%H:%M:%S': input contains invalid characters"; + match to_timestamp(&string_array) { + Ok(_) => panic!("Expected error but got success"), + Err(e) => { + assert!( + e.to_string().contains(expected_err), + "Can not find expected error '{expected_err}'. 
Actual error '{e}'" + ); + } + } + Ok(()) + } + + #[test] + fn string_to_timestamp_formatted() { + // Explicit timezone + assert_eq!( + 1599572549190855000, + parse_timestamp_formatted("2020-09-08T13:42:29.190855+00:00", "%+").unwrap() + ); + assert_eq!( + 1599572549190855000, + parse_timestamp_formatted("2020-09-08T13:42:29.190855Z", "%+").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp_formatted("2020-09-08T13:42:29Z", "%+").unwrap() + ); // no fractional part + assert_eq!( + 1599590549190855000, + parse_timestamp_formatted("2020-09-08T13:42:29.190855-05:00", "%+").unwrap() + ); + assert_eq!( + 1599590549000000000, + parse_timestamp_formatted("1599590549", "%s").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp_formatted("09-08-2020 13/42/29", "%m-%d-%Y %H/%M/%S") + .unwrap() + ); + } + + fn parse_timestamp_formatted(s: &str, format: &str) -> Result { + let result = string_to_timestamp_nanos_formatted(s, format); + if let Err(e) = &result { + eprintln!("Error parsing timestamp '{s}' using format '{format}': {e:?}"); + } + result + } + + #[test] + fn string_to_timestamp_formatted_invalid() { + // Test parsing invalid formats + let cases = [ + ("", "%Y%m%d %H%M%S", "premature end of input"), + ("SS", "%c", "premature end of input"), + ("Wed, 18 Feb 2015 23:16:09 GMT", "", "trailing input"), + ( + "Wed, 18 Feb 2015 23:16:09 GMT", + "%XX", + "input contains invalid characters", + ), + ( + "Wed, 18 Feb 2015 23:16:09 GMT", + "%Y%m%d %H%M%S", + "input contains invalid characters", + ), + ]; + + for (s, f, ctx) in cases { + let expected = format!("Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}"); + let actual = string_to_datetime_formatted(&Utc, s, f) + .unwrap_err() + .to_string(); + assert_eq!(actual, expected) + } + } + + #[test] + fn string_to_timestamp_invalid_arguments() { + // Test parsing invalid formats + let cases = [ + ("", "%Y%m%d %H%M%S", "premature end of input"), + ("SS", "%c", 
"premature end of input"), + ("Wed, 18 Feb 2015 23:16:09 GMT", "", "trailing input"), + ( + "Wed, 18 Feb 2015 23:16:09 GMT", + "%XX", + "input contains invalid characters", + ), + ( + "Wed, 18 Feb 2015 23:16:09 GMT", + "%Y%m%d %H%M%S", + "input contains invalid characters", + ), + ]; + + for (s, f, ctx) in cases { + let expected = format!("Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}"); + let actual = string_to_datetime_formatted(&Utc, s, f) + .unwrap_err() + .to_string(); + assert_eq!(actual, expected) + } + } + + #[test] + fn test_to_timestamp_arg_validation() { + let mut date_string_builder = StringBuilder::with_capacity(2, 1024); + date_string_builder.append_value("2020-09-08T13:42:29.19085Z"); + + let data = date_string_builder.finish(); + + let funcs: Vec<(ScalarFunctionImplementation, TimeUnit)> = vec![ + (Arc::new(to_timestamp), Nanosecond), + (Arc::new(to_timestamp_micros), Microsecond), + (Arc::new(to_timestamp_millis), Millisecond), + (Arc::new(to_timestamp_nanos), Nanosecond), + (Arc::new(to_timestamp_seconds), Second), + ]; + + let mut nanos_builder = TimestampNanosecondArray::builder(2); + let mut millis_builder = TimestampMillisecondArray::builder(2); + let mut micros_builder = TimestampMicrosecondArray::builder(2); + let mut sec_builder = TimestampSecondArray::builder(2); + + nanos_builder.append_value(1599572549190850000); + millis_builder.append_value(1599572549190); + micros_builder.append_value(1599572549190850); + sec_builder.append_value(1599572549); + + let nanos_expected_timestamps = &nanos_builder.finish() as &dyn Array; + let millis_expected_timestamps = &millis_builder.finish() as &dyn Array; + let micros_expected_timestamps = µs_builder.finish() as &dyn Array; + let sec_expected_timestamps = &sec_builder.finish() as &dyn Array; + + for (func, time_unit) in funcs { + // test UTF8 + let string_array = [ + ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), + 
ColumnarValue::Scalar(ScalarValue::Utf8(Some("%s".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("%c".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("%+".to_string()))), + ]; + let parsed_timestamps = func(&string_array) + .expect("that to_timestamp with format args parsed values without error"); + if let ColumnarValue::Array(parsed_array) = parsed_timestamps { + assert_eq!(parsed_array.len(), 1); + match time_unit { + Nanosecond => { + assert_eq!(nanos_expected_timestamps, parsed_array.as_ref()) + } + Millisecond => { + assert_eq!(millis_expected_timestamps, parsed_array.as_ref()) + } + Microsecond => { + assert_eq!(micros_expected_timestamps, parsed_array.as_ref()) + } + Second => { + assert_eq!(sec_expected_timestamps, parsed_array.as_ref()) + } + }; + } else { + panic!("Expected a columnar array") + } + + // test LargeUTF8 + let string_array = [ + ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("%s".to_string()))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("%c".to_string()))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("%+".to_string()))), + ]; + let parsed_timestamps = func(&string_array) + .expect("that to_timestamp with format args parsed values without error"); + if let ColumnarValue::Array(parsed_array) = parsed_timestamps { + assert_eq!(parsed_array.len(), 1); + match time_unit { + Nanosecond => { + assert_eq!(nanos_expected_timestamps, parsed_array.as_ref()) + } + Millisecond => { + assert_eq!(millis_expected_timestamps, parsed_array.as_ref()) + } + Microsecond => { + assert_eq!(micros_expected_timestamps, parsed_array.as_ref()) + } + Second => { + assert_eq!(sec_expected_timestamps, parsed_array.as_ref()) + } + }; + } else { + panic!("Expected a columnar array") + } + + // test other types + let string_array = [ + ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), + ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), + 
ColumnarValue::Scalar(ScalarValue::Int32(Some(2))), + ColumnarValue::Scalar(ScalarValue::Int32(Some(3))), + ]; + + let expected = "Unsupported data type Int32 for function".to_string(); + let actual = func(&string_array).unwrap_err().to_string(); + assert_contains!(actual, expected); + + // test other types + let string_array = [ + ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), + ColumnarValue::Array(Arc::new(PrimitiveArray::::new( + vec![1i64].into(), + None, + )) as ArrayRef), + ]; + + let expected = "Unsupported data type".to_string(); + let actual = func(&string_array).unwrap_err().to_string(); + assert_contains!(actual, expected); + } + } +} diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index d789287fcc45..1d48dcadbebf 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -90,6 +90,13 @@ pub mod macros; pub mod core; make_stub_package!(core, "core_expressions"); +/// Date and time expressions. +/// Contains functions such as to_timestamp +/// Enabled via feature flag `datetime_expressions` +#[cfg(feature = "datetime_expressions")] +pub mod datetime; +make_stub_package!(datetime, "datetime_expressions"); + /// Encoding expressions. /// Contains Hex and binary `encode` and `decode` functions. 
/// Enabled via feature flag `encoding_expressions` @@ -113,6 +120,8 @@ make_stub_package!(regex, "regex_expressions"); pub mod expr_fn { #[cfg(feature = "core_expressions")] pub use super::core::expr_fn::*; + #[cfg(feature = "datetime_expressions")] + pub use super::datetime::expr_fn::*; #[cfg(feature = "encoding_expressions")] pub use super::encoding::expr_fn::*; #[cfg(feature = "math_expressions")] @@ -125,6 +134,7 @@ pub mod expr_fn { pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { let mut all_functions = core::functions() .into_iter() + .chain(datetime::functions()) .chain(encoding::functions()) .chain(math::functions()) .chain(regex::functions()); diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index ab62cf8646e8..9175ccb46859 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -19,18 +19,12 @@ use std::ops::Not; -use super::inlist_simplifier::{InListSimplifier, ShortenInListSimplifier}; -use super::utils::*; -use crate::analyzer::type_coercion::TypeCoercionRewriter; -use crate::simplify_expressions::guarantees::GuaranteeRewriter; -use crate::simplify_expressions::regex::simplify_regex_expr; -use crate::simplify_expressions::SimplifyInfo; - use arrow::{ array::{new_null_array, AsArray}, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; + use datafusion_common::{ cast::{as_large_list_array, as_list_array}, tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter}, @@ -45,6 +39,14 @@ use datafusion_expr::{ use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionProps}; +use crate::analyzer::type_coercion::TypeCoercionRewriter; +use crate::simplify_expressions::guarantees::GuaranteeRewriter; +use 
crate::simplify_expressions::regex::simplify_regex_expr; +use crate::simplify_expressions::SimplifyInfo; + +use super::inlist_simplifier::{InListSimplifier, ShortenInListSimplifier}; +use super::utils::*; + /// This structure handles API for expression simplification pub struct ExprSimplifier { info: S, @@ -1331,22 +1333,15 @@ mod tests { sync::Arc, }; - use super::*; - use crate::simplify_expressions::{ - utils::for_test::{cast_to_int64_expr, now_expr, to_timestamp_expr}, - SimplifyContext, - }; - use crate::test::test_table_scan_with_name; - - use arrow::{ - array::{ArrayRef, Int32Array}, - datatypes::{DataType, Field, Schema}, - }; - use datafusion_common::{assert_contains, cast::as_int32_array, DFField, ToDFSchema}; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::{assert_contains, DFField, ToDFSchema}; use datafusion_expr::{interval_arithmetic::Interval, *}; use datafusion_physical_expr::execution_props::ExecutionProps; - use chrono::{DateTime, TimeZone, Utc}; + use crate::simplify_expressions::SimplifyContext; + use crate::test::test_table_scan_with_name; + + use super::*; // ------------------------------ // --- ExprSimplifier tests ----- @@ -1427,187 +1422,6 @@ mod tests { assert_eq!(expected, simplifier.simplify(expr).unwrap()); } - // ------------------------------ - // --- ConstEvaluator tests ----- - // ------------------------------ - fn test_evaluate_with_start_time( - input_expr: Expr, - expected_expr: Expr, - date_time: &DateTime, - ) { - let execution_props = - ExecutionProps::new().with_query_execution_start_time(*date_time); - - let mut const_evaluator = ConstEvaluator::try_new(&execution_props).unwrap(); - let evaluated_expr = input_expr - .clone() - .rewrite(&mut const_evaluator) - .expect("successfully evaluated"); - - assert_eq!( - evaluated_expr, expected_expr, - "Mismatch evaluating {input_expr}\n Expected:{expected_expr}\n Got:{evaluated_expr}" - ); - } - - fn test_evaluate(input_expr: Expr, expected_expr: Expr) 
{ - test_evaluate_with_start_time(input_expr, expected_expr, &Utc::now()) - } - - // Make a UDF that adds its two values together, with the specified volatility - fn make_udf_add(volatility: Volatility) -> Arc { - let input_types = vec![DataType::Int32, DataType::Int32]; - let return_type = Arc::new(DataType::Int32); - - let fun = Arc::new(|args: &[ColumnarValue]| { - let args = ColumnarValue::values_to_arrays(args)?; - - let arg0 = as_int32_array(&args[0])?; - let arg1 = as_int32_array(&args[1])?; - - // 2. perform the computation - let array = arg0 - .iter() - .zip(arg1.iter()) - .map(|args| { - if let (Some(arg0), Some(arg1)) = args { - Some(arg0 + arg1) - } else { - // one or both args were Null - None - } - }) - .collect::(); - - Ok(ColumnarValue::from(Arc::new(array) as ArrayRef)) - }); - - Arc::new(create_udf( - "udf_add", - input_types, - return_type, - volatility, - fun, - )) - } - - #[test] - fn test_const_evaluator() { - // true --> true - test_evaluate(lit(true), lit(true)); - // true or true --> true - test_evaluate(lit(true).or(lit(true)), lit(true)); - // true or false --> true - test_evaluate(lit(true).or(lit(false)), lit(true)); - - // "foo" == "foo" --> true - test_evaluate(lit("foo").eq(lit("foo")), lit(true)); - // "foo" != "foo" --> false - test_evaluate(lit("foo").not_eq(lit("foo")), lit(false)); - - // c = 1 --> c = 1 - test_evaluate(col("c").eq(lit(1)), col("c").eq(lit(1))); - // c = 1 + 2 --> c + 3 - test_evaluate(col("c").eq(lit(1) + lit(2)), col("c").eq(lit(3))); - // (foo != foo) OR (c = 1) --> false OR (c = 1) - test_evaluate( - (lit("foo").not_eq(lit("foo"))).or(col("c").eq(lit(1))), - lit(false).or(col("c").eq(lit(1))), - ); - } - - #[test] - fn test_const_evaluator_scalar_functions() { - // concat("foo", "bar") --> "foobar" - let expr = call_fn("concat", vec![lit("foo"), lit("bar")]).unwrap(); - test_evaluate(expr, lit("foobar")); - - // ensure arguments are also constant folded - // concat("foo", concat("bar", "baz")) --> 
"foobarbaz" - let concat1 = call_fn("concat", vec![lit("bar"), lit("baz")]).unwrap(); - let expr = call_fn("concat", vec![lit("foo"), concat1]).unwrap(); - test_evaluate(expr, lit("foobarbaz")); - - // Check non string arguments - // to_timestamp("2020-09-08T12:00:00+00:00") --> timestamp(1599566400i64) - let expr = - call_fn("to_timestamp", vec![lit("2020-09-08T12:00:00+00:00")]).unwrap(); - test_evaluate(expr, lit_timestamp_nano(1599566400000000000i64)); - - // check that non foldable arguments are folded - // to_timestamp(a) --> to_timestamp(a) [no rewrite possible] - let expr = call_fn("to_timestamp", vec![col("a")]).unwrap(); - test_evaluate(expr.clone(), expr); - - // volatile / stable functions should not be evaluated - // rand() + (1 + 2) --> rand() + 3 - let fun = BuiltinScalarFunction::Random; - assert_eq!(fun.volatility(), Volatility::Volatile); - let rand = Expr::ScalarFunction(ScalarFunction::new(fun, vec![])); - let expr = rand.clone() + (lit(1) + lit(2)); - let expected = rand + lit(3); - test_evaluate(expr, expected); - - // parenthesization matters: can't rewrite - // (rand() + 1) + 2 --> (rand() + 1) + 2) - let fun = BuiltinScalarFunction::Random; - let rand = Expr::ScalarFunction(ScalarFunction::new(fun, vec![])); - let expr = (rand + lit(1)) + lit(2); - test_evaluate(expr.clone(), expr); - } - - #[test] - fn test_const_evaluator_now() { - let ts_nanos = 1599566400000000000i64; - let time = chrono::Utc.timestamp_nanos(ts_nanos); - let ts_string = "2020-09-08T12:05:00+00:00"; - // now() --> ts - test_evaluate_with_start_time(now_expr(), lit_timestamp_nano(ts_nanos), &time); - - // CAST(now() as int64) + 100_i64 --> ts + 100_i64 - let expr = cast_to_int64_expr(now_expr()) + lit(100_i64); - test_evaluate_with_start_time(expr, lit(ts_nanos + 100), &time); - - // CAST(now() as int64) < cast(to_timestamp(...) 
as int64) + 50000_i64 ---> true - let expr = cast_to_int64_expr(now_expr()) - .lt(cast_to_int64_expr(to_timestamp_expr(ts_string)) + lit(50000i64)); - test_evaluate_with_start_time(expr, lit(true), &time); - } - - #[test] - fn test_evaluator_udfs() { - let args = vec![lit(1) + lit(2), lit(30) + lit(40)]; - let folded_args = vec![lit(3), lit(70)]; - - // immutable UDF should get folded - // udf_add(1+2, 30+40) --> 73 - let expr = Expr::ScalarFunction(expr::ScalarFunction::new_udf( - make_udf_add(Volatility::Immutable), - args.clone(), - )); - test_evaluate(expr, lit(73)); - - // stable UDF should be entirely folded - // udf_add(1+2, 30+40) --> 73 - let fun = make_udf_add(Volatility::Stable); - let expr = Expr::ScalarFunction(expr::ScalarFunction::new_udf( - Arc::clone(&fun), - args.clone(), - )); - test_evaluate(expr, lit(73)); - - // volatile UDF should have args folded - // udf_add(1+2, 30+40) --> udf_add(3, 70) - let fun = make_udf_add(Volatility::Volatile); - let expr = - Expr::ScalarFunction(expr::ScalarFunction::new_udf(Arc::clone(&fun), args)); - let expected_expr = Expr::ScalarFunction(expr::ScalarFunction::new_udf( - Arc::clone(&fun), - folded_args, - )); - test_evaluate(expr, expected_expr); - } - // ------------------------------ // --- Simplifier tests ----- // ------------------------------ diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index f36cd8f838fb..ddb754a919bd 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -19,13 +19,15 @@ use std::sync::Arc; -use super::{ExprSimplifier, SimplifyContext}; -use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{DFSchema, DFSchemaRef, Result}; use datafusion_expr::logical_plan::LogicalPlan; use datafusion_expr::utils::merge_schema; use datafusion_physical_expr::execution_props::ExecutionProps; +use 
crate::{OptimizerConfig, OptimizerRule}; + +use super::{ExprSimplifier, SimplifyContext}; + /// Optimizer Pass that simplifies [`LogicalPlan`]s by rewriting /// [`Expr`]`s evaluating constants and applying algebraic /// simplifications @@ -132,24 +134,22 @@ impl SimplifyExpressions { mod tests { use std::ops::Not; - use crate::simplify_expressions::utils::for_test::{ - cast_to_int64_expr, now_expr, to_timestamp_expr, - }; - use crate::test::{assert_fields_eq, test_table_scan_with_name}; - - use super::*; use arrow::datatypes::{DataType, Field, Schema}; - use chrono::{DateTime, TimeZone, Utc}; - use datafusion_common::ScalarValue; - use datafusion_expr::logical_plan::builder::table_scan_with_filters; - use datafusion_expr::{call_fn, or, BinaryExpr, Cast, Operator}; + use chrono::{DateTime, Utc}; - use crate::OptimizerContext; + use datafusion_expr::logical_plan::builder::table_scan_with_filters; use datafusion_expr::logical_plan::table_scan; use datafusion_expr::{ and, binary_expr, col, lit, logical_plan::builder::LogicalPlanBuilder, Expr, ExprSchemable, JoinType, }; + use datafusion_expr::{call_fn, or, BinaryExpr, Cast, Operator}; + + use crate::simplify_expressions::utils::for_test::now_expr; + use crate::test::{assert_fields_eq, test_table_scan_with_name}; + use crate::OptimizerContext; + + use super::*; fn test_table_scan() -> LogicalPlan { let schema = Schema::new(vec![ @@ -430,23 +430,6 @@ mod tests { format!("{optimized_plan:?}") } - #[test] - fn to_timestamp_expr_folded() -> Result<()> { - let table_scan = test_table_scan(); - let proj = vec![to_timestamp_expr("2020-09-08T12:00:00+00:00")]; - - let plan = LogicalPlanBuilder::from(table_scan) - .project(proj)? 
- .build()?; - - let expected = "Projection: TimestampNanosecond(1599566400000000000, None) AS to_timestamp(Utf8(\"2020-09-08T12:00:00+00:00\"))\ - \n TableScan: test" - .to_string(); - let actual = get_optimized_plan_formatted(&plan, &Utc::now()); - assert_eq!(expected, actual); - Ok(()) - } - #[test] fn cast_expr() -> Result<()> { let table_scan = test_table_scan(); @@ -505,59 +488,6 @@ mod tests { Ok(()) } - #[test] - fn now_less_than_timestamp() -> Result<()> { - let table_scan = test_table_scan(); - - let ts_string = "2020-09-08T12:05:00+00:00"; - let time = Utc.timestamp_nanos(1599566400000000000i64); - - // cast(now() as int) < cast(to_timestamp(...) as int) + 50000_i64 - let plan = - LogicalPlanBuilder::from(table_scan) - .filter(cast_to_int64_expr(now_expr()).lt(cast_to_int64_expr( - to_timestamp_expr(ts_string), - ) + lit(50000_i64)))? - .build()?; - - // Note that constant folder runs and folds the entire - // expression down to a single constant (true) - let expected = "Filter: Boolean(true)\ - \n TableScan: test"; - let actual = get_optimized_plan_formatted(&plan, &time); - - assert_eq!(expected, actual); - Ok(()) - } - - #[test] - fn select_date_plus_interval() -> Result<()> { - let table_scan = test_table_scan(); - - let ts_string = "2020-09-08T12:05:00+00:00"; - let time = Utc.timestamp_nanos(1599566400000000000i64); - - // now() < cast(to_timestamp(...) as int) + 5000000000 - let schema = table_scan.schema(); - - let date_plus_interval_expr = to_timestamp_expr(ts_string) - .cast_to(&DataType::Date32, schema)? - + Expr::Literal(ScalarValue::IntervalDayTime(Some(123i64 << 32))); - - let plan = LogicalPlanBuilder::from(table_scan.clone()) - .project(vec![date_plus_interval_expr])? 
- .build()?; - - // Note that constant folder runs and folds the entire - // expression down to a single constant (true) - let expected = r#"Projection: Date32("18636") AS to_timestamp(Utf8("2020-09-08T12:05:00+00:00")) + IntervalDayTime("528280977408") - TableScan: test"#; - let actual = get_optimized_plan_formatted(&plan, &time); - - assert_eq!(expected, actual); - Ok(()) - } - #[test] fn simplify_not_binary() -> Result<()> { let table_scan = test_table_scan(); diff --git a/datafusion/optimizer/src/simplify_expressions/utils.rs b/datafusion/optimizer/src/simplify_expressions/utils.rs index c9736061df90..4d3b123bace0 100644 --- a/datafusion/optimizer/src/simplify_expressions/utils.rs +++ b/datafusion/optimizer/src/simplify_expressions/utils.rs @@ -534,18 +534,9 @@ pub fn simpl_concat_ws(delimiter: &Expr, args: &[Expr]) -> Result { #[cfg(test)] pub mod for_test { - use arrow::datatypes::DataType; - use datafusion_expr::{call_fn, lit, Cast, Expr}; + use datafusion_expr::{call_fn, Expr}; pub fn now_expr() -> Expr { call_fn("now", vec![]).unwrap() } - - pub fn cast_to_int64_expr(expr: Expr) -> Expr { - Expr::Cast(Cast::new(expr.into(), DataType::Int64)) - } - - pub fn to_timestamp_expr(arg: impl Into) -> Expr { - call_fn("to_timestamp", vec![lit(arg.into())]).unwrap() - } } diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index d48361ed15a9..0ff7bd595c5b 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -90,10 +90,6 @@ name = "in_list" harness = false name = "make_date" -[[bench]] -harness = false -name = "to_timestamp" - [[bench]] harness = false name = "regexp" diff --git a/datafusion/physical-expr/benches/to_timestamp.rs b/datafusion/physical-expr/benches/to_timestamp.rs deleted file mode 100644 index 1934f69ef11c..000000000000 --- a/datafusion/physical-expr/benches/to_timestamp.rs +++ /dev/null @@ -1,125 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or 
more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -extern crate criterion; - -use std::sync::Arc; - -use arrow_array::builder::StringBuilder; -use arrow_array::ArrayRef; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; - -use datafusion_expr::ColumnarValue; -use datafusion_physical_expr::datetime_expressions::to_timestamp; - -fn criterion_benchmark(c: &mut Criterion) { - c.bench_function("to_timestamp_no_formats", |b| { - let mut inputs = StringBuilder::new(); - inputs.append_value("1997-01-31T09:26:56.123Z"); - inputs.append_value("1997-01-31T09:26:56.123-05:00"); - inputs.append_value("1997-01-31 09:26:56.123-05:00"); - inputs.append_value("2023-01-01 04:05:06.789 -08"); - inputs.append_value("1997-01-31T09:26:56.123"); - inputs.append_value("1997-01-31 09:26:56.123"); - inputs.append_value("1997-01-31 09:26:56"); - inputs.append_value("1997-01-31 13:26:56"); - inputs.append_value("1997-01-31 13:26:56+04:00"); - inputs.append_value("1997-01-31"); - - let string_array = ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef); - - b.iter(|| { - black_box( - to_timestamp(&[string_array.clone()]) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_with_formats", |b| { - let mut inputs = StringBuilder::new(); 
- let mut format1_builder = StringBuilder::with_capacity(2, 10); - let mut format2_builder = StringBuilder::with_capacity(2, 10); - let mut format3_builder = StringBuilder::with_capacity(2, 10); - - inputs.append_value("1997-01-31T09:26:56.123Z"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z"); - - inputs.append_value("1997-01-31T09:26:56.123-05:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z"); - - inputs.append_value("1997-01-31 09:26:56.123-05:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z"); - - inputs.append_value("2023-01-01 04:05:06.789 -08"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z"); - - inputs.append_value("1997-01-31T09:26:56.123"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f"); - - inputs.append_value("1997-01-31 09:26:56.123"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f"); - - inputs.append_value("1997-01-31 09:26:56"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S"); - - inputs.append_value("1997-01-31 092656"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H%M%S"); - - inputs.append_value("1997-01-31 092656+04:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H%M%S%:z"); - - inputs.append_value("Sun Jul 8 00:34:60 2001"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - 
format3_builder.append_value("%Y-%m-%d 00:00:00"); - - let args = [ - ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), - ]; - b.iter(|| { - black_box( - to_timestamp(&args.clone()) - .expect("to_timestamp should work on valid values"), - ) - }) - }); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index fd57485555c6..3b322ae2692f 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -24,12 +24,11 @@ use std::sync::Arc; use arrow::compute::cast; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; use arrow::{ - array::{Array, ArrayRef, Float64Array, OffsetSizeTrait, PrimitiveArray}, - compute::kernels::cast_utils::string_to_timestamp_nanos, + array::{Array, ArrayRef, Float64Array, PrimitiveArray}, datatypes::{ - ArrowNumericType, ArrowPrimitiveType, ArrowTemporalType, DataType, - IntervalDayTimeType, IntervalMonthDayNanoType, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, + ArrowNumericType, ArrowTemporalType, DataType, IntervalDayTimeType, + IntervalMonthDayNanoType, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, }, }; use arrow::{ @@ -42,497 +41,18 @@ use arrow_array::cast::AsArray; use arrow_array::temporal_conversions::NANOSECONDS; use arrow_array::timezone::Tz; use arrow_array::types::{ArrowTimestampType, Date32Type, Int32Type}; -use arrow_array::{GenericStringArray, StringArray}; +use arrow_array::StringArray; use chrono::prelude::*; -use chrono::LocalResult::Single; use 
chrono::{Duration, LocalResult, Months, NaiveDate}; -use itertools::Either; use datafusion_common::cast::{ - as_date32_array, as_date64_array, as_generic_string_array, as_primitive_array, - as_timestamp_microsecond_array, as_timestamp_millisecond_array, - as_timestamp_nanosecond_array, as_timestamp_second_array, -}; -use datafusion_common::{ - exec_err, internal_datafusion_err, not_impl_err, DataFusionError, Result, ScalarType, - ScalarValue, + as_date32_array, as_date64_array, as_primitive_array, as_timestamp_microsecond_array, + as_timestamp_millisecond_array, as_timestamp_nanosecond_array, + as_timestamp_second_array, }; +use datafusion_common::{exec_err, not_impl_err, DataFusionError, Result, ScalarValue}; use datafusion_expr::ColumnarValue; -use crate::expressions::cast_column; - -/// Error message if nanosecond conversion request beyond supported interval -const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; - -/// Accepts a string with a `chrono` format and converts it to a -/// nanosecond precision timestamp. -/// -/// See [`chrono::format::strftime`] for the full set of supported formats. -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. -/// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// ## Timestamp Precision -/// -/// Function uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// Any timestamp in the formatting string is handled according to the rules -/// defined by `chrono`. 
-/// -/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html -/// -#[inline] -pub(crate) fn string_to_timestamp_nanos_formatted( - s: &str, - format: &str, -) -> Result { - string_to_datetime_formatted(&Utc, s, format)? - .naive_utc() - .timestamp_nanos_opt() - .ok_or_else(|| { - DataFusionError::Execution(ERR_NANOSECONDS_NOT_SUPPORTED.to_string()) - }) -} - -/// Accepts a string and parses it using the [`chrono::format::strftime`] specifiers -/// relative to the provided `timezone` -/// -/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled -/// -/// * `2023-01-01 040506 America/Los_Angeles` -/// -/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error -/// will be returned -/// -/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html -/// [IANA timezones]: https://www.iana.org/time-zones -pub(crate) fn string_to_datetime_formatted( - timezone: &T, - s: &str, - format: &str, -) -> Result, DataFusionError> { - let err = |err_ctx: &str| { - DataFusionError::Execution(format!( - "Error parsing timestamp from '{s}' using format '{format}': {err_ctx}" - )) - }; - - // attempt to parse the string assuming it has a timezone - let dt = DateTime::parse_from_str(s, format); - - if let Err(e) = &dt { - // no timezone or other failure, try without a timezone - let ndt = NaiveDateTime::parse_from_str(s, format); - if let Err(e) = &ndt { - return Err(err(&e.to_string())); - } - - if let Single(e) = &timezone.from_local_datetime(&ndt.unwrap()) { - Ok(e.to_owned()) - } else { - Err(err(&e.to_string())) - } - } else { - Ok(dt.unwrap().with_timezone(timezone)) - } -} - -/// given a function `op` that maps a `&str` to a Result of an arrow native type, -/// returns a `PrimitiveArray` after the application -/// of the function to `args[0]`. 
-/// # Errors -/// This function errors iff: -/// * the number of arguments is not 1 or -/// * the first argument is not castable to a `GenericStringArray` or -/// * the function `op` errors -pub(crate) fn unary_string_to_primitive_function<'a, T, O, F>( - args: &[&'a dyn Array], - op: F, - name: &str, -) -> Result> -where - O: ArrowPrimitiveType, - T: OffsetSizeTrait, - F: Fn(&'a str) -> Result, -{ - if args.len() != 1 { - return exec_err!( - "{:?} args were supplied but {} takes exactly one argument", - args.len(), - name - ); - } - - let array = as_generic_string_array::(args[0])?; - - // first map is the iterator, second is for the `Option<_>` - array.iter().map(|x| x.map(&op).transpose()).collect() -} - -/// given a function `op` that maps `&str`, `&str` to the first successful Result -/// of an arrow native type, returns a `PrimitiveArray` after the application of the -/// function to `args` and the subsequence application of the `op2` function to any -/// successful result. This function calls the `op` function with the first and second -/// argument and if not successful continues with first and third, first and fourth, -/// etc until the result was successful or no more arguments are present. 
-/// # Errors -/// This function errors iff: -/// * the number of arguments is not > 1 or -/// * the array arguments are not castable to a `GenericStringArray` or -/// * the function `op` errors for all input -pub(crate) fn strings_to_primitive_function<'a, T, O, F, F2>( - args: &'a [ColumnarValue], - op: F, - op2: F2, - name: &str, -) -> Result> -where - O: ArrowPrimitiveType, - T: OffsetSizeTrait, - F: Fn(&'a str, &'a str) -> Result, - F2: Fn(O::Native) -> O::Native, -{ - if args.len() < 2 { - return exec_err!( - "{:?} args were supplied but {} takes 2 or more arguments", - args.len(), - name - ); - } - - // this will throw the error if any of the array args are not castable to GenericStringArray - let data = args - .iter() - .map(|a| match a { - ColumnarValue::Array(a) => { - Ok(Either::Left(as_generic_string_array::(a.as_ref())?)) - } - ColumnarValue::Scalar(s) => match s { - ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => Ok(Either::Right(a)), - other => exec_err!( - "Unexpected scalar type encountered '{other}' for function '{name}'" - ), - }, - }) - .collect::, &Option>>>>()?; - - let first_arg = &data.first().unwrap().left().unwrap(); - - first_arg - .iter() - .enumerate() - .map(|(pos, x)| { - let mut val = None; - - if let Some(x) = x { - let param_args = data.iter().skip(1); - - // go through the args and find the first successful result. Only the last - // failure will be returned if no successful result was received. 
- for param_arg in param_args { - // param_arg is an array, use the corresponding index into the array as the arg - // we're currently parsing - let p = *param_arg; - let r = if p.is_left() { - let p = p.left().unwrap(); - op(x, p.value(pos)) - } - // args is a scalar, use it directly - else if let Some(p) = p.right().unwrap() { - op(x, p.as_str()) - } else { - continue; - }; - - if r.is_ok() { - val = Some(Ok(op2(r.unwrap()))); - break; - } else { - val = Some(r); - } - } - }; - - val.transpose() - }) - .collect() -} - -// given an function that maps a `&str` to an arrow native type, -// returns a `ColumnarValue` where the function is applied to either a `ArrayRef` or `ScalarValue` -// depending on the `args`'s variant. -fn handle<'a, O, F, S>( - args: &'a [ColumnarValue], - op: F, - name: &str, -) -> Result -where - O: ArrowPrimitiveType, - S: ScalarType, - F: Fn(&'a str) -> Result, -{ - match &args[0] { - ColumnarValue::Array(a) => match a.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => Ok(ColumnarValue::Array(Arc::new( - unary_string_to_primitive_function::(&[a.as_ref()], op, name)?, - ))), - other => exec_err!("Unsupported data type {other:?} for function {name}"), - }, - ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => { - let result = a.as_ref().map(|x| (op)(x)).transpose()?; - Ok(ColumnarValue::Scalar(S::scalar(result))) - } - other => exec_err!("Unsupported data type {other:?} for function {name}"), - }, - } -} - -// given an function that maps a `&str`, `&str` to an arrow native type, -// returns a `ColumnarValue` where the function is applied to either a `ArrayRef` or `ScalarValue` -// depending on the `args`'s variant. 
-fn handle_multiple<'a, O, F, S, M>( - args: &'a [ColumnarValue], - op: F, - op2: M, - name: &str, -) -> Result -where - O: ArrowPrimitiveType, - S: ScalarType, - F: Fn(&'a str, &'a str) -> Result, - M: Fn(O::Native) -> O::Native, -{ - match &args[0] { - ColumnarValue::Array(a) => match a.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => { - // validate the column types - for (pos, arg) in args.iter().enumerate() { - match arg { - ColumnarValue::Array(arg) => match arg.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => { - // all good - }, - other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"), - }, - ColumnarValue::Scalar(arg) => { match arg.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => { - // all good - }, - other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"), - }} - } - } - - Ok(ColumnarValue::Array(Arc::new( - strings_to_primitive_function::(args, op, op2, name)?, - ))) - } - other => { - exec_err!("Unsupported data type {other:?} for function {name}") - } - }, - // if the first argument is a scalar utf8 all arguments are expected to be scalar utf8 - ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => { - let mut val: Option> = None; - let mut err: Option = None; - - match a { - Some(a) => { - // enumerate all the values finding the first one that returns an Ok result - for (pos, v) in args.iter().enumerate().skip(1) { - if let ColumnarValue::Scalar(s) = v { - if let ScalarValue::Utf8(x) | ScalarValue::LargeUtf8(x) = - s - { - if let Some(s) = x { - match op(a.as_str(), s.as_str()) { - Ok(r) => { - val = Some(Ok(ColumnarValue::Scalar( - S::scalar(Some(op2(r))), - ))); - break; - } - Err(e) => { - err = Some(e); - } - } - } - } else { - return exec_err!("Unsupported data type {s:?} for function {name}, arg # {pos}"); - } - } else { - return exec_err!("Unsupported data type {v:?} for function {name}, arg # 
{pos}"); - } - } - } - None => (), - } - - if let Some(v) = val { - v - } else { - Err(err.unwrap()) - } - } - other => { - exec_err!("Unsupported data type {other:?} for function {name}") - } - }, - } -} - -/// Calls string_to_timestamp_nanos and converts the error type -fn string_to_timestamp_nanos_shim(s: &str) -> Result { - string_to_timestamp_nanos(s).map_err(|e| e.into()) -} - -fn to_timestamp_impl>( - args: &[ColumnarValue], - name: &str, -) -> Result { - let factor = match T::UNIT { - TimeUnit::Second => 1_000_000_000, - TimeUnit::Millisecond => 1_000_000, - TimeUnit::Microsecond => 1_000, - TimeUnit::Nanosecond => 1, - }; - - match args.len() { - 1 => handle::( - args, - |s| string_to_timestamp_nanos_shim(s).map(|n| n / factor), - name, - ), - n if n >= 2 => handle_multiple::( - args, - string_to_timestamp_nanos_formatted, - |n| n / factor, - name, - ), - _ => exec_err!("Unsupported 0 argument count for function {name}"), - } -} - -/// # Examples -/// -/// ```ignore -/// # use std::sync::Arc; - -/// # use datafusion::arrow::array::StringArray; -/// # use datafusion::arrow::datatypes::{DataType, Field, Schema}; -/// # use datafusion::arrow::record_batch::RecordBatch; -/// # use datafusion::error::Result; -/// # use datafusion::prelude::*; - -/// # #[tokio::main] -/// # async fn main() -> Result<()> { -/// // define a schema. -/// let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); - -/// // define data. -/// let batch = RecordBatch::try_new( -/// schema, -/// vec![Arc::new(StringArray::from(vec![ -/// "2020-09-08T13:42:29Z", -/// "2020-09-08T13:42:29.190855-05:00", -/// "2020-08-09 12:13:29", -/// "2020-01-02", -/// ]))], -/// )?; - -/// // declare a new context. In spark API, this corresponds to a new spark SQLsession -/// let ctx = SessionContext::new(); - -/// // declare a table in memory. In spark API, this corresponds to createDataFrame(...). 
-/// ctx.register_batch("t", batch)?; -/// let df = ctx.table("t").await?; - -/// // use to_date function to convert col 'a' to timestamp type using the default parsing -/// let df = df.with_column("a", to_date(vec![col("a")]))?; - -/// let df = df.select_columns(&["a"])?; - -/// // print the results -/// df.show().await?; - -/// # Ok(()) -/// # } -/// ``` -pub fn to_date(args: &[ColumnarValue]) -> Result { - match args.len() { - 1 => handle::( - args, - |s| { - string_to_timestamp_nanos_shim(s) - .map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000)) - .and_then(|v| { - v.try_into().map_err(|_| { - internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed") - }) - }) - }, - "to_date", - ), - n if n >= 2 => handle_multiple::( - args, - |s, format| { - string_to_timestamp_nanos_formatted(s, format) - .map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000)) - .and_then(|v| { - v.try_into().map_err(|_| { - internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed") - }) - }) - }, - |n| n, - "to_date", - ), - _ => exec_err!("Unsupported 0 argument count for function to_date"), - } -} - -/// to_timestamp SQL function -/// -/// Note: `to_timestamp` returns `Timestamp(Nanosecond)` though its arguments are interpreted as **seconds**. -/// The supported range for integer input is between `-9223372037` and `9223372036`. -/// Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. -/// Please use `to_timestamp_seconds` for the input outside of supported bounds. 
-pub fn to_timestamp(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp") -} - -/// to_timestamp_millis SQL function -pub fn to_timestamp_millis(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp_millis") -} - -/// to_timestamp_micros SQL function -pub fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp_micros") -} - -/// to_timestamp_nanos SQL function -pub fn to_timestamp_nanos(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp_nanos") -} - -/// to_timestamp_seconds SQL function -pub fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result { - to_timestamp_impl::(args, "to_timestamp_seconds") -} - /// Create an implementation of `now()` that always returns the /// specified timestamp. /// @@ -806,9 +326,9 @@ pub fn make_date(args: &[ColumnarValue]) -> Result { let is_scalar = len.is_none(); let array_size = if is_scalar { 1 } else { len.unwrap() }; - let years = cast_column(&args[0], &DataType::Int32, None)?; - let months = cast_column(&args[1], &DataType::Int32, None)?; - let days = cast_column(&args[2], &DataType::Int32, None)?; + let years = args[0].cast_to(&DataType::Int32, None)?; + let months = args[1].cast_to(&DataType::Int32, None)?; + let days = args[2].cast_to(&DataType::Int32, None)?; // since the epoch for the date32 datatype is the unix epoch // we need to subtract the unix epoch from the current date @@ -901,17 +421,14 @@ pub fn make_date(args: &[ColumnarValue]) -> Result { fn quarter_month(date: &T) -> u32 where - T: chrono::Datelike, + T: Datelike, { 1 + 3 * ((date.month() - 1) / 3) } fn _date_trunc_coarse(granularity: &str, value: Option) -> Result> where - T: chrono::Datelike - + chrono::Timelike - + std::ops::Sub - + std::marker::Copy, + T: Datelike + Timelike + Sub + Copy, { let value = match granularity { "millisecond" => value, @@ -1624,238 +1141,6 @@ where Ok(b) } -fn validate_to_timestamp_data_types( - args: 
&[ColumnarValue], - name: &str, -) -> Option> { - for (idx, a) in args.iter().skip(1).enumerate() { - match a.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => { - // all good - } - _ => { - return Some(exec_err!( - "{name} function unsupported data type at index {}: {}", - idx + 1, - a.data_type() - )); - } - } - } - - None -} - -/// to_date SQL function implementation -pub fn to_date_invoke(args: &[ColumnarValue]) -> Result { - if args.is_empty() { - return exec_err!( - "to_date function requires 1 or more arguments, got {}", - args.len() - ); - } - - // validate that any args after the first one are Utf8 - if args.len() > 1 { - if let Some(value) = validate_to_timestamp_data_types(args, "to_date") { - return value; - } - } - - match args[0].data_type() { - DataType::Int32 - | DataType::Int64 - | DataType::Null - | DataType::Float64 - | DataType::Date32 - | DataType::Date64 => cast_column(&args[0], &DataType::Date32, None), - DataType::Utf8 => to_date(args), - other => { - exec_err!("Unsupported data type {:?} for function to_date", other) - } - } -} - -/// to_timestamp() SQL function implementation -pub fn to_timestamp_invoke(args: &[ColumnarValue]) -> Result { - if args.is_empty() { - return exec_err!( - "to_timestamp function requires 1 or more arguments, got {}", - args.len() - ); - } - - // validate that any args after the first one are Utf8 - if args.len() > 1 { - if let Some(value) = validate_to_timestamp_data_types(args, "to_timestamp") { - return value; - } - } - - match args[0].data_type() { - DataType::Int32 | DataType::Int64 => cast_column( - &cast_column(&args[0], &DataType::Timestamp(TimeUnit::Second, None), None)?, - &DataType::Timestamp(TimeUnit::Nanosecond, None), - None, - ), - DataType::Null | DataType::Float64 | DataType::Timestamp(_, None) => cast_column( - &args[0], - &DataType::Timestamp(TimeUnit::Nanosecond, None), - None, - ), - DataType::Utf8 => to_timestamp(args), - other => { - exec_err!( - "Unsupported data type {:?} for 
function to_timestamp", - other - ) - } - } -} - -/// to_timestamp_millis() SQL function implementation -pub fn to_timestamp_millis_invoke(args: &[ColumnarValue]) -> Result { - if args.is_empty() { - return exec_err!( - "to_timestamp_millis function requires 1 or more arguments, got {}", - args.len() - ); - } - - // validate that any args after the first one are Utf8 - if args.len() > 1 { - if let Some(value) = validate_to_timestamp_data_types(args, "to_timestamp_millis") - { - return value; - } - } - - match args[0].data_type() { - DataType::Null - | DataType::Int32 - | DataType::Int64 - | DataType::Timestamp(_, None) => cast_column( - &args[0], - &DataType::Timestamp(TimeUnit::Millisecond, None), - None, - ), - DataType::Utf8 => to_timestamp_millis(args), - other => { - exec_err!( - "Unsupported data type {:?} for function to_timestamp_millis", - other - ) - } - } -} - -/// to_timestamp_micros() SQL function implementation -pub fn to_timestamp_micros_invoke(args: &[ColumnarValue]) -> Result { - if args.is_empty() { - return exec_err!( - "to_timestamp_micros function requires 1 or more arguments, got {}", - args.len() - ); - } - - // validate that any args after the first one are Utf8 - if args.len() > 1 { - if let Some(value) = validate_to_timestamp_data_types(args, "to_timestamp_micros") - { - return value; - } - } - - match args[0].data_type() { - DataType::Null - | DataType::Int32 - | DataType::Int64 - | DataType::Timestamp(_, None) => cast_column( - &args[0], - &DataType::Timestamp(TimeUnit::Microsecond, None), - None, - ), - DataType::Utf8 => to_timestamp_micros(args), - other => { - exec_err!( - "Unsupported data type {:?} for function to_timestamp_micros", - other - ) - } - } -} - -/// to_timestamp_nanos() SQL function implementation -pub fn to_timestamp_nanos_invoke(args: &[ColumnarValue]) -> Result { - if args.is_empty() { - return exec_err!( - "to_timestamp_nanos function requires 1 or more arguments, got {}", - args.len() - ); - } - - // validate that 
any args after the first one are Utf8 - if args.len() > 1 { - if let Some(value) = validate_to_timestamp_data_types(args, "to_timestamp_nanos") - { - return value; - } - } - - match args[0].data_type() { - DataType::Null - | DataType::Int32 - | DataType::Int64 - | DataType::Timestamp(_, None) => cast_column( - &args[0], - &DataType::Timestamp(TimeUnit::Nanosecond, None), - None, - ), - DataType::Utf8 => to_timestamp_nanos(args), - other => { - exec_err!( - "Unsupported data type {:?} for function to_timestamp_nanos", - other - ) - } - } -} - -/// to_timestamp_seconds() SQL function implementation -pub fn to_timestamp_seconds_invoke(args: &[ColumnarValue]) -> Result { - if args.is_empty() { - return exec_err!( - "to_timestamp_seconds function requires 1 or more arguments, got {}", - args.len() - ); - } - - // validate that any args after the first one are Utf8 - if args.len() > 1 { - if let Some(value) = - validate_to_timestamp_data_types(args, "to_timestamp_seconds") - { - return value; - } - } - - match args[0].data_type() { - DataType::Null - | DataType::Int32 - | DataType::Int64 - | DataType::Timestamp(_, None) => { - cast_column(&args[0], &DataType::Timestamp(TimeUnit::Second, None), None) - } - DataType::Utf8 => to_timestamp_seconds(args), - other => { - exec_err!( - "Unsupported data type {:?} for function to_timestamp_seconds", - other - ) - } - } -} - /// from_unixtime() SQL function implementation pub fn from_unixtime_invoke(args: &[ColumnarValue]) -> Result { if args.len() != 1 { @@ -1867,7 +1152,7 @@ pub fn from_unixtime_invoke(args: &[ColumnarValue]) -> Result { match args[0].data_type() { DataType::Int64 => { - cast_column(&args[0], &DataType::Timestamp(TimeUnit::Second, None), None) + args[0].cast_to(&DataType::Timestamp(TimeUnit::Second, None), None) } other => { exec_err!( @@ -1882,90 +1167,18 @@ pub fn from_unixtime_invoke(args: &[ColumnarValue]) -> Result { mod tests { use std::sync::Arc; - use arrow::array::{ - as_primitive_array, ArrayRef, 
Int64Array, IntervalDayTimeArray, StringBuilder, - }; - use arrow_array::types::Int64Type; + use arrow::array::{as_primitive_array, ArrayRef, Int64Array, IntervalDayTimeArray}; + use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; use arrow_array::{ Date32Array, Date64Array, Int32Array, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt32Array, }; - - use datafusion_common::assert_contains; - use datafusion_expr::ScalarFunctionImplementation; + use datafusion_common::ScalarValue; use super::*; - #[test] - fn to_timestamp_arrays_and_nulls() -> Result<()> { - // ensure that arrow array implementation is wired up and handles nulls correctly - - let mut string_builder = StringBuilder::with_capacity(2, 1024); - let mut ts_builder = TimestampNanosecondArray::builder(2); - - string_builder.append_value("2020-09-08T13:42:29.190855"); - ts_builder.append_value(1599572549190855000); - - string_builder.append_null(); - ts_builder.append_null(); - let expected_timestamps = &ts_builder.finish() as &dyn Array; - - let string_array = - ColumnarValue::Array(Arc::new(string_builder.finish()) as ArrayRef); - let parsed_timestamps = to_timestamp(&[string_array]) - .expect("that to_timestamp parsed values without error"); - if let ColumnarValue::Array(parsed_array) = parsed_timestamps { - assert_eq!(parsed_array.len(), 2); - assert_eq!(expected_timestamps, parsed_array.as_ref()); - } else { - panic!("Expected a columnar array") - } - Ok(()) - } - - #[test] - fn to_timestamp_with_formats_arrays_and_nulls() -> Result<()> { - // ensure that arrow array implementation is wired up and handles nulls correctly - - let mut date_string_builder = StringBuilder::with_capacity(2, 1024); - let mut format1_builder = StringBuilder::with_capacity(2, 1024); - let mut format2_builder = StringBuilder::with_capacity(2, 1024); - let mut 
format3_builder = StringBuilder::with_capacity(2, 1024); - let mut ts_builder = TimestampNanosecondArray::builder(2); - - date_string_builder.append_null(); - format1_builder.append_null(); - format2_builder.append_null(); - format3_builder.append_null(); - ts_builder.append_null(); - - date_string_builder.append_value("2020-09-08T13:42:29.19085Z"); - format1_builder.append_value("%s"); - format2_builder.append_value("%c"); - format3_builder.append_value("%+"); - ts_builder.append_value(1599572549190850000); - - let expected_timestamps = &ts_builder.finish() as &dyn Array; - - let string_array = [ - ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), - ]; - let parsed_timestamps = to_timestamp(&string_array) - .expect("that to_timestamp with format args parsed values without error"); - if let ColumnarValue::Array(parsed_array) = parsed_timestamps { - assert_eq!(parsed_array.len(), 2); - assert_eq!(expected_timestamps, parsed_array.as_ref()); - } else { - panic!("Expected a columnar array") - } - Ok(()) - } - #[test] fn date_trunc_test() { let cases = vec![ @@ -2706,328 +1919,6 @@ mod tests { }); } - #[test] - fn to_timestamp_invalid_input_type() -> Result<()> { - // pass the wrong type of input array to to_timestamp and test - // that we get an error. - - let mut builder = Int64Array::builder(1); - builder.append_value(1); - let int64array = ColumnarValue::Array(Arc::new(builder.finish())); - - let expected_err = - "Execution error: Unsupported data type Int64 for function to_timestamp"; - match to_timestamp(&[int64array]) { - Ok(_) => panic!("Expected error but got success"), - Err(e) => { - assert!( - e.to_string().contains(expected_err), - "Can not find expected error '{expected_err}'. 
Actual error '{e}'" - ); - } - } - Ok(()) - } - - #[test] - fn to_timestamp_with_formats_invalid_input_type() -> Result<()> { - // pass the wrong type of input array to to_timestamp and test - // that we get an error. - - let mut builder = Int64Array::builder(1); - builder.append_value(1); - let int64array = [ - ColumnarValue::Array(Arc::new(builder.finish())), - ColumnarValue::Array(Arc::new(builder.finish())), - ]; - - let expected_err = - "Execution error: Unsupported data type Int64 for function to_timestamp"; - match to_timestamp(&int64array) { - Ok(_) => panic!("Expected error but got success"), - Err(e) => { - assert!( - e.to_string().contains(expected_err), - "Can not find expected error '{expected_err}'. Actual error '{e}'" - ); - } - } - Ok(()) - } - - #[test] - fn to_timestamp_with_unparseable_data() -> Result<()> { - let mut date_string_builder = StringBuilder::with_capacity(2, 1024); - - date_string_builder.append_null(); - - date_string_builder.append_value("2020-09-08 - 13:42:29.19085Z"); - - let string_array = - ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef); - - let expected_err = - "Arrow error: Parser error: Error parsing timestamp from '2020-09-08 - 13:42:29.19085Z': error parsing time"; - match to_timestamp(&[string_array]) { - Ok(_) => panic!("Expected error but got success"), - Err(e) => { - assert!( - e.to_string().contains(expected_err), - "Can not find expected error '{expected_err}'. 
Actual error '{e}'" - ); - } - } - Ok(()) - } - - #[test] - fn to_timestamp_with_no_matching_formats() -> Result<()> { - let mut date_string_builder = StringBuilder::with_capacity(2, 1024); - let mut format1_builder = StringBuilder::with_capacity(2, 1024); - let mut format2_builder = StringBuilder::with_capacity(2, 1024); - let mut format3_builder = StringBuilder::with_capacity(2, 1024); - - date_string_builder.append_null(); - format1_builder.append_null(); - format2_builder.append_null(); - format3_builder.append_null(); - - date_string_builder.append_value("2020-09-08T13:42:29.19085Z"); - format1_builder.append_value("%s"); - format2_builder.append_value("%c"); - format3_builder.append_value("%H:%M:%S"); - - let string_array = [ - ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), - ]; - - let expected_err = - "Execution error: Error parsing timestamp from '2020-09-08T13:42:29.19085Z' using format '%H:%M:%S': input contains invalid characters"; - match to_timestamp(&string_array) { - Ok(_) => panic!("Expected error but got success"), - Err(e) => { - assert!( - e.to_string().contains(expected_err), - "Can not find expected error '{expected_err}'. 
Actual error '{e}'" - ); - } - } - Ok(()) - } - - #[test] - fn string_to_timestamp_formatted() { - // Explicit timezone - assert_eq!( - 1599572549190855000, - parse_timestamp_formatted("2020-09-08T13:42:29.190855+00:00", "%+").unwrap() - ); - assert_eq!( - 1599572549190855000, - parse_timestamp_formatted("2020-09-08T13:42:29.190855Z", "%+").unwrap() - ); - assert_eq!( - 1599572549000000000, - parse_timestamp_formatted("2020-09-08T13:42:29Z", "%+").unwrap() - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp_formatted("2020-09-08T13:42:29.190855-05:00", "%+").unwrap() - ); - assert_eq!( - 1599590549000000000, - parse_timestamp_formatted("1599590549", "%s").unwrap() - ); - assert_eq!( - 1599572549000000000, - parse_timestamp_formatted("09-08-2020 13/42/29", "%m-%d-%Y %H/%M/%S") - .unwrap() - ); - } - - fn parse_timestamp_formatted(s: &str, format: &str) -> Result { - let result = string_to_timestamp_nanos_formatted(s, format); - if let Err(e) = &result { - eprintln!("Error parsing timestamp '{s}' using format '{format}': {e:?}"); - } - result - } - - #[test] - fn string_to_timestamp_formatted_invalid() { - // Test parsing invalid formats - let cases = [ - ("", "%Y%m%d %H%M%S", "premature end of input"), - ("SS", "%c", "premature end of input"), - ("Wed, 18 Feb 2015 23:16:09 GMT", "", "trailing input"), - ( - "Wed, 18 Feb 2015 23:16:09 GMT", - "%XX", - "input contains invalid characters", - ), - ( - "Wed, 18 Feb 2015 23:16:09 GMT", - "%Y%m%d %H%M%S", - "input contains invalid characters", - ), - ]; - - for (s, f, ctx) in cases { - let expected = format!("Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}"); - let actual = string_to_datetime_formatted(&Utc, s, f) - .unwrap_err() - .to_string(); - assert_eq!(actual, expected) - } - } - - #[test] - fn string_to_timestamp_invalid_arguments() { - // Test parsing invalid formats - let cases = [ - ("", "%Y%m%d %H%M%S", "premature end of input"), - ("SS", "%c", 
"premature end of input"), - ("Wed, 18 Feb 2015 23:16:09 GMT", "", "trailing input"), - ( - "Wed, 18 Feb 2015 23:16:09 GMT", - "%XX", - "input contains invalid characters", - ), - ( - "Wed, 18 Feb 2015 23:16:09 GMT", - "%Y%m%d %H%M%S", - "input contains invalid characters", - ), - ]; - - for (s, f, ctx) in cases { - let expected = format!("Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}"); - let actual = string_to_datetime_formatted(&Utc, s, f) - .unwrap_err() - .to_string(); - assert_eq!(actual, expected) - } - } - - #[test] - fn test_to_timestamp_arg_validation() { - let mut date_string_builder = StringBuilder::with_capacity(2, 1024); - date_string_builder.append_value("2020-09-08T13:42:29.19085Z"); - - let data = date_string_builder.finish(); - - let funcs: Vec<(ScalarFunctionImplementation, TimeUnit)> = vec![ - (Arc::new(to_timestamp), TimeUnit::Nanosecond), - (Arc::new(to_timestamp_micros), TimeUnit::Microsecond), - (Arc::new(to_timestamp_millis), TimeUnit::Millisecond), - (Arc::new(to_timestamp_nanos), TimeUnit::Nanosecond), - (Arc::new(to_timestamp_seconds), TimeUnit::Second), - ]; - - let mut nanos_builder = TimestampNanosecondArray::builder(2); - let mut millis_builder = TimestampMillisecondArray::builder(2); - let mut micros_builder = TimestampMicrosecondArray::builder(2); - let mut sec_builder = TimestampSecondArray::builder(2); - - nanos_builder.append_value(1599572549190850000); - millis_builder.append_value(1599572549190); - micros_builder.append_value(1599572549190850); - sec_builder.append_value(1599572549); - - let nanos_expected_timestamps = &nanos_builder.finish() as &dyn Array; - let millis_expected_timestamps = &millis_builder.finish() as &dyn Array; - let micros_expected_timestamps = µs_builder.finish() as &dyn Array; - let sec_expected_timestamps = &sec_builder.finish() as &dyn Array; - - for (func, time_unit) in funcs { - // test UTF8 - let string_array = [ - ColumnarValue::Array(Arc::new(data.clone()) as 
ArrayRef), - ColumnarValue::Scalar(ScalarValue::Utf8(Some("%s".to_string()))), - ColumnarValue::Scalar(ScalarValue::Utf8(Some("%c".to_string()))), - ColumnarValue::Scalar(ScalarValue::Utf8(Some("%+".to_string()))), - ]; - let parsed_timestamps = func(&string_array) - .expect("that to_timestamp with format args parsed values without error"); - if let ColumnarValue::Array(parsed_array) = parsed_timestamps { - assert_eq!(parsed_array.len(), 1); - match time_unit { - TimeUnit::Nanosecond => { - assert_eq!(nanos_expected_timestamps, parsed_array.as_ref()) - } - TimeUnit::Millisecond => { - assert_eq!(millis_expected_timestamps, parsed_array.as_ref()) - } - TimeUnit::Microsecond => { - assert_eq!(micros_expected_timestamps, parsed_array.as_ref()) - } - TimeUnit::Second => { - assert_eq!(sec_expected_timestamps, parsed_array.as_ref()) - } - }; - } else { - panic!("Expected a columnar array") - } - - // test LargeUTF8 - let string_array = [ - ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), - ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("%s".to_string()))), - ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("%c".to_string()))), - ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("%+".to_string()))), - ]; - let parsed_timestamps = func(&string_array) - .expect("that to_timestamp with format args parsed values without error"); - if let ColumnarValue::Array(parsed_array) = parsed_timestamps { - assert_eq!(parsed_array.len(), 1); - match time_unit { - TimeUnit::Nanosecond => { - assert_eq!(nanos_expected_timestamps, parsed_array.as_ref()) - } - TimeUnit::Millisecond => { - assert_eq!(millis_expected_timestamps, parsed_array.as_ref()) - } - TimeUnit::Microsecond => { - assert_eq!(micros_expected_timestamps, parsed_array.as_ref()) - } - TimeUnit::Second => { - assert_eq!(sec_expected_timestamps, parsed_array.as_ref()) - } - }; - } else { - panic!("Expected a columnar array") - } - - // test other types - let string_array = [ - 
ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), - ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), - ColumnarValue::Scalar(ScalarValue::Int32(Some(2))), - ColumnarValue::Scalar(ScalarValue::Int32(Some(3))), - ]; - - let expected = "Unsupported data type Int32 for function".to_string(); - let actual = func(&string_array).unwrap_err().to_string(); - assert_contains!(actual, expected); - - // test other types - let string_array = [ - ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), - ColumnarValue::Array(Arc::new(PrimitiveArray::::new( - vec![1i64].into(), - None, - )) as ArrayRef), - ]; - - let expected = "Unsupported data type".to_string(); - let actual = func(&string_array).unwrap_err().to_string(); - assert_contains!(actual, expected); - } - } - #[test] fn test_make_date() { let res = make_date(&[ diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 9125f73048cb..a3bff578cad4 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -24,11 +24,11 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use DataType::*; -use arrow::compute::{can_cast_types, kernels, CastOptions}; +use arrow::compute::{can_cast_types, CastOptions}; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; -use datafusion_common::{not_impl_err, Result, ScalarValue}; +use datafusion_common::{not_impl_err, Result}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::ColumnarValue; @@ -120,7 +120,7 @@ impl PhysicalExpr for CastExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { let value = self.expr.evaluate(batch)?; - cast_column(&value, &self.cast_type, Some(&self.cast_options)) + value.cast_to(&self.cast_type, Some(&self.cast_options)) } fn children(&self) -> Vec> { @@ -182,43 +182,6 @@ impl PartialEq for CastExpr { } } -/// Internal 
cast function for casting ColumnarValue -> ColumnarValue for cast_type -pub fn cast_column( - value: &ColumnarValue, - cast_type: &DataType, - cast_options: Option<&CastOptions<'static>>, -) -> Result { - let cast_options = cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS); - match value { - ColumnarValue::Array(array) => Ok(ColumnarValue::Array( - kernels::cast::cast_with_options(array, cast_type, &cast_options)?, - )), - ColumnarValue::Scalar(scalar) => { - let scalar_array = if cast_type - == &DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None) - { - if let ScalarValue::Float64(Some(float_ts)) = scalar { - ScalarValue::Int64( - Some((float_ts * 1_000_000_000_f64).trunc() as i64), - ) - .to_array()? - } else { - scalar.to_array()? - } - } else { - scalar.to_array()? - }; - let cast_array = kernels::cast::cast_with_options( - &scalar_array, - cast_type, - &cast_options, - )?; - let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; - Ok(ColumnarValue::Scalar(cast_scalar)) - } - } -} - /// Return a PhysicalExpression representing `expr` casted to /// `cast_type`, if any casting is needed. 
/// diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index ec20345569c2..f9896bafca15 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -80,7 +80,7 @@ pub use crate::PhysicalSortExpr; pub use binary::{binary, BinaryExpr}; pub use case::{case, CaseExpr}; -pub use cast::{cast, cast_column, cast_with_options, CastExpr}; +pub use cast::{cast, cast_with_options, CastExpr}; pub use column::{col, Column, UnKnownColumn}; pub use get_indexed_field::{GetFieldAccessExpr, GetIndexedFieldExpr}; pub use in_list::{in_list, InListExpr}; diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 584901d37d28..56ad92082d9f 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -505,25 +505,9 @@ pub fn create_physical_fun( } BuiltinScalarFunction::MakeDate => Arc::new(datetime_expressions::make_date), BuiltinScalarFunction::ToChar => Arc::new(datetime_expressions::to_char), - BuiltinScalarFunction::ToTimestamp => { - Arc::new(datetime_expressions::to_timestamp_invoke) - } - BuiltinScalarFunction::ToTimestampMillis => { - Arc::new(datetime_expressions::to_timestamp_millis_invoke) - } - BuiltinScalarFunction::ToTimestampMicros => { - Arc::new(datetime_expressions::to_timestamp_micros_invoke) - } - BuiltinScalarFunction::ToTimestampNanos => { - Arc::new(datetime_expressions::to_timestamp_nanos_invoke) - } - BuiltinScalarFunction::ToTimestampSeconds => { - Arc::new(datetime_expressions::to_timestamp_seconds_invoke) - } BuiltinScalarFunction::FromUnixtime => { Arc::new(datetime_expressions::from_unixtime_invoke) } - BuiltinScalarFunction::ToDate => Arc::new(datetime_expressions::to_date_invoke), BuiltinScalarFunction::InitCap => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::initcap::)(args) @@ -3040,11 
+3024,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); // pick some arbitrary functions to test - let funs = [ - BuiltinScalarFunction::Concat, - BuiltinScalarFunction::ToTimestamp, - BuiltinScalarFunction::Repeat, - ]; + let funs = [BuiltinScalarFunction::Concat, BuiltinScalarFunction::Repeat]; for fun in funs.iter() { let expr = create_physical_expr_with_type_coercion( diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 9015d9ee9b00..1ad7a2c3afaf 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -603,10 +603,10 @@ enum ScalarFunction { Strpos = 52; Substr = 53; ToHex = 54; - ToTimestamp = 55; - ToTimestampMillis = 56; - ToTimestampMicros = 57; - ToTimestampSeconds = 58; + // 55 was ToTimestamp + // 56 was ToTimestampMillis + // 57 was ToTimestampMicros + // 58 was ToTimestampSeconds Now = 59; Translate = 60; Trim = 61; @@ -664,7 +664,7 @@ enum ScalarFunction { ArrayEmpty = 115; ArrayPopBack = 116; StringToArray = 117; - ToTimestampNanos = 118; + // 118 was ToTimestampNanos ArrayIntersect = 119; ArrayUnion = 120; OverLay = 121; @@ -683,7 +683,7 @@ enum ScalarFunction { ArrayReverse = 134; RegexpLike = 135; ToChar = 136; - ToDate = 137; + // 137 was ToDate } message ScalarFunctionNode { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index f42e362a4f99..33ebdf310ae0 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22372,10 +22372,6 @@ impl serde::Serialize for ScalarFunction { Self::Strpos => "Strpos", Self::Substr => "Substr", Self::ToHex => "ToHex", - Self::ToTimestamp => "ToTimestamp", - Self::ToTimestampMillis => "ToTimestampMillis", - Self::ToTimestampMicros => "ToTimestampMicros", - Self::ToTimestampSeconds => "ToTimestampSeconds", Self::Now => "Now", Self::Translate => "Translate", Self::Trim => "Trim", @@ -22431,7
+22427,6 @@ impl serde::Serialize for ScalarFunction { Self::ArrayEmpty => "ArrayEmpty", Self::ArrayPopBack => "ArrayPopBack", Self::StringToArray => "StringToArray", - Self::ToTimestampNanos => "ToTimestampNanos", Self::ArrayIntersect => "ArrayIntersect", Self::ArrayUnion => "ArrayUnion", Self::OverLay => "OverLay", @@ -22449,7 +22444,6 @@ impl serde::Serialize for ScalarFunction { Self::ArrayReverse => "ArrayReverse", Self::RegexpLike => "RegexpLike", Self::ToChar => "ToChar", - Self::ToDate => "ToDate", }; serializer.serialize_str(variant) } @@ -22512,10 +22506,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Strpos", "Substr", "ToHex", - "ToTimestamp", - "ToTimestampMillis", - "ToTimestampMicros", - "ToTimestampSeconds", "Now", "Translate", "Trim", @@ -22571,7 +22561,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ArrayEmpty", "ArrayPopBack", "StringToArray", - "ToTimestampNanos", "ArrayIntersect", "ArrayUnion", "OverLay", @@ -22589,7 +22578,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ArrayReverse", "RegexpLike", "ToChar", - "ToDate", ]; struct GeneratedVisitor; @@ -22681,10 +22669,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Strpos" => Ok(ScalarFunction::Strpos), "Substr" => Ok(ScalarFunction::Substr), "ToHex" => Ok(ScalarFunction::ToHex), - "ToTimestamp" => Ok(ScalarFunction::ToTimestamp), - "ToTimestampMillis" => Ok(ScalarFunction::ToTimestampMillis), - "ToTimestampMicros" => Ok(ScalarFunction::ToTimestampMicros), - "ToTimestampSeconds" => Ok(ScalarFunction::ToTimestampSeconds), "Now" => Ok(ScalarFunction::Now), "Translate" => Ok(ScalarFunction::Translate), "Trim" => Ok(ScalarFunction::Trim), @@ -22740,7 +22724,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ArrayEmpty" => Ok(ScalarFunction::ArrayEmpty), "ArrayPopBack" => Ok(ScalarFunction::ArrayPopBack), "StringToArray" => Ok(ScalarFunction::StringToArray), - "ToTimestampNanos" => Ok(ScalarFunction::ToTimestampNanos), "ArrayIntersect" =>
Ok(ScalarFunction::ArrayIntersect), "ArrayUnion" => Ok(ScalarFunction::ArrayUnion), "OverLay" => Ok(ScalarFunction::OverLay), @@ -22758,7 +22741,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ArrayReverse" => Ok(ScalarFunction::ArrayReverse), "RegexpLike" => Ok(ScalarFunction::RegexpLike), "ToChar" => Ok(ScalarFunction::ToChar), - "ToDate" => Ok(ScalarFunction::ToDate), _ => Err(serde::de::Error::unknown_variant(value, FIELDS)), } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 97a620bac268..2d21f15570dd 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2690,10 +2690,10 @@ pub enum ScalarFunction { Strpos = 52, Substr = 53, ToHex = 54, - ToTimestamp = 55, - ToTimestampMillis = 56, - ToTimestampMicros = 57, - ToTimestampSeconds = 58, + /// 55 was ToTimestamp + /// 56 was ToTimestampMillis + /// 57 was ToTimestampMicros + /// 58 was ToTimestampSeconds Now = 59, Translate = 60, Trim = 61, @@ -2751,7 +2751,7 @@ pub enum ScalarFunction { ArrayEmpty = 115, ArrayPopBack = 116, StringToArray = 117, - ToTimestampNanos = 118, + /// 118 was ToTimestampNanos ArrayIntersect = 119, ArrayUnion = 120, OverLay = 121, @@ -2769,8 +2769,8 @@ pub enum ScalarFunction { MakeDate = 133, ArrayReverse = 134, RegexpLike = 135, + /// 137 was ToDate ToChar = 136, - ToDate = 137, } impl ScalarFunction { /// String value of the enum field names used in the ProtoBuf definition. 
@@ -2830,10 +2830,6 @@ impl ScalarFunction { ScalarFunction::Strpos => "Strpos", ScalarFunction::Substr => "Substr", ScalarFunction::ToHex => "ToHex", - ScalarFunction::ToTimestamp => "ToTimestamp", - ScalarFunction::ToTimestampMillis => "ToTimestampMillis", - ScalarFunction::ToTimestampMicros => "ToTimestampMicros", - ScalarFunction::ToTimestampSeconds => "ToTimestampSeconds", ScalarFunction::Now => "Now", ScalarFunction::Translate => "Translate", ScalarFunction::Trim => "Trim", @@ -2889,7 +2885,6 @@ impl ScalarFunction { ScalarFunction::ArrayEmpty => "ArrayEmpty", ScalarFunction::ArrayPopBack => "ArrayPopBack", ScalarFunction::StringToArray => "StringToArray", - ScalarFunction::ToTimestampNanos => "ToTimestampNanos", ScalarFunction::ArrayIntersect => "ArrayIntersect", ScalarFunction::ArrayUnion => "ArrayUnion", ScalarFunction::OverLay => "OverLay", @@ -2907,7 +2902,6 @@ impl ScalarFunction { ScalarFunction::ArrayReverse => "ArrayReverse", ScalarFunction::RegexpLike => "RegexpLike", ScalarFunction::ToChar => "ToChar", - ScalarFunction::ToDate => "ToDate", } } /// Creates an enum from field names used in the ProtoBuf definition. 
@@ -2964,10 +2958,6 @@ impl ScalarFunction { "Strpos" => Some(Self::Strpos), "Substr" => Some(Self::Substr), "ToHex" => Some(Self::ToHex), - "ToTimestamp" => Some(Self::ToTimestamp), - "ToTimestampMillis" => Some(Self::ToTimestampMillis), - "ToTimestampMicros" => Some(Self::ToTimestampMicros), - "ToTimestampSeconds" => Some(Self::ToTimestampSeconds), "Now" => Some(Self::Now), "Translate" => Some(Self::Translate), "Trim" => Some(Self::Trim), @@ -3023,7 +3013,6 @@ impl ScalarFunction { "ArrayEmpty" => Some(Self::ArrayEmpty), "ArrayPopBack" => Some(Self::ArrayPopBack), "StringToArray" => Some(Self::StringToArray), - "ToTimestampNanos" => Some(Self::ToTimestampNanos), "ArrayIntersect" => Some(Self::ArrayIntersect), "ArrayUnion" => Some(Self::ArrayUnion), "OverLay" => Some(Self::OverLay), @@ -3041,7 +3030,6 @@ impl ScalarFunction { "ArrayReverse" => Some(Self::ArrayReverse), "RegexpLike" => Some(Self::RegexpLike), "ToChar" => Some(Self::ToChar), - "ToDate" => Some(Self::ToDate), _ => None, } } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index b2f7355765d2..ab7065cfbd85 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -546,11 +546,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Substr => Self::Substr, ScalarFunction::ToHex => Self::ToHex, ScalarFunction::ToChar => Self::ToChar, - ScalarFunction::ToTimestamp => Self::ToTimestamp, - ScalarFunction::ToTimestampMillis => Self::ToTimestampMillis, - ScalarFunction::ToTimestampMicros => Self::ToTimestampMicros, - ScalarFunction::ToTimestampNanos => Self::ToTimestampNanos, - ScalarFunction::ToTimestampSeconds => Self::ToTimestampSeconds, ScalarFunction::Now => Self::Now, ScalarFunction::CurrentDate => Self::CurrentDate, ScalarFunction::CurrentTime => Self::CurrentTime, @@ -570,7 +565,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { 
ScalarFunction::Levenshtein => Self::Levenshtein, ScalarFunction::SubstrIndex => Self::SubstrIndex, ScalarFunction::FindInSet => Self::FindInSet, - ScalarFunction::ToDate => Self::ToDate, } } } @@ -1698,56 +1692,6 @@ pub fn parse_expr( args, ))) } - ScalarFunction::ToTimestamp => { - let args: Vec<_> = args - .iter() - .map(|expr| parse_expr(expr, registry)) - .collect::>()?; - Ok(Expr::ScalarFunction(expr::ScalarFunction::new( - BuiltinScalarFunction::ToTimestamp, - args, - ))) - } - ScalarFunction::ToTimestampMillis => { - let args: Vec<_> = args - .iter() - .map(|expr| parse_expr(expr, registry)) - .collect::>()?; - Ok(Expr::ScalarFunction(expr::ScalarFunction::new( - BuiltinScalarFunction::ToTimestampMillis, - args, - ))) - } - ScalarFunction::ToTimestampMicros => { - let args: Vec<_> = args - .iter() - .map(|expr| parse_expr(expr, registry)) - .collect::>()?; - Ok(Expr::ScalarFunction(expr::ScalarFunction::new( - BuiltinScalarFunction::ToTimestampMicros, - args, - ))) - } - ScalarFunction::ToTimestampNanos => { - let args: Vec<_> = args - .iter() - .map(|expr| parse_expr(expr, registry)) - .collect::>()?; - Ok(Expr::ScalarFunction(expr::ScalarFunction::new( - BuiltinScalarFunction::ToTimestampNanos, - args, - ))) - } - ScalarFunction::ToTimestampSeconds => { - let args: Vec<_> = args - .iter() - .map(|expr| parse_expr(expr, registry)) - .collect::>()?; - Ok(Expr::ScalarFunction(expr::ScalarFunction::new( - BuiltinScalarFunction::ToTimestampSeconds, - args, - ))) - } ScalarFunction::Now => Ok(now()), ScalarFunction::Translate => Ok(translate( parse_expr(&args[0], registry)?, @@ -1811,16 +1755,6 @@ pub fn parse_expr( ScalarFunction::StructFun => { Ok(struct_fun(parse_expr(&args[0], registry)?)) } - ScalarFunction::ToDate => { - let args: Vec<_> = args - .iter() - .map(|expr| parse_expr(expr, registry)) - .collect::>()?; - Ok(Expr::ScalarFunction(expr::ScalarFunction::new( - BuiltinScalarFunction::ToDate, - args, - ))) - } } } 
ExprType::ScalarUdfExpr(protobuf::ScalarUdfExprNode { fun_name, args }) => { diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 2aa149f1606c..c913119ff9ed 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1456,7 +1456,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Ltrim => Self::Ltrim, BuiltinScalarFunction::Rtrim => Self::Rtrim, BuiltinScalarFunction::ToChar => Self::ToChar, - BuiltinScalarFunction::ToTimestamp => Self::ToTimestamp, BuiltinScalarFunction::ArrayAppend => Self::ArrayAppend, BuiltinScalarFunction::ArraySort => Self::ArraySort, BuiltinScalarFunction::ArrayConcat => Self::ArrayConcat, @@ -1499,7 +1498,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::SHA384 => Self::Sha384, BuiltinScalarFunction::SHA512 => Self::Sha512, BuiltinScalarFunction::Digest => Self::Digest, - BuiltinScalarFunction::ToTimestampMillis => Self::ToTimestampMillis, BuiltinScalarFunction::Log2 => Self::Log2, BuiltinScalarFunction::Signum => Self::Signum, BuiltinScalarFunction::Ascii => Self::Ascii, @@ -1528,9 +1526,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Strpos => Self::Strpos, BuiltinScalarFunction::Substr => Self::Substr, BuiltinScalarFunction::ToHex => Self::ToHex, - BuiltinScalarFunction::ToTimestampMicros => Self::ToTimestampMicros, - BuiltinScalarFunction::ToTimestampNanos => Self::ToTimestampNanos, - BuiltinScalarFunction::ToTimestampSeconds => Self::ToTimestampSeconds, BuiltinScalarFunction::Now => Self::Now, BuiltinScalarFunction::CurrentDate => Self::CurrentDate, BuiltinScalarFunction::CurrentTime => Self::CurrentTime, @@ -1549,7 +1544,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Levenshtein => Self::Levenshtein, BuiltinScalarFunction::SubstrIndex => 
Self::SubstrIndex, BuiltinScalarFunction::FindInSet => Self::FindInSet, - BuiltinScalarFunction::ToDate => Self::ToDate, }; Ok(scalar_function) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index dc5e96d2d00d..913cfbafb6ed 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -483,7 +483,7 @@ statement error Did you mean 'arrow_typeof'? SELECT arrowtypeof(v1) from test; # Scalar function -statement error Did you mean 'to_timestamp_seconds'? +statement error Invalid function 'to_timestamps_second' SELECT to_TIMESTAMPS_second(v2) from test; # Aggregate function