From 88be4d24ddd880ac8e457ec13358440c99381aaa Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Wed, 1 Nov 2023 15:43:59 +0800 Subject: [PATCH] fix(expr): `date_trunc` and `+/-` on `timestamptz` during DST jump-back --- e2e_test/batch/functions/issue_12072.slt.part | 129 ++++++++++++++++++ src/expr/impl/src/scalar/date_trunc.rs | 96 ++++++++----- src/expr/impl/src/scalar/timestamptz.rs | 19 ++- 3 files changed, 206 insertions(+), 38 deletions(-) create mode 100644 e2e_test/batch/functions/issue_12072.slt.part diff --git a/e2e_test/batch/functions/issue_12072.slt.part b/e2e_test/batch/functions/issue_12072.slt.part new file mode 100644 index 0000000000000..3669a9c007136 --- /dev/null +++ b/e2e_test/batch/functions/issue_12072.slt.part @@ -0,0 +1,129 @@ +# Some exprs on `timestamptz` are implemented by a chain of 3 steps: +# * convert `timestamptz` into naive `timestamp` in the given zone +# * perform actual operation on the naive `timestamp` +# * convert naive `timestamp` in the given zone back to `timestamptz` +# +# This can lead to unintuitive results, because the first and last steps are not perfect inverse. + +# Two different `timestamptz` values collapse into the same naive `timestamp`. +query T +select + '2023-11-05 08:40:00Z'::timestamptz AT TIME ZONE 'US/Pacific', + '2023-11-05 09:40:00Z'::timestamptz AT TIME ZONE 'US/Pacific'; +---- +2023-11-05 01:40:00 2023-11-05 01:40:00 + +# When ambiguous, the `timestamptz` after a jump-back transition is returned. +query T +select '2023-11-05 01:40:00'::timestamp AT TIME ZONE 'US/Pacific'; +---- +2023-11-05 09:40:00+00:00 + +# Tests below are about exprs following the 3-step pattern: +# * `date_trunc(field, timestamptz, zone)` -> `timestamptz` +# * `timestamptz + interval` -> `timestamptz` +# * Also used by `interval + timestamptz` and `timestamptz - interval` + +# (A) intuitive cases without anomalies observed (but require special care behind the scenes) + +query R +with t(id, v) as (values + ('a', '2023-11-05 01:40:00-07:00'::timestamptz), + ('b', '2023-11-05 01:40:00-08:00'::timestamptz)) +select extract(epoch from date_trunc('hour', v, 'US/Pacific')) from t order by id; +---- +1699171200.000000 +1699174800.000000 + +statement ok +set timezone = 'US/Pacific'; + +query T +with t(id, v) as (values + ('a', '2023-11-05 01:40:00-07:00'::timestamptz), + ('b', '2023-11-05 01:40:00-08:00'::timestamptz)) +select v + interval '5' minute from t order by id; +---- +2023-11-05 01:45:00-07:00 +2023-11-05 01:45:00-08:00 + +statement ok +set timezone = 'UTC'; + +# (B) common 1-hour jump-back for whole-hour zones + +query R +with t(id, v) as (values + ('a', '2023-11-05 01:40:00-07:00'::timestamptz), + ('b', '2023-11-05 01:40:00-08:00'::timestamptz)) +select extract(epoch from date_trunc('day', v, 'US/Pacific')) from t order by id; +---- +1699167600.000000 +1699167600.000000 + +statement ok +set timezone = 'US/Pacific'; + +query T +with t(id, v) as (values + ('a', '2023-11-05 01:40:00-07:00'::timestamptz), + ('b', '2023-11-05 01:40:00-08:00'::timestamptz), + ('c', '2023-11-04 01:40:00-07:00'::timestamptz)) +select v + interval '1' day from t order by id; +---- +2023-11-06 01:40:00-08:00 +2023-11-06 01:40:00-08:00 +2023-11-05 01:40:00-08:00 + +statement ok +set timezone = 'UTC'; + +# (C) 1-hour jump-back for half-hour zones + +query T +with t(id, v) as (values + ('a', '2023-04-01 15:58:00Z'::timestamptz), + ('b', '2023-04-01 16:58:00Z'::timestamptz)) +select date_trunc('hour', v, 'Australia/South') from t order by id; +---- +2023-04-01 15:30:00+00:00 +2023-04-01 16:30:00+00:00 + +# (D) half-hour jump-back +# Note that `01:45:00+10:30` is truncated to non-existent `01:00:00+10:30` whose canonical form is `01:30:00+11:00` + +statement ok +set timezone = 'Australia/Lord_Howe'; + +query TTT +with t(id, v) as (values + ('a', '2023-04-01 14:45:00Z'::timestamptz), + ('b', '2023-04-01 15:15:00Z'::timestamptz)) +select date_trunc('hour', v, 'Australia/Lord_Howe'), 'truncated from', v from t order by id; +---- +2023-04-02 01:00:00+11:00 truncated from 2023-04-02 01:45:00+11:00 +2023-04-02 01:30:00+11:00 truncated from 2023-04-02 01:45:00+10:30 + +statement ok +set timezone = 'UTC'; + +# (E) jump-back to day boundary, making it ambiguous +# Due to the disambiguation rule selecting post-transition value, +# `date_trunc` effectively returns an instant in the future, rather than the past. + +query T +with t(id, v) as (values + ('a', '2023-11-05 04:40:00Z'::timestamptz), + ('b', '2023-11-05 05:40:00Z'::timestamptz)) +select date_trunc('day', v, 'America/Havana') from t order by id; +---- +2023-11-05 05:00:00+00:00 +2023-11-05 05:00:00+00:00 + +# (F) jump-forward from day boundary, making it invalid +# Here `2023-09-02 23:59:59-04:00` is followed by `2023-09-03 01:00:00-03:00`. +# There is no `2023-09-03 00:00:00-04:00` or `2023-09-03 00:00:00-03:00`. +# PostgreSQL returns `2023-09-03 01:00:00-03:00` but it is hard using `chrono` crate. + +statement error interpret +select date_trunc('day', '2023-09-03 12:00:00Z'::timestamptz, 'America/Santiago'); diff --git a/src/expr/impl/src/scalar/date_trunc.rs b/src/expr/impl/src/scalar/date_trunc.rs index ffc24fbfc8137..35ba48631bfc4 100644 --- a/src/expr/impl/src/scalar/date_trunc.rs +++ b/src/expr/impl/src/scalar/date_trunc.rs @@ -16,24 +16,39 @@ use risingwave_common::types::{Interval, Timestamp, Timestamptz}; use risingwave_expr::expr::BoxedExpression; use risingwave_expr::{build_function, function, ExprError, Result}; -use super::timestamptz::{timestamp_at_time_zone, timestamptz_at_time_zone}; +use super::timestamptz::timestamp_at_time_zone; + +// TODO(xiangjinwu): parse into an enum +const MICROSECONDS: &str = "microseconds"; +const MILLISECONDS: &str = "milliseconds"; +const SECOND: &str = "second"; +const MINUTE: &str = "minute"; +const HOUR: &str = "hour"; +const DAY: &str = "day"; +const WEEK: &str = "week"; +const MONTH: &str = "month"; +const QUARTER: &str = "quarter"; +const YEAR: &str = "year"; +const DECADE: &str = "decade"; +const CENTURY: &str = "century"; +const MILLENNIUM: &str = "millennium"; #[function("date_trunc(varchar, timestamp) -> timestamp")] pub fn date_trunc_timestamp(field: &str, ts: Timestamp) -> Result { Ok(match field.to_ascii_lowercase().as_str() { - "microseconds" => ts.truncate_micros(), - "milliseconds" => ts.truncate_millis(), - "second" => ts.truncate_second(), - "minute" => ts.truncate_minute(), - "hour" => ts.truncate_hour(), - "day" => ts.truncate_day(), - "week" => ts.truncate_week(), - "month" => ts.truncate_month(), - "quarter" => ts.truncate_quarter(), - "year" => ts.truncate_year(), - "decade" => ts.truncate_decade(), - "century" => ts.truncate_century(), - "millennium" => ts.truncate_millennium(), + MICROSECONDS => ts.truncate_micros(), + MILLISECONDS => ts.truncate_millis(), + SECOND => ts.truncate_second(), + MINUTE => ts.truncate_minute(), + HOUR => ts.truncate_hour(), + DAY => ts.truncate_day(), + WEEK => ts.truncate_week(), + MONTH => ts.truncate_month(), + QUARTER => ts.truncate_quarter(), + YEAR => ts.truncate_year(), + DECADE => ts.truncate_decade(), + CENTURY => ts.truncate_century(), + MILLENNIUM => ts.truncate_millennium(), _ => return Err(invalid_field_error(field)), }) } @@ -52,33 +67,52 @@ fn build_date_trunc_timestamptz_implicit_zone( #[function("date_trunc(varchar, timestamptz, varchar) -> timestamptz")] pub fn date_trunc_timestamptz_at_timezone( field: &str, - ts: Timestamptz, + tsz: Timestamptz, timezone: &str, ) -> Result { - let timestamp = timestamptz_at_time_zone(ts, timezone)?; - let truncated = date_trunc_timestamp(field, timestamp)?; - timestamp_at_time_zone(truncated, timezone) + use chrono::Offset as _; + + use super::timestamptz::time_zone_err; + + let tz = Timestamptz::lookup_time_zone(timezone).map_err(time_zone_err)?; + let instant_local = tsz.to_datetime_in_zone(tz); + + let truncated_naive = date_trunc_timestamp(field, Timestamp(instant_local.naive_local()))?; + + match field.to_ascii_lowercase().as_str() { + MICROSECONDS | MILLISECONDS | SECOND | MINUTE | HOUR => { + // When unit < day, follow PostgreSQL to use old timezone offset. + // rather than reinterpret it in the timezone. + // https://github.com/postgres/postgres/blob/REL_16_0/src/backend/utils/adt/timestamp.c#L4270 + // See `e2e_test/batch/functions/issue_12072.slt.part` for the difference. + let fixed = instant_local.offset().fix(); + // `unwrap` is okay because `FixedOffset` always returns single unique conversion result. + let truncated_local = truncated_naive.0.and_local_timezone(fixed).unwrap(); + Ok(Timestamptz::from_micros(truncated_local.timestamp_micros())) + } + _ => timestamp_at_time_zone(truncated_naive, timezone), + } } #[function("date_trunc(varchar, interval) -> interval")] pub fn date_trunc_interval(field: &str, interval: Interval) -> Result { Ok(match field.to_ascii_lowercase().as_str() { - "microseconds" => interval, - "milliseconds" => interval.truncate_millis(), - "second" => interval.truncate_second(), - "minute" => interval.truncate_minute(), - "hour" => interval.truncate_hour(), - "day" => interval.truncate_day(), - "week" => return Err(ExprError::UnsupportedFunction( + MICROSECONDS => interval, + MILLISECONDS => interval.truncate_millis(), + SECOND => interval.truncate_second(), + MINUTE => interval.truncate_minute(), + HOUR => interval.truncate_hour(), + DAY => interval.truncate_day(), + WEEK => return Err(ExprError::UnsupportedFunction( "interval units \"week\" not supported because months usually have fractional weeks" .into(), )), - "month" => interval.truncate_month(), - "quarter" => interval.truncate_quarter(), - "year" => interval.truncate_year(), - "decade" => interval.truncate_decade(), - "century" => interval.truncate_century(), - "millennium" => interval.truncate_millennium(), + MONTH => interval.truncate_month(), + QUARTER => interval.truncate_quarter(), + YEAR => interval.truncate_year(), + DECADE => interval.truncate_decade(), + CENTURY => interval.truncate_century(), + MILLENNIUM => interval.truncate_millennium(), _ => return Err(invalid_field_error(field)), }) } diff --git a/src/expr/impl/src/scalar/timestamptz.rs b/src/expr/impl/src/scalar/timestamptz.rs index a14eb0a36319b..002b0c773966e 100644 --- a/src/expr/impl/src/scalar/timestamptz.rs +++ b/src/expr/impl/src/scalar/timestamptz.rs @@ -133,19 +133,24 @@ pub fn timestamptz_interval_add( interval: Interval, time_zone: &str, ) -> Result { + use num_traits::Zero as _; + // A month may have 28-31 days, a day may have 23 or 25 hours during Daylight Saving switch. // So their interpretation depends on the local time of a specific zone. let qualitative = interval.truncate_day(); // Units smaller than `day` are zone agnostic. let quantitative = interval - qualitative; - // Note the current implementation simply follows the frontend-rewrite chains of exprs before - // refactor. There may be chances to improve. - let t = timestamptz_at_time_zone(input, time_zone)?; - let t = t - .checked_add(qualitative) - .ok_or(ExprError::NumericOverflow)?; - let t = timestamp_at_time_zone(t, time_zone)?; + let mut t = input; + if !qualitative.is_zero() { + // Only convert into and from naive local when necessary because it is lossy. + // See `e2e_test/batch/functions/issue_12072.slt.part` for the difference. + let naive = timestamptz_at_time_zone(t, time_zone)?; + let naive = naive + .checked_add(qualitative) + .ok_or(ExprError::NumericOverflow)?; + t = timestamp_at_time_zone(naive, time_zone)?; + } let t = timestamptz_interval_quantitative(t, quantitative, i64::checked_add)?; Ok(t) }