-
Notifications
You must be signed in to change notification settings - Fork 241
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support date_format via Gpu for non-UTC time zone [databricks] #9721
Changes from all commits
7625e16
6c961fb
4c6133e
92741c6
877130a
e3941c2
1538d82
4eaa4dd
7e61915
96924eb
f1314e0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -449,12 +449,32 @@ def test_gettimestamp_ansi_exception(): | |
supported_date_formats = ['yyyy-MM-dd', 'yyyy-MM', 'yyyy/MM/dd', 'yyyy/MM', 'dd/MM/yyyy', | ||
'MM-dd', 'MM/dd', 'dd-MM', 'dd/MM'] | ||
@pytest.mark.parametrize('date_format', supported_date_formats, ids=idfn) | ||
@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) | ||
@allow_non_gpu(*non_utc_allow) | ||
def test_date_format(data_gen, date_format): | ||
@pytest.mark.parametrize('data_gen', [date_gen], ids=idfn) | ||
@allow_non_gpu('ProjectExec') | ||
def test_date_format_for_date(data_gen, date_format): | ||
assert_gpu_and_cpu_are_equal_collect( | ||
lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format))) | ||
|
||
@pytest.mark.parametrize('date_format', supported_date_formats, ids=idfn) | ||
@pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) | ||
@pytest.mark.skipif(not is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") | ||
def test_date_format_for_time(data_gen, date_format): | ||
conf = {'spark.rapids.sql.nonUTC.enabled': True} | ||
assert_gpu_and_cpu_are_equal_collect( | ||
lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format)), | ||
conf) | ||
|
||
@pytest.mark.parametrize('date_format', supported_date_formats, ids=idfn) | ||
@pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) | ||
@pytest.mark.skipif(is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") | ||
@allow_non_gpu('ProjectExec') | ||
def test_date_format_for_time_fall_back(data_gen, date_format): | ||
conf = {'spark.rapids.sql.nonUTC.enabled': True} | ||
assert_gpu_fallback_collect( | ||
lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format)), | ||
'ProjectExec', | ||
conf) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: you can merge https://github.com/NVIDIA/spark-rapids/pull/9721/files#diff-ec41a643cf16a9946f6df2498c6d853812f526eb0a3f8d3299b4a911346e1703R491 here directly without creating conf variable. |
||
|
||
@pytest.mark.parametrize('date_format', supported_date_formats + ['yyyyMMdd'], ids=idfn) | ||
# from 0001-02-01 to 9999-12-30 to avoid 'year 0 is out of range' | ||
@pytest.mark.parametrize('data_gen', [LongGen(min_val=int(datetime(1, 2, 1).timestamp()), max_val=int(datetime(9999, 12, 30).timestamp()))], ids=idfn) | ||
|
@@ -576,7 +596,7 @@ def test_timestamp_seconds_long_overflow(): | |
error_message='long overflow') | ||
|
||
# For Decimal(20, 7) case, the data is both 'Overflow' and 'Rounding necessary', this case is to verify | ||
# that 'Rounding necessary' check is before 'Overflow' check. So we should make sure that every decimal | ||
# that 'Rounding necessary' check is before 'Overflow' check. So we should make sure that every decimal | ||
# value in test data is 'Rounding necessary' by setting full_precision=True to avoid leading and trailing zeros. | ||
# Otherwise, the test data will bypass the 'Rounding necessary' check and throw an 'Overflow' error. | ||
@pytest.mark.parametrize('data_gen', [DecimalGen(7, 7, full_precision=True), DecimalGen(20, 7, full_precision=True)], ids=idfn) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1681,8 +1681,9 @@ object GpuOverrides extends Logging { | |
.withPsNote(TypeEnum.STRING, "A limited number of formats are supported"), | ||
TypeSig.STRING)), | ||
(a, conf, p, r) => new UnixTimeExprMeta[DateFormatClass](a, conf, p, r) { | ||
override def isTimeZoneSupported = true | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is due to the fact that DateFormatClass is not considered a TimeZoneAwareExpression, but requires support for non-UTC timezones. It's the last check for timezone requirement in Expressions. By default, this flag is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems not a good practice to set some value as |
||
override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: code style. not sure whether a line break need here. |
||
GpuDateFormatClass(lhs, rhs, strfFormat) | ||
GpuDateFormatClass(lhs, rhs, strfFormat, a.timeZoneId) | ||
} | ||
), | ||
expr[ToUnixTimestamp]( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
need to check whether it's supported timezone?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
date_format(date) will introduce cast(date as timestamp) which is not supported in non-UTC now.
After we support cast(date as timestamp), then we will update this case.