Add date and timestamp support to to_json [databricks] #9600
Changes to the JSON integration tests (Python):
@@ -612,8 +612,8 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name):
     long_gen,
     pytest.param(float_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9350')),
     pytest.param(double_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9350')),
-    pytest.param(date_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')),
-    pytest.param(timestamp_gen, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9515')),
+    date_gen,
+    timestamp_gen,
     StringGen('[A-Za-z0-9\r\n\'"\\\\]{0,10}', nullable=True) \
         .with_special_case('\u1f600') \
         .with_special_case('"a"') \
@@ -628,7 +628,8 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name):
     pytest.param(True, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9517')),
     False
 ])
-def test_structs_to_json(spark_tmp_path, data_gen, ignore_null_fields, pretty):
+@pytest.mark.parametrize('timezone', ['UTC', 'Etc/UTC'])
+def test_structs_to_json(spark_tmp_path, data_gen, ignore_null_fields, pretty, timezone):

Review comment: Can we add a fallback test for non-UTC? (A sketch of one possible test follows this diff.)

     struct_gen = StructGen([
         ('a', data_gen),
         ("b", StructGen([('child', data_gen)], nullable=True)),
@@ -640,7 +641,8 @@ def test_structs_to_json(spark_tmp_path, data_gen, ignore_null_fields, pretty):
     gen = StructGen([('my_struct', struct_gen)], nullable=False)

     options = { 'ignoreNullFields': ignore_null_fields,
-                'pretty': pretty }
+                'pretty': pretty,
+                'timeZone': timezone}

     def struct_to_json(spark):
         df = gen_df(spark, gen)
Changes to GpuOverrides.scala:
@@ -3598,15 +3598,47 @@ object GpuOverrides extends Logging {
         TypeSig.STRING,
         Seq(ParamCheck("struct",
           (TypeSig.BOOLEAN + TypeSig.STRING + TypeSig.integral + TypeSig.FLOAT +
-            TypeSig.DOUBLE + TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP).nested(),
+            TypeSig.DOUBLE + TypeSig.DATE + TypeSig.TIMESTAMP +
+            TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP).nested(),
           (TypeSig.BOOLEAN + TypeSig.STRING + TypeSig.integral + TypeSig.FLOAT +
-            TypeSig.DOUBLE + TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP).nested()
+            TypeSig.DOUBLE + TypeSig.DATE + TypeSig.TIMESTAMP +
+            TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP).nested()
         ))),
       (a, conf, p, r) => new UnaryExprMeta[StructsToJson](a, conf, p, r) {
         override def tagExprForGpu(): Unit = {
           if (a.options.get("pretty").exists(_.equalsIgnoreCase("true"))) {
             willNotWorkOnGpu("to_json option pretty=true is not supported")
           }
+          val hasDates = TrampolineUtil.dataTypeExistsRecursively(a.child.dataType,
+            _.isInstanceOf[DateType])
+          if (hasDates) {
+            // check if the default format is being used
+            val defaultFormat = "yyyy-MM-dd"
+            val dateFormat = a.options.getOrElse("dateFormat", defaultFormat)
+            if (dateFormat != defaultFormat) {
+              // we can likely support other formats but we would need to add tests
+              // tracking issue is https://github.com/NVIDIA/spark-rapids/issues/9602
+              willNotWorkOnGpu(s"Unsupported dateFormat '$dateFormat' in to_json")
+            }
+          }
+          val hasTimestamps = TrampolineUtil.dataTypeExistsRecursively(a.child.dataType,
+            _.isInstanceOf[TimestampType])
+          if (hasTimestamps) {
+            // check if the default format is being used
+            val defaultFormat = "yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]"
+            val timestampFormat = a.options.getOrElse("timestampFormat", defaultFormat)

Review comment: I don't think this does 100% of what we want. Include a check for the legacyTimeParserPolicy. Either we need to use …

+            if (timestampFormat != defaultFormat) {
+              // we can likely support other formats but we would need to add tests
+              // tracking issue is https://github.com/NVIDIA/spark-rapids/issues/9602
+              willNotWorkOnGpu(s"Unsupported timestampFormat '$timestampFormat' in to_json")
+            }
+            val timeZone = a.options.getOrElse("timeZone", SQLConf.get.sessionLocalTimeZone)
+            if (timeZone != "UTC" && timeZone != "Etc/UTC") {
+              // we hard-code the timezone `Z` in GpuCast.castTimestampToJson
+              // so we need to fall back if a different timeZone is specified
+              willNotWorkOnGpu(s"Unsupported timeZone '$timeZone' in to_json")
+            }
+          }
         }

         override def convertToGpu(child: Expression): GpuExpression =
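For reference, a small PySpark sketch (not part of the PR) of how the tagging logic above plays out: with the default dateFormat and timestampFormat and a UTC time zone the expression can stay on the GPU, while a non-default dateFormat, timestampFormat, or timeZone hits willNotWorkOnGpu and the projection falls back to the CPU. The DataFrame and column names here are made up for illustration.

    # Illustration only; not from the PR.
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()
    df = (spark.createDataFrame([("2023-10-31", "2023-10-31 12:34:56")], ["d", "t"])
          .selectExpr("CAST(d AS DATE) AS d", "CAST(t AS TIMESTAMP) AS t"))

    # Eligible for the GPU: default formats and a UTC time zone
    df.select(F.to_json(F.struct("d", "t"), {"timeZone": "UTC"})).show(truncate=False)

    # Tagged willNotWorkOnGpu: non-default dateFormat
    # (broader format support is tracked in https://github.com/NVIDIA/spark-rapids/issues/9602)
    df.select(F.to_json(F.struct("d", "t"), {"dateFormat": "dd/MM/yyyy"})).show(truncate=False)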
Review comment: Can we add some fallback tests for the options timestampFormat, dateFormat, and for when "spark.sql.legacy.timeParserPolicy" is set to LEGACY? At least from the code, all of these impact the format that is used to write the JSON output.
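A possible shape for those fallback tests, covering the dateFormat and timestampFormat options and the LEGACY time parser policy raised in the inline comment above. This is only a sketch under the same assumptions as the earlier one (allow_non_gpu, assert_gpu_fallback_collect, gen_df and the data generators from the test framework, ProjectExec as the fallback class); the conf keys would need to be verified against the plugin.

    # Sketch only: assumes the same imports and helpers as the earlier fallback sketch.
    @allow_non_gpu('ProjectExec')
    @pytest.mark.parametrize('options,extra_conf', [
        ({ 'dateFormat': 'dd/MM/yyyy' }, {}),
        ({ 'timestampFormat': 'yyyy/MM/dd HH:mm:ss' }, {}),
        ({}, { 'spark.sql.legacy.timeParserPolicy': 'LEGACY' }),
    ])
    def test_structs_to_json_fallback_formats(options, extra_conf):
        gen = StructGen([('my_struct', StructGen([('a', date_gen), ('b', timestamp_gen)]))],
                        nullable=False)
        def struct_to_json(spark):
            return gen_df(spark, gen).select(f.to_json('my_struct', options).alias('json'))
        conf = { 'spark.rapids.sql.expression.StructsToJson': 'true' }
        conf.update(extra_conf)
        # each combination should force the projection back onto the CPU
        assert_gpu_fallback_collect(struct_to_json, 'ProjectExec', conf=conf)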