Fix from_json function failure when input contains empty or null strings #8526
Changes from all commits: 1807edf, f1092fb, 2f7e2ad, 5674788, b07674d, f6583c5, 9a003c3
@@ -33,12 +33,40 @@ case class GpuJsonToStructs(
     timeZoneId: Option[String] = None)
   extends GpuUnaryExpression with TimeZoneAwareExpression with ExpectsInputTypes
   with NullIntolerant {

+  private def constructEmptyRow(schema: DataType): String = {
+    schema match {
+      case struct: StructType if (struct.fields.length > 0) => {
+        val jsonFields: Array[String] = struct.fields.map { field =>
+          field.dataType match {
+            case IntegerType => s""""${field.name}": 0"""
+            case StringType => s""""${field.name}": """""
+            case s: StructType => s""""${field.name}": ${constructEmptyRow(s)}"""
+            case a: ArrayType => s""""${field.name}": ${constructEmptyRow(a)}"""
+            case t => throw new IllegalArgumentException("GpuJsonToStructs currently " +
+              s"does not support input schema with type $t.")
+          }
+        }
+        jsonFields.mkString("{", ", ", "}")
+      }
+      case array: ArrayType => s"[${constructEmptyRow(array.elementType)}]"
+      case _ => "{}"
+    }
+  }
+
+  lazy val emptyRowStr = constructEmptyRow(schema)
+
-  private def cleanAndConcat(input: cudf.ColumnVector): (cudf.ColumnVector, cudf.ColumnVector) ={
-    withResource(cudf.Scalar.fromString("{}")) { emptyRow =>
-      val stripped = withResource(cudf.Scalar.fromString(" ")) { space =>
-        input.strip(space)
+  private def cleanAndConcat(input: cudf.ColumnVector): (cudf.ColumnVector, cudf.ColumnVector) = {
+    withResource(cudf.Scalar.fromString(emptyRowStr)) { emptyRow =>
+
+      val stripped = if (input.getData == null) {
+        input.incRefCount
+      } else {
+        withResource(cudf.Scalar.fromString(" ")) { space =>
+          input.strip(space)
Reviewer (revans2): Can we look for other places that strip, rstrip, and lstrip are called? I did a quick look and I think we might get the same problem on

spark-rapids/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala, line 306 (at e3b9517)
spark-rapids/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala, line 286 (at e3b9517)

But there may be others. At a minimum we need to test these too and fix them if they are also broken.

Author: @revans2 Thanks! I tested the call sites above. I didn't find tests for the above rstrip() call site, and haven't added a fix yet.

Reviewer: I think that is fine. That is specifically for a special-case ORC char type. We should file an issue for it just so it is not missed, but it should be really rare.
+        }
+      }
+
       withResource(stripped) { stripped =>
         val isNullOrEmptyInput = withResource(input.isNull) { isNull =>
           val isEmpty = withResource(stripped.getCharLengths) { lengths =>

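For context, here is a minimal standalone sketch of the schema-driven placeholder-row idea shown above. The object and method names (`EmptyRowSketch`, `buildEmptyRow`) and the sample schema are illustrative, not from the PR; the recursion mirrors the struct, array, string, and integer cases in the diff.

```scala
import org.apache.spark.sql.types._

// Illustrative re-implementation of the placeholder-row construction, for reading only.
object EmptyRowSketch {
  def buildEmptyRow(dt: DataType): String = dt match {
    case st: StructType if st.fields.nonEmpty =>
      st.fields.map { f =>
        f.dataType match {
          case IntegerType   => s""""${f.name}": 0"""
          case StringType    => s""""${f.name}": """""
          case s: StructType => s""""${f.name}": ${buildEmptyRow(s)}"""
          case a: ArrayType  => s""""${f.name}": ${buildEmptyRow(a)}"""
          case t => throw new IllegalArgumentException(s"unsupported type $t")
        }
      }.mkString("{", ", ", "}")
    case a: ArrayType => s"[${buildEmptyRow(a.elementType)}]"
    case _ => "{}"
  }

  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("a", IntegerType),
      StructField("b", StructType(Seq(StructField("c", StringType))))))
    // Prints: {"a": 0, "b": {"c": ""}}
    println(buildEmptyRow(schema))
  }
}
```

The generated string is what null or empty input rows are replaced with before the batch is handed to the JSON reader, so that every row parses against the full schema.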
@@ -86,14 +114,14 @@ case class GpuJsonToStructs(
   // Output = [(null, StringType), ("b", StringType), ("a", IntegerType)]
   private def processFieldNames(names: Seq[(String, DataType)]): Seq[(String, DataType)] = {
     val zero = (Set.empty[String], Seq.empty[(String, DataType)])
-    val (_, res) = names.foldRight(zero) { case ((name, dtype), (existingNames, acc)) =>
+    val (_, resultFields) = names.foldRight(zero) { case ((name, dtype), (existingNames, acc)) =>
       if (existingNames(name)) {
         (existingNames, (null, dtype) +: acc)
       } else {
         (existingNames + name, (name, dtype) +: acc)
       }
     }
-    res
+    resultFields
   }

   // Given a cudf column, return its Spark type
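The duplicate-name handling above can be exercised in isolation. Below is a sketch with the same fold (copied for illustration, not the plugin's code); the sample input is an assumption consistent with the `// Output = ...` comment shown in the diff: only the last occurrence of a duplicated name keeps its name, earlier ones are nulled out, and order is preserved.

```scala
import org.apache.spark.sql.types._

// Standalone copy of the duplicate-field-name handling, for illustration.
object FieldNameDedupSketch {
  def processFieldNames(names: Seq[(String, DataType)]): Seq[(String, DataType)] = {
    val zero = (Set.empty[String], Seq.empty[(String, DataType)])
    val (_, result) = names.foldRight(zero) { case ((name, dtype), (seen, acc)) =>
      if (seen(name)) (seen, (null, dtype) +: acc)   // earlier duplicate: drop the name
      else (seen + name, (name, dtype) +: acc)       // first time seen (from the right): keep it
    }
    result
  }

  def main(args: Array[String]): Unit = {
    val in = Seq(("a", StringType), ("b", StringType), ("a", IntegerType))
    // Prints: List((null,StringType), (b,StringType), (a,IntegerType))
    println(processFieldNames(in))
  }
}
```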
@@ -151,8 +179,7 @@ case class GpuJsonToStructs(
       }

       // process duplicated field names in input struct schema
-      val fieldNames = processFieldNames(struct.fields.map { field =>
-        (field.name, field.dataType)})
+      val fieldNames = processFieldNames(struct.fields.map(f => (f.name, f.dataType)))

       withResource(rawTable) { rawTable =>
         // Step 5: verify that the data looks correct

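To see the scenario this PR targets, here is a small Spark sketch that feeds null, empty, and whitespace-only strings to from_json alongside a valid document. It is untested and the session setup, column name, and schema are assumptions; the point is to compare CPU and GPU (plugin-enabled) output for these rows.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types._

// Minimal repro sketch: from_json over null / empty / whitespace-only JSON strings.
object FromJsonEmptyInputSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("from_json-empty").getOrCreate()
    import spark.implicits._

    val schema = StructType(Seq(
      StructField("a", IntegerType),
      StructField("b", StringType)))

    val df = Seq[String](null, "", "   ", """{"a": 1, "b": "x"}""").toDF("json")

    // The fix is about making the GPU path handle the null/empty/whitespace rows
    // without failing; run with and without the RAPIDS plugin to compare results.
    df.select(from_json($"json", schema).alias("parsed")).show(truncate = false)

    spark.stop()
  }
}
```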
Review comment: Sorry, does Spark really return a 0 and not a null for an empty int column??? What happens if a field name has odd characters in it? Can it have a " for example?

Author: I am not certain about the Spark side. I will investigate more. For odd characters, I think it would fail at the readJSON parsing step. I will follow up with this in #8542. Thank you so much!
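As a quick illustration of the reviewer's concern (the field name here is hypothetical, not from the PR): a struct field whose name contains a double quote would make the generated placeholder row malformed JSON, so, as the author notes, it would surface as a parse failure rather than being handled explicitly.

```scala
// Sketch only: an unescaped quote in a field name breaks the string-template approach.
object OddFieldNameSketch {
  def main(args: Array[String]): Unit = {
    val fieldName = """a"b"""                     // hypothetical field name with an embedded quote
    val placeholder = s""""$fieldName": 0"""
    // Prints: {"a"b": 0}  -- the unescaped quote makes this malformed JSON.
    println(s"{$placeholder}")
  }
}
```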