From c62c5f69ca5036d69188ab8e43ac2ab5276d6cfa Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 26 Apr 2024 04:02:25 -0500 Subject: [PATCH] Fix a JNI bug in JSON parsing fixup (#15550) When the current code parses JSON and no columns can be parsed out of the data, an empty table is returned. Earlier we put in a workaround for this so that we could pass in the number of rows needed and the JSON parsing code would make a table of null values for it. That workaround had some issues with structs and lists, which need an extended way to produce the null scalar. This adds code to do just that. Authors: - Robert (Bobby) Evans (https://github.com/revans2) - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/15550 --- java/src/main/java/ai/rapids/cudf/Schema.java | 28 ++++++++++++++++++- java/src/main/java/ai/rapids/cudf/Table.java | 22 +++++++++++++-- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java index c8571dd841c..43603386649 100644 --- a/java/src/main/java/ai/rapids/cudf/Schema.java +++ b/java/src/main/java/ai/rapids/cudf/Schema.java @@ -20,6 +20,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; /** * The schema of data to be read in. 
@@ -221,6 +222,13 @@ public DType[] getChildTypes() { return ret; } + public int getNumChildren() { + if (childSchemas == null) { + return 0; + } + return childSchemas.size(); + } + int[] getFlattenedNumChildren() { flattenIfNeeded(); return flattenedCounts; @@ -243,7 +251,25 @@ public boolean isStructOrHasStructDescendant() { return false; } - public static class Builder { + public HostColumnVector.DataType asHostDataType() { + if (topLevelType == DType.LIST) { + assert(childSchemas != null && childSchemas.size() == 1); + HostColumnVector.DataType element = childSchemas.get(0).asHostDataType(); + return new HostColumnVector.ListType(true, element); + } else if (topLevelType == DType.STRUCT) { + if (childSchemas == null) { + return new HostColumnVector.StructType(true); + } else { + List childTypes = + childSchemas.stream().map(Schema::asHostDataType).collect(Collectors.toList()); + return new HostColumnVector.StructType(true, childTypes); + } + } else { + return new HostColumnVector.BasicType(true, topLevelType); + } + } + + public static class Builder { private final DType topLevelType; private final List names; private final List types; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4038b3a40b8..4e737451ed6 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1220,8 +1220,26 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp columns[i] = tbl.getColumn(index).incRefCount(); } } else { - try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, rowCount); + if (types[i] == DType.LIST) { + Schema listSchema = schema.getChild(i); + Schema elementSchema = listSchema.getChild(0); + try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) { + columns[i] = ColumnVector.fromScalar(s, rowCount); + } + } else if (types[i] == DType.STRUCT) { + Schema structSchema = 
schema.getChild(i); + int numStructChildren = structSchema.getNumChildren(); + DataType[] structChildrenTypes = new DataType[numStructChildren]; + for (int j = 0; j < numStructChildren; j++) { + structChildrenTypes[j] = structSchema.getChild(j).asHostDataType(); + } + try (Scalar s = Scalar.structFromNull(structChildrenTypes)) { + columns[i] = ColumnVector.fromScalar(s, rowCount); + } + } else { + try (Scalar s = Scalar.fromNull(types[i])) { + columns[i] = ColumnVector.fromScalar(s, rowCount); + } } } }