-
Notifications
You must be signed in to change notification settings - Fork 3.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix json inputs for drill windowing tests #15148
Changes from all commits
c5921dc
ab166fe
b5d385a
6a4cdea
029865f
869bc5d
c3bab16
41239d1
26807c0
c330f6c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,10 +52,55 @@ public class ParquetGroupConverter | |
private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1); | ||
|
||
/** | ||
* See {@link ParquetGroupConverter#convertField(Group, String)} | ||
* https://github.com/apache/drill/blob/2ab46a9411a52f12a0f9acb1144a318059439bc4/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetReaderUtility.java#L89 | ||
*/ | ||
public static final long CORRECT_CORRUPT_DATE_SHIFT = 2 * JULIAN_EPOCH_OFFSET_DAYS; | ||
|
||
private final boolean binaryAsString; | ||
private final boolean convertCorruptDates; | ||
|
||
/**
 * Creates a converter configured with the two conversion switches.
 *
 * @param binaryAsString      stored for use by the field conversion routines
 *                            (NOTE(review): presumably makes binary columns come out as strings;
 *                            the code that reads this flag is outside this view, so confirm)
 * @param convertCorruptDates when true, DATE values have {@link #CORRECT_CORRUPT_DATE_SHIFT}
 *                            subtracted before being converted to milliseconds
 */
public ParquetGroupConverter(boolean binaryAsString, boolean convertCorruptDates)
{
  this.convertCorruptDates = convertCorruptDates;
  this.binaryAsString = binaryAsString;
}
|
||
/** | ||
* Recursively converts a group into native Java Map | ||
* | ||
* @param g the group | ||
* @return the native Java object | ||
*/ | ||
public Object convertGroup(Group g) | ||
{ | ||
Map<String, Object> retVal = new LinkedHashMap<>(); | ||
|
||
for (Type field : g.getType().getFields()) { | ||
final String fieldName = field.getName(); | ||
retVal.put(fieldName, convertField(g, fieldName)); | ||
} | ||
|
||
return retVal; | ||
} | ||
|
||
Object unwrapListElement(Object o) | ||
{ | ||
if (o instanceof Group) { | ||
Group g = (Group) o; | ||
return convertListElement(g); | ||
} | ||
return o; | ||
} | ||
|
||
/** | ||
* Convert a parquet group field as though it were a map. Logical types of 'list' and 'map' will be transformed | ||
* into java lists and maps respectively ({@link ParquetGroupConverter#convertLogicalList} and | ||
* {@link ParquetGroupConverter#convertLogicalMap}), repeated fields will also be translated to lists, and | ||
* primitive types will be extracted into an ingestion friendly state (e.g. 'int' and 'long'). Finally, | ||
* if a field is not present, this method will return null. | ||
*/ | ||
@Nullable | ||
private static Object convertField(Group g, String fieldName, boolean binaryAsString) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just double checking, this Any changes you want to make to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
Object convertField(Group g, String fieldName) | ||
{ | ||
if (!g.getType().containsField(fieldName)) { | ||
return null; | ||
|
@@ -76,22 +121,22 @@ private static Object convertField(Group g, String fieldName, boolean binaryAsSt | |
int repeated = g.getFieldRepetitionCount(fieldIndex); | ||
List<Object> vals = new ArrayList<>(); | ||
for (int i = 0; i < repeated; i++) { | ||
vals.add(convertPrimitiveField(g, fieldIndex, i, binaryAsString)); | ||
vals.add(convertPrimitiveField(g, fieldIndex, i)); | ||
} | ||
return vals; | ||
} | ||
return convertPrimitiveField(g, fieldIndex, binaryAsString); | ||
return convertPrimitiveField(g, fieldIndex); | ||
} else { | ||
if (fieldType.isRepetition(Type.Repetition.REPEATED)) { | ||
return convertRepeatedFieldToList(g, fieldIndex, binaryAsString); | ||
return convertRepeatedFieldToList(g, fieldIndex); | ||
} | ||
|
||
if (isLogicalMapType(fieldType)) { | ||
return convertLogicalMap(g.getGroup(fieldIndex, 0), binaryAsString); | ||
return convertLogicalMap(g.getGroup(fieldIndex, 0)); | ||
} | ||
|
||
if (isLogicalListType(fieldType)) { | ||
return convertLogicalList(g.getGroup(fieldIndex, 0), binaryAsString); | ||
return convertLogicalList(g.getGroup(fieldIndex, 0)); | ||
} | ||
|
||
// not a list, but not a primitive, return the nested group type | ||
|
@@ -102,7 +147,7 @@ private static Object convertField(Group g, String fieldName, boolean binaryAsSt | |
/** | ||
* convert a repeated field into a list of primitives or groups | ||
*/ | ||
private static List<Object> convertRepeatedFieldToList(Group g, int fieldIndex, boolean binaryAsString) | ||
private List<Object> convertRepeatedFieldToList(Group g, int fieldIndex) | ||
{ | ||
|
||
Type t = g.getType().getFields().get(fieldIndex); | ||
|
@@ -111,7 +156,7 @@ private static List<Object> convertRepeatedFieldToList(Group g, int fieldIndex, | |
List<Object> vals = new ArrayList<>(); | ||
for (int i = 0; i < repeated; i++) { | ||
if (t.isPrimitive()) { | ||
vals.add(convertPrimitiveField(g, fieldIndex, i, binaryAsString)); | ||
vals.add(convertPrimitiveField(g, fieldIndex, i)); | ||
} else { | ||
vals.add(g.getGroup(fieldIndex, i)); | ||
} | ||
|
@@ -134,7 +179,7 @@ private static boolean isLogicalListType(Type listType) | |
/** | ||
* convert a parquet 'list' logical type {@link Group} to a java list of primitives or groups | ||
*/ | ||
private static List<Object> convertLogicalList(Group g, boolean binaryAsString) | ||
private List<Object> convertLogicalList(Group g) | ||
{ | ||
/* | ||
// List<Integer> (nullable list, non-null elements) | ||
|
@@ -181,16 +226,16 @@ optional group my_list (LIST) { | |
|
||
for (int i = 0; i < repeated; i++) { | ||
if (isListItemPrimitive) { | ||
vals.add(convertPrimitiveField(g, 0, i, binaryAsString)); | ||
vals.add(convertPrimitiveField(g, 0, i)); | ||
} else { | ||
Group listItem = g.getGroup(0, i); | ||
vals.add(convertListElement(listItem, binaryAsString)); | ||
vals.add(convertListElement(listItem)); | ||
} | ||
} | ||
return vals; | ||
} | ||
|
||
private static Object convertListElement(Group listItem, boolean binaryAsString) | ||
private Object convertListElement(Group listItem) | ||
{ | ||
if ( | ||
listItem.getType().isRepetition(Type.Repetition.REPEATED) && | ||
|
@@ -199,7 +244,7 @@ private static Object convertListElement(Group listItem, boolean binaryAsString) | |
listItem.getType().getFields().get(0).isPrimitive() | ||
) { | ||
// nullable primitive list elements can have a repeating wrapper element, peel it off | ||
return convertPrimitiveField(listItem, 0, binaryAsString); | ||
return convertPrimitiveField(listItem, 0); | ||
} else if ( | ||
listItem.getType().isRepetition(Type.Repetition.REPEATED) && | ||
listItem.getType().getFieldCount() == 1 && | ||
|
@@ -244,7 +289,7 @@ private static boolean isLogicalMapType(Type groupType) | |
/** | ||
* Convert a parquet 'map' logical type {@link Group} to a java map of string keys to groups/lists/primitive values | ||
*/ | ||
private static Map<String, Object> convertLogicalMap(Group g, boolean binaryAsString) | ||
private Map<String, Object> convertLogicalMap(Group g) | ||
{ | ||
/* | ||
// Map<String, Integer> (nullable map, non-null values) | ||
|
@@ -268,8 +313,8 @@ optional group my_map (MAP_KEY_VALUE) {( | |
Map<String, Object> converted = new HashMap<>(); | ||
for (int i = 0; i < mapEntries; i++) { | ||
Group mapEntry = g.getGroup(0, i); | ||
String key = convertPrimitiveField(mapEntry, 0, binaryAsString).toString(); | ||
Object value = convertField(mapEntry, "value", binaryAsString); | ||
String key = convertPrimitiveField(mapEntry, 0).toString(); | ||
Object value = convertField(mapEntry, "value"); | ||
converted.put(key, value); | ||
} | ||
return converted; | ||
|
@@ -281,17 +326,17 @@ optional group my_map (MAP_KEY_VALUE) {( | |
* @return "ingestion ready" java object, or null | ||
*/ | ||
@Nullable | ||
private static Object convertPrimitiveField(Group g, int fieldIndex, boolean binaryAsString) | ||
private Object convertPrimitiveField(Group g, int fieldIndex) | ||
{ | ||
PrimitiveType pt = (PrimitiveType) g.getType().getFields().get(fieldIndex); | ||
if (pt.isRepetition(Type.Repetition.REPEATED) && g.getFieldRepetitionCount(fieldIndex) > 1) { | ||
List<Object> vals = new ArrayList<>(); | ||
for (int i = 0; i < g.getFieldRepetitionCount(fieldIndex); i++) { | ||
vals.add(convertPrimitiveField(g, fieldIndex, i, binaryAsString)); | ||
vals.add(convertPrimitiveField(g, fieldIndex, i)); | ||
} | ||
return vals; | ||
} | ||
return convertPrimitiveField(g, fieldIndex, 0, binaryAsString); | ||
return convertPrimitiveField(g, fieldIndex, 0); | ||
} | ||
|
||
/** | ||
|
@@ -300,7 +345,7 @@ private static Object convertPrimitiveField(Group g, int fieldIndex, boolean bin | |
* @return "ingestion ready" java object, or null | ||
*/ | ||
@Nullable | ||
private static Object convertPrimitiveField(Group g, int fieldIndex, int index, boolean binaryAsString) | ||
private Object convertPrimitiveField(Group g, int fieldIndex, int index) | ||
{ | ||
PrimitiveType pt = (PrimitiveType) g.getType().getFields().get(fieldIndex); | ||
OriginalType ot = pt.getOriginalType(); | ||
|
@@ -310,7 +355,7 @@ private static Object convertPrimitiveField(Group g, int fieldIndex, int index, | |
// convert logical types | ||
switch (ot) { | ||
case DATE: | ||
long ts = g.getInteger(fieldIndex, index) * MILLIS_IN_DAY; | ||
long ts = convertDateToMillis(g.getInteger(fieldIndex, index)); | ||
return ts; | ||
case TIME_MICROS: | ||
return g.getLong(fieldIndex, index); | ||
|
@@ -443,6 +488,14 @@ private static Object convertPrimitiveField(Group g, int fieldIndex, int index, | |
} | ||
} | ||
|
||
private long convertDateToMillis(int value) | ||
{ | ||
if (convertCorruptDates) { | ||
value -= CORRECT_CORRUPT_DATE_SHIFT; | ||
} | ||
return value * MILLIS_IN_DAY; | ||
} | ||
|
||
/** | ||
* convert deprecated parquet int96 nanosecond timestamp to a long, based on | ||
* https://github.com/prestodb/presto/blob/master/presto-parquet/src/main/java/com/facebook/presto/parquet/ParquetTimestampUtils.java#L44 | ||
|
@@ -490,51 +543,4 @@ private static BigDecimal convertBinaryToDecimal(Binary value, int precision, in | |
return new BigDecimal(new BigInteger(value.getBytes()), scale); | ||
} | ||
} | ||
|
||
private final boolean binaryAsString; | ||
|
||
public ParquetGroupConverter(boolean binaryAsString) | ||
{ | ||
this.binaryAsString = binaryAsString; | ||
} | ||
|
||
/** | ||
* Recursively converts a group into native Java Map | ||
* | ||
* @param g the group | ||
* @return the native Java object | ||
*/ | ||
public Object convertGroup(Group g) | ||
{ | ||
Map<String, Object> retVal = new LinkedHashMap<>(); | ||
|
||
for (Type field : g.getType().getFields()) { | ||
final String fieldName = field.getName(); | ||
retVal.put(fieldName, convertField(g, fieldName)); | ||
} | ||
|
||
return retVal; | ||
} | ||
|
||
/** | ||
* Convert a parquet group field as though it were a map. Logical types of 'list' and 'map' will be transformed | ||
* into java lists and maps respectively ({@link ParquetGroupConverter#convertLogicalList} and | ||
* {@link ParquetGroupConverter#convertLogicalMap}), repeated fields will also be translated to lists, and | ||
* primitive types will be extracted into an ingestion friendly state (e.g. 'int' and 'long'). Finally, | ||
* if a field is not present, this method will return null. | ||
*/ | ||
@Nullable | ||
Object convertField(Group g, String fieldName) | ||
{ | ||
return convertField(g, fieldName, binaryAsString); | ||
} | ||
|
||
Object unwrapListElement(Object o) | ||
{ | ||
if (o instanceof Group) { | ||
Group g = (Group) o; | ||
return convertListElement(g, binaryAsString); | ||
} | ||
return o; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a design nit: with the change to remove the `static` label from the methods, I would expect them to start coming after the constructor. That is, we tend to follow the code flow of static first, then constructor, then class methods. I noticed that it was inverted because I kept searching for the constructor at the top of the file and didn't see it, and then realized it was at the bottom, and that's because the methods used to be static but now are not. The current structure reads really nicely for the diff though; I hope that the whitespace change that I'm nitpicking doesn't screw up the diff...