Skip to content

Commit

Permalink
store auto columns with only empty or null containing arrays as ARRAY…
Browse files Browse the repository at this point in the history
…<LONG> instead of COMPLEX<json> (apache#15505)
  • Loading branch information
clintropolis authored Dec 7, 2023
1 parent 801967b commit c241c69
Show file tree
Hide file tree
Showing 8 changed files with 79 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ public SortedMap<String, FieldTypeInfo.MutableTypeSet> getFieldTypeInfo()
final TreeMap<String, FieldTypeInfo.MutableTypeSet> fields = new TreeMap<>();
for (Map.Entry<String, FieldIndexer> entry : fieldIndexers.entrySet()) {
// skip adding the field if no types are in the set, meaning only null values have been processed
if (!entry.getValue().getTypes().isEmpty()) {
if (!entry.getValue().getTypes().isEmpty() || entry.getValue().getTypes().hasUntypedArray()) {
fields.put(entry.getKey(), entry.getValue().getTypes());
}
}
Expand Down Expand Up @@ -421,6 +421,10 @@ public ColumnType getLogicalType()
}
return ColumnTypeFactory.getInstance().ofArray(logicalType);
}
// if we only have empty or null arrays, ARRAY<LONG> is the most restrictive type we can pick
if (rootField.getTypes().hasUntypedArray()) {
return ColumnType.LONG_ARRAY;
}
}
return ColumnType.NESTED_DATA;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ public ColumnType getSingleType()
if (hasEmptyArray && columnType != null && !columnType.isArray()) {
return null;
}
// if column only has empty arrays, call it long array
if (types == 0x00 && hasEmptyArray) {
return ColumnType.LONG_ARRAY;
}
return columnType;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
Expand Down Expand Up @@ -438,6 +439,30 @@ public void testGroupByRootArrayLongElementString()
);
}

/**
 * Groups by the "arrayNoType" column, read as ARRAY<LONG>, and counts the rows in each group.
 * The expected groups are: null, the empty array, [null], and [null, null].
 */
@Test
public void testGroupByEmptyIshArrays()
{
  final GroupByQuery.Builder builder = GroupByQuery.builder();
  builder.setDataSource("test_datasource");
  builder.setGranularity(Granularities.ALL);
  builder.setInterval(Intervals.ETERNITY);
  // the column is requested explicitly as a LONG array dimension
  builder.setDimensions(DefaultDimensionSpec.of("arrayNoType", ColumnType.LONG_ARRAY));
  builder.setAggregatorSpecs(new CountAggregatorFactory("count"));
  builder.setContext(getContext());
  final GroupByQuery emptyIshQuery = builder.build();

  // one expected row per distinct grouping key: {key, count}
  final Object[] nullGroup = new Object[]{null, 4L};
  final Object[] emptyArrayGroup = new Object[]{new ComparableList<>(Collections.emptyList()), 18L};
  final Object[] singleNullGroup = new Object[]{new ComparableList<>(Collections.singletonList(null)), 4L};
  final Object[] doubleNullGroup = new Object[]{new ComparableList<>(Arrays.asList(null, null)), 2L};

  runResults(
      emptyIshQuery,
      ImmutableList.of(nullGroup, emptyArrayGroup, singleNullGroup, doubleNullGroup)
  );
}

private void runResults(
GroupByQuery groupQuery,
List<Object[]> expectedResults
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,7 @@ public void testConstantEmptyArray()
Assert.assertFalse(indexer.hasNulls);
Assert.assertFalse(indexer.hasNestedData);
Assert.assertTrue(indexer.isConstant());
Assert.assertEquals(ColumnType.NESTED_DATA, indexer.getLogicalType());
Assert.assertEquals(ColumnType.LONG_ARRAY, indexer.getLogicalType());
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,21 @@ public void testMultiType() throws IOException
}
}

/**
 * A type set that has only seen an untyped (empty or all-null) array reports ARRAY<LONG> as its
 * single type, while its byte value still converts to an empty set of column types.
 */
@Test
public void testOnlyEmptyType()
{
  final FieldTypeInfo.MutableTypeSet untypedOnly = new FieldTypeInfo.MutableTypeSet();

  // a freshly created set has no single type and is empty
  Assert.assertNull(untypedOnly.getSingleType());
  Assert.assertTrue(untypedOnly.isEmpty());

  untypedOnly.addUntypedArray();

  final ColumnType singleType = untypedOnly.getSingleType();
  Assert.assertEquals(ColumnType.LONG_ARRAY, singleType);
  // the byte value still carries no concrete types; LONG_ARRAY is only surfaced through getSingleType()
  Assert.assertEquals(ImmutableSet.of(), FieldTypeInfo.convertToSet(untypedOnly.getByteValue()));
  Assert.assertTrue(untypedOnly.hasUntypedArray());
}

@Test
public void testEqualsAndHashCode()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,13 @@ public class VariantColumnSupplierTest extends InitializedNullHandlingTest
Arrays.asList(null, 3.3)
);

// Test rows whose arrays never contain a typed element: two empty arrays, one null row,
// and one array holding only nulls.
static List<List<Object>> NO_TYPE_ARRAY = Arrays.asList(
Collections.emptyList(),
null,
Collections.emptyList(),
Arrays.asList(null, null)
);


@BeforeClass
public static void staticSetup()
Expand Down Expand Up @@ -186,7 +193,9 @@ public static Collection<?> constructorFeeder()
new Object[]{"ARRAY<LONG>,ARRAY<STRING>,DOUBLE,LONG,STRING", VARIANT_SCALAR_AND_ARRAY, IndexSpec.DEFAULT},
new Object[]{"ARRAY<LONG>,ARRAY<STRING>,DOUBLE,LONG,STRING", VARIANT_SCALAR_AND_ARRAY, fancy},
new Object[]{"ARRAY<DOUBLE>,ARRAY<LONG>,ARRAY<STRING>", VARIANT_ARRAY, IndexSpec.DEFAULT},
new Object[]{"ARRAY<DOUBLE>,ARRAY<LONG>,ARRAY<STRING>", VARIANT_ARRAY, fancy}
new Object[]{"ARRAY<DOUBLE>,ARRAY<LONG>,ARRAY<STRING>", VARIANT_ARRAY, fancy},
new Object[]{"ARRAY<LONG>", NO_TYPE_ARRAY, IndexSpec.DEFAULT},
new Object[]{"ARRAY<LONG>", NO_TYPE_ARRAY, fancy}
);

return constructors;
Expand Down Expand Up @@ -254,6 +263,9 @@ private SmooshedFileMapper smooshify(
for (ColumnType type : FieldTypeInfo.convertToSet(expectedTypes.getByteValue())) {
expectedLogicalType = ColumnType.leastRestrictiveType(expectedLogicalType, type);
}
if (expectedLogicalType == null && sortedFields.get(NestedPathFinder.JSON_PATH_ROOT).hasUntypedArray()) {
expectedLogicalType = ColumnType.LONG_ARRAY;
}
VariantColumnSerializer serializer = new VariantColumnSerializer(
fileNameBase,
expectedTypes.getSingleType() == null ? null : expectedLogicalType,
Expand Down
28 changes: 14 additions & 14 deletions processing/src/test/resources/nested-array-test-data.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": ["a", "b"], "arrayLong":[1, 2, 3], "arrayLongNulls":[1, null,3], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[1.1, 2.2, null], "arrayVariant":["a", 1, 2.2], "arrayNestedLong":[[1, 2, null], [3, 4]], "arrayObject":[{"x": 1},{"x":2}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b", "c"], "arrayStringNulls": [null, "b"], "arrayLong":[2, 3], "arrayDouble":[3.3, 4.4, 5.5], "arrayDoubleNulls":[999, null, 5.5], "arrayVariant":[null, null, 2.2], "arrayNestedLong":[null, [null], []], "arrayObject":[{"x": 3},{"x":4}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["b", "c"], "arrayStringNulls": ["d", null, "b"], "arrayLong":[1, 2, 3, 4], "arrayLongNulls":[1, 2, 3], "arrayDouble":[1.1, 3.3], "arrayDoubleNulls":[null, 2.2, null], "arrayVariant":[1, null, 1], "arrayNestedLong":[[1], null, [1, 2, 3]], "arrayObject":[null,{"x":2}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["d", "e"], "arrayStringNulls": ["b", "b"], "arrayLong":[1, 4], "arrayLongNulls":[1], "arrayDouble":[2.2, 3.3, 4.0], "arrayVariant":["a", "b", "c"], "arrayNestedLong":[[1, 2], [3, 4], [5, 6, 7]], "arrayObject":[{"x": null},{"x":2}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": null, "arrayLong":[1, 2, 3], "arrayLongNulls":[], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":null, "arrayObject":[{"x": 1000},{"y":2000}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": null, "arrayLongNulls":[null, 2, 9], "arrayDouble":null, "arrayDoubleNulls":[999, 5.5, null], "arrayVariant":["a", 1, 2.2], "arrayNestedLong":[[1], [1, 2, null]], "arrayObject":[{"a": 1},{"b":2}]}
{"timestamp": "2023-01-01T00:00:00", "arrayStringNulls": ["a", "b"], "arrayLong":null, "arrayLongNulls":[2, 3], "arrayDoubleNulls":[null], "arrayVariant":null, "arrayNestedLong":null, "arrayObject":[{"x": 1},{"x":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": [], "arrayLong":[1, 2, 3], "arrayLongNulls":[1, null,3], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[1.1, 2.2, null], "arrayVariant":["a", 1, 2.2], "arrayNestedLong":[[2, 3], [1, 5]], "arrayObject":[{"x": 1},{"x":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b", "c"], "arrayStringNulls": [null, "b"], "arrayLong":[2, 3], "arrayDouble":[3.3, 4.4, 5.5], "arrayDoubleNulls":[999, null, 5.5], "arrayVariant":[null, null, 2.2], "arrayNestedLong":[null], "arrayObject":[{"x": 3},{"x":4}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["b", "c"], "arrayStringNulls": ["d", null, "b"], "arrayLong":[1, 2, 3, 4], "arrayLongNulls":[1, 2, 3], "arrayDouble":[1.1, 3.3], "arrayDoubleNulls":[null, 2.2, null], "arrayVariant":[1, null, 1], "arrayNestedLong":[[1], null, [1]], "arrayObject":[null,{"x":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["d", "e"], "arrayStringNulls": ["b", "b"], "arrayLong":[1, 4], "arrayLongNulls":[null], "arrayDouble":[2.2, 3.3, 4.0], "arrayVariant":["a", "b", "c"], "arrayNestedLong":[[1, 2], [3, 4], [5, 6, 7]], "arrayObject":[{"x": null},{"x":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": null, "arrayLong":[1, 2, 3], "arrayLongNulls":null, "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[], "arrayObject":[{"x": 1000},{"y":2000}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": [null], "arrayLongNulls":[null, 2, 9], "arrayDouble":null, "arrayDoubleNulls":[999, 5.5, null], "arrayVariant":["a", 1, 2.2], "arrayNestedLong":[], "arrayObject":[{"a": 1},{"b":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayStringNulls": ["a", "b"], "arrayLong":null, "arrayLongNulls":[2, 3], "arrayDoubleNulls":[null, 1.1], "arrayVariant":null, "arrayNestedLong":null, "arrayObject":[{"x": 1},{"x":2}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": ["a", "b"], "arrayLong":[1, 2, 3], "arrayLongNulls":[1, null,3], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[1.1, 2.2, null], "arrayVariant":["a", 1, 2.2], "arrayNoType":[], "arrayNestedLong":[[1, 2, null], [3, 4]], "arrayObject":[{"x": 1},{"x":2}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b", "c"], "arrayStringNulls": [null, "b"], "arrayLong":[2, 3], "arrayDouble":[3.3, 4.4, 5.5], "arrayDoubleNulls":[999, null, 5.5], "arrayVariant":[null, null, 2.2], "arrayNoType":[null], "arrayNestedLong":[null, [null], []], "arrayObject":[{"x": 3},{"x":4}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["b", "c"], "arrayStringNulls": ["d", null, "b"], "arrayLong":[1, 2, 3, 4], "arrayLongNulls":[1, 2, 3], "arrayDouble":[1.1, 3.3], "arrayDoubleNulls":[null, 2.2, null], "arrayVariant":[1, null, 1], "arrayNestedLong":[[1], null, [1, 2, 3]], "arrayObject":[null,{"x":2}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["d", "e"], "arrayStringNulls": ["b", "b"], "arrayLong":[1, 4], "arrayLongNulls":[1], "arrayDouble":[2.2, 3.3, 4.0], "arrayVariant":["a", "b", "c"], "arrayNoType":[], "arrayNestedLong":[[1, 2], [3, 4], [5, 6, 7]], "arrayObject":[{"x": null},{"x":2}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": null, "arrayLong":[1, 2, 3], "arrayLongNulls":[], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":null, "arrayNoType":[], "arrayObject":[{"x": 1000},{"y":2000}]}
{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": null, "arrayLongNulls":[null, 2, 9], "arrayDouble":null, "arrayDoubleNulls":[999, 5.5, null], "arrayVariant":["a", 1, 2.2], "arrayNoType":[null, null], "arrayNestedLong":[[1], [1, 2, null]], "arrayObject":[{"a": 1},{"b":2}]}
{"timestamp": "2023-01-01T00:00:00", "arrayStringNulls": ["a", "b"], "arrayLong":null, "arrayLongNulls":[2, 3], "arrayDoubleNulls":[null], "arrayVariant":null, "arrayNoType":[], "arrayNestedLong":null, "arrayObject":[{"x": 1},{"x":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": [], "arrayLong":[1, 2, 3], "arrayLongNulls":[1, null,3], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[1.1, 2.2, null], "arrayVariant":["a", 1, 2.2], "arrayNoType":[], "arrayNestedLong":[[2, 3], [1, 5]], "arrayObject":[{"x": 1},{"x":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b", "c"], "arrayStringNulls": [null, "b"], "arrayLong":[2, 3], "arrayDouble":[3.3, 4.4, 5.5], "arrayDoubleNulls":[999, null, 5.5], "arrayVariant":[null, null, 2.2], "arrayNoType":[], "arrayNestedLong":[null], "arrayObject":[{"x": 3},{"x":4}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["b", "c"], "arrayStringNulls": ["d", null, "b"], "arrayLong":[1, 2, 3, 4], "arrayLongNulls":[1, 2, 3], "arrayDouble":[1.1, 3.3], "arrayDoubleNulls":[null, 2.2, null], "arrayVariant":[1, null, 1], "arrayNoType":[null], "arrayNestedLong":[[1], null, [1]], "arrayObject":[null,{"x":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["d", "e"], "arrayStringNulls": ["b", "b"], "arrayLong":[1, 4], "arrayLongNulls":[null], "arrayDouble":[2.2, 3.3, 4.0], "arrayVariant":["a", "b", "c"], "arrayNoType":[], "arrayNestedLong":[[1, 2], [3, 4], [5, 6, 7]], "arrayObject":[{"x": null},{"x":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": null, "arrayLong":[1, 2, 3], "arrayLongNulls":null, "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[], "arrayNoType":[], "arrayObject":[{"x": 1000},{"y":2000}]}
{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": [null], "arrayLongNulls":[null, 2, 9], "arrayDouble":null, "arrayDoubleNulls":[999, 5.5, null], "arrayVariant":["a", 1, 2.2], "arrayNoType":null, "arrayNestedLong":[], "arrayObject":[{"a": 1},{"b":2}]}
{"timestamp": "2023-01-02T00:00:00", "arrayStringNulls": ["a", "b"], "arrayLong":null, "arrayLongNulls":[2, 3], "arrayDoubleNulls":[null, 1.1], "arrayVariant":null, "arrayNoType":[], "arrayNestedLong":null, "arrayObject":[{"x": 1},{"x":2}]}
Original file line number Diff line number Diff line change
Expand Up @@ -6258,9 +6258,9 @@ public void testScanAllTypesAuto()
.add("cstringArray", ColumnType.STRING_ARRAY)
.add("cLongArray", ColumnType.LONG_ARRAY)
.add("cDoubleArray", ColumnType.DOUBLE_ARRAY)
.add("cEmptyArray", ColumnType.NESTED_DATA)
.add("cEmptyArray", ColumnType.LONG_ARRAY)
.add("cEmptyObj", ColumnType.NESTED_DATA)
.add("cNullArray", ColumnType.NESTED_DATA)
.add("cNullArray", ColumnType.LONG_ARRAY)
.add("cEmptyObjectArray", ColumnType.NESTED_DATA)
.add("cObjectArray", ColumnType.NESTED_DATA)
.add("cnt", ColumnType.LONG)
Expand Down

0 comments on commit c241c69

Please sign in to comment.