From c241c6980c9e5e656da252649afb9cb5d01d31a7 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Thu, 7 Dec 2023 03:31:43 -0800 Subject: [PATCH] store auto columns with only empty or null containing arrays as ARRAY instead of COMPLEX (#15505) --- .../druid/segment/AutoTypeColumnIndexer.java | 6 +++- .../druid/segment/nested/FieldTypeInfo.java | 4 +++ .../groupby/NestedGroupByArrayQueryTest.java | 25 +++++++++++++++++ .../segment/AutoTypeColumnIndexerTest.java | 2 +- .../nested/NestedFieldTypeInfoTest.java | 15 ++++++++++ .../nested/VariantColumnSupplierTest.java | 14 +++++++++- .../resources/nested-array-test-data.json | 28 +++++++++---------- .../calcite/CalciteNestedDataQueryTest.java | 4 +-- 8 files changed, 79 insertions(+), 19 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/segment/AutoTypeColumnIndexer.java b/processing/src/main/java/org/apache/druid/segment/AutoTypeColumnIndexer.java index 3ccde4221ae9..3e47ef2da9e1 100644 --- a/processing/src/main/java/org/apache/druid/segment/AutoTypeColumnIndexer.java +++ b/processing/src/main/java/org/apache/druid/segment/AutoTypeColumnIndexer.java @@ -259,7 +259,7 @@ public SortedMap getFieldTypeInfo() final TreeMap fields = new TreeMap<>(); for (Map.Entry entry : fieldIndexers.entrySet()) { // skip adding the field if no types are in the set, meaning only null values have been processed - if (!entry.getValue().getTypes().isEmpty()) { + if (!entry.getValue().getTypes().isEmpty() || entry.getValue().getTypes().hasUntypedArray()) { fields.put(entry.getKey(), entry.getValue().getTypes()); } } @@ -421,6 +421,10 @@ public ColumnType getLogicalType() } return ColumnTypeFactory.getInstance().ofArray(logicalType); } + // if we only have empty and null arrays, ARRAY<LONG> is the most restrictive type we can pick + if (rootField.getTypes().hasUntypedArray()) { + return ColumnType.LONG_ARRAY; + } } return ColumnType.NESTED_DATA; } diff --git a/processing/src/main/java/org/apache/druid/segment/nested/FieldTypeInfo.java 
b/processing/src/main/java/org/apache/druid/segment/nested/FieldTypeInfo.java index 15691cfc9c4c..b832e3d73506 100644 --- a/processing/src/main/java/org/apache/druid/segment/nested/FieldTypeInfo.java +++ b/processing/src/main/java/org/apache/druid/segment/nested/FieldTypeInfo.java @@ -183,6 +183,10 @@ public ColumnType getSingleType() if (hasEmptyArray && columnType != null && !columnType.isArray()) { return null; } + // if column only has empty arrays, call it long array + if (types == 0x00 && hasEmptyArray) { + return ColumnType.LONG_ARRAY; + } return columnType; } diff --git a/processing/src/test/java/org/apache/druid/query/groupby/NestedGroupByArrayQueryTest.java b/processing/src/test/java/org/apache/druid/query/groupby/NestedGroupByArrayQueryTest.java index a43509de9212..afefe73adf70 100644 --- a/processing/src/test/java/org/apache/druid/query/groupby/NestedGroupByArrayQueryTest.java +++ b/processing/src/test/java/org/apache/druid/query/groupby/NestedGroupByArrayQueryTest.java @@ -53,6 +53,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.function.BiFunction; @@ -438,6 +439,30 @@ public void testGroupByRootArrayLongElementString() ); } + @Test + public void testGroupByEmptyIshArrays() + { + GroupByQuery groupQuery = GroupByQuery.builder() + .setDataSource("test_datasource") + .setGranularity(Granularities.ALL) + .setInterval(Intervals.ETERNITY) + .setDimensions(DefaultDimensionSpec.of("arrayNoType", ColumnType.LONG_ARRAY)) + .setAggregatorSpecs(new CountAggregatorFactory("count")) + .setContext(getContext()) + .build(); + + + runResults( + groupQuery, + ImmutableList.of( + new Object[]{null, 4L}, + new Object[]{new ComparableList<>(Collections.emptyList()), 18L}, + new Object[]{new ComparableList<>(Collections.singletonList(null)), 4L}, + new Object[]{new ComparableList<>(Arrays.asList(null, null)), 2L} + ) + ); + } + private void 
runResults( GroupByQuery groupQuery, List expectedResults diff --git a/processing/src/test/java/org/apache/druid/segment/AutoTypeColumnIndexerTest.java b/processing/src/test/java/org/apache/druid/segment/AutoTypeColumnIndexerTest.java index 6d2d29806dff..8058980e17f4 100644 --- a/processing/src/test/java/org/apache/druid/segment/AutoTypeColumnIndexerTest.java +++ b/processing/src/test/java/org/apache/druid/segment/AutoTypeColumnIndexerTest.java @@ -760,7 +760,7 @@ public void testConstantEmptyArray() Assert.assertFalse(indexer.hasNulls); Assert.assertFalse(indexer.hasNestedData); Assert.assertTrue(indexer.isConstant()); - Assert.assertEquals(ColumnType.NESTED_DATA, indexer.getLogicalType()); + Assert.assertEquals(ColumnType.LONG_ARRAY, indexer.getLogicalType()); } @Test diff --git a/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldTypeInfoTest.java b/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldTypeInfoTest.java index 33df1887ea54..4c7020c75233 100644 --- a/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldTypeInfoTest.java +++ b/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldTypeInfoTest.java @@ -89,6 +89,21 @@ public void testMultiType() throws IOException } } + @Test + public void testOnlyEmptyType() + { + FieldTypeInfo.MutableTypeSet typeSet = new FieldTypeInfo.MutableTypeSet(); + Assert.assertNull(typeSet.getSingleType()); + Assert.assertTrue(typeSet.isEmpty()); + + typeSet.addUntypedArray(); + + Assert.assertEquals(ColumnType.LONG_ARRAY, typeSet.getSingleType()); + // no actual types in the type set, only getSingleType + Assert.assertEquals(ImmutableSet.of(), FieldTypeInfo.convertToSet(typeSet.getByteValue())); + Assert.assertTrue(typeSet.hasUntypedArray()); + } + @Test public void testEqualsAndHashCode() { diff --git a/processing/src/test/java/org/apache/druid/segment/nested/VariantColumnSupplierTest.java 
b/processing/src/test/java/org/apache/druid/segment/nested/VariantColumnSupplierTest.java index 7910e49652f8..4604aff1c204 100644 --- a/processing/src/test/java/org/apache/druid/segment/nested/VariantColumnSupplierTest.java +++ b/processing/src/test/java/org/apache/druid/segment/nested/VariantColumnSupplierTest.java @@ -156,6 +156,13 @@ public class VariantColumnSupplierTest extends InitializedNullHandlingTest Arrays.asList(null, 3.3) ); + static List> NO_TYPE_ARRAY = Arrays.asList( + Collections.emptyList(), + null, + Collections.emptyList(), + Arrays.asList(null, null) + ); + @BeforeClass public static void staticSetup() @@ -186,7 +193,9 @@ public static Collection constructorFeeder() new Object[]{"ARRAY,ARRAY,DOUBLE,LONG,STRING", VARIANT_SCALAR_AND_ARRAY, IndexSpec.DEFAULT}, new Object[]{"ARRAY,ARRAY,DOUBLE,LONG,STRING", VARIANT_SCALAR_AND_ARRAY, fancy}, new Object[]{"ARRAY,ARRAY,ARRAY", VARIANT_ARRAY, IndexSpec.DEFAULT}, - new Object[]{"ARRAY,ARRAY,ARRAY", VARIANT_ARRAY, fancy} + new Object[]{"ARRAY,ARRAY,ARRAY", VARIANT_ARRAY, fancy}, + new Object[]{"ARRAY", NO_TYPE_ARRAY, IndexSpec.DEFAULT}, + new Object[]{"ARRAY", NO_TYPE_ARRAY, fancy} ); return constructors; @@ -254,6 +263,9 @@ private SmooshedFileMapper smooshify( for (ColumnType type : FieldTypeInfo.convertToSet(expectedTypes.getByteValue())) { expectedLogicalType = ColumnType.leastRestrictiveType(expectedLogicalType, type); } + if (expectedLogicalType == null && sortedFields.get(NestedPathFinder.JSON_PATH_ROOT).hasUntypedArray()) { + expectedLogicalType = ColumnType.LONG_ARRAY; + } VariantColumnSerializer serializer = new VariantColumnSerializer( fileNameBase, expectedTypes.getSingleType() == null ? 
null : expectedLogicalType, diff --git a/processing/src/test/resources/nested-array-test-data.json b/processing/src/test/resources/nested-array-test-data.json index b8ae3ace3847..430fe165eac5 100644 --- a/processing/src/test/resources/nested-array-test-data.json +++ b/processing/src/test/resources/nested-array-test-data.json @@ -1,14 +1,14 @@ -{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": ["a", "b"], "arrayLong":[1, 2, 3], "arrayLongNulls":[1, null,3], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[1.1, 2.2, null], "arrayVariant":["a", 1, 2.2], "arrayNestedLong":[[1, 2, null], [3, 4]], "arrayObject":[{"x": 1},{"x":2}]} -{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b", "c"], "arrayStringNulls": [null, "b"], "arrayLong":[2, 3], "arrayDouble":[3.3, 4.4, 5.5], "arrayDoubleNulls":[999, null, 5.5], "arrayVariant":[null, null, 2.2], "arrayNestedLong":[null, [null], []], "arrayObject":[{"x": 3},{"x":4}]} -{"timestamp": "2023-01-01T00:00:00", "arrayString": ["b", "c"], "arrayStringNulls": ["d", null, "b"], "arrayLong":[1, 2, 3, 4], "arrayLongNulls":[1, 2, 3], "arrayDouble":[1.1, 3.3], "arrayDoubleNulls":[null, 2.2, null], "arrayVariant":[1, null, 1], "arrayNestedLong":[[1], null, [1, 2, 3]], "arrayObject":[null,{"x":2}]} -{"timestamp": "2023-01-01T00:00:00", "arrayString": ["d", "e"], "arrayStringNulls": ["b", "b"], "arrayLong":[1, 4], "arrayLongNulls":[1], "arrayDouble":[2.2, 3.3, 4.0], "arrayVariant":["a", "b", "c"], "arrayNestedLong":[[1, 2], [3, 4], [5, 6, 7]], "arrayObject":[{"x": null},{"x":2}]} -{"timestamp": "2023-01-01T00:00:00", "arrayString": null, "arrayLong":[1, 2, 3], "arrayLongNulls":[], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":null, "arrayObject":[{"x": 1000},{"y":2000}]} -{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": null, "arrayLongNulls":[null, 2, 9], "arrayDouble":null, "arrayDoubleNulls":[999, 5.5, null], "arrayVariant":["a", 1, 2.2], 
"arrayNestedLong":[[1], [1, 2, null]], "arrayObject":[{"a": 1},{"b":2}]} -{"timestamp": "2023-01-01T00:00:00", "arrayStringNulls": ["a", "b"], "arrayLong":null, "arrayLongNulls":[2, 3], "arrayDoubleNulls":[null], "arrayVariant":null, "arrayNestedLong":null, "arrayObject":[{"x": 1},{"x":2}]} -{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": [], "arrayLong":[1, 2, 3], "arrayLongNulls":[1, null,3], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[1.1, 2.2, null], "arrayVariant":["a", 1, 2.2], "arrayNestedLong":[[2, 3], [1, 5]], "arrayObject":[{"x": 1},{"x":2}]} -{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b", "c"], "arrayStringNulls": [null, "b"], "arrayLong":[2, 3], "arrayDouble":[3.3, 4.4, 5.5], "arrayDoubleNulls":[999, null, 5.5], "arrayVariant":[null, null, 2.2], "arrayNestedLong":[null], "arrayObject":[{"x": 3},{"x":4}]} -{"timestamp": "2023-01-02T00:00:00", "arrayString": ["b", "c"], "arrayStringNulls": ["d", null, "b"], "arrayLong":[1, 2, 3, 4], "arrayLongNulls":[1, 2, 3], "arrayDouble":[1.1, 3.3], "arrayDoubleNulls":[null, 2.2, null], "arrayVariant":[1, null, 1], "arrayNestedLong":[[1], null, [1]], "arrayObject":[null,{"x":2}]} -{"timestamp": "2023-01-02T00:00:00", "arrayString": ["d", "e"], "arrayStringNulls": ["b", "b"], "arrayLong":[1, 4], "arrayLongNulls":[null], "arrayDouble":[2.2, 3.3, 4.0], "arrayVariant":["a", "b", "c"], "arrayNestedLong":[[1, 2], [3, 4], [5, 6, 7]], "arrayObject":[{"x": null},{"x":2}]} -{"timestamp": "2023-01-02T00:00:00", "arrayString": null, "arrayLong":[1, 2, 3], "arrayLongNulls":null, "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[], "arrayObject":[{"x": 1000},{"y":2000}]} -{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": [null], "arrayLongNulls":[null, 2, 9], "arrayDouble":null, "arrayDoubleNulls":[999, 5.5, null], "arrayVariant":["a", 1, 2.2], "arrayNestedLong":[], "arrayObject":[{"a": 1},{"b":2}]} -{"timestamp": "2023-01-02T00:00:00", 
"arrayStringNulls": ["a", "b"], "arrayLong":null, "arrayLongNulls":[2, 3], "arrayDoubleNulls":[null, 1.1], "arrayVariant":null, "arrayNestedLong":null, "arrayObject":[{"x": 1},{"x":2}]} \ No newline at end of file +{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": ["a", "b"], "arrayLong":[1, 2, 3], "arrayLongNulls":[1, null,3], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[1.1, 2.2, null], "arrayVariant":["a", 1, 2.2], "arrayNoType":[], "arrayNestedLong":[[1, 2, null], [3, 4]], "arrayObject":[{"x": 1},{"x":2}]} +{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b", "c"], "arrayStringNulls": [null, "b"], "arrayLong":[2, 3], "arrayDouble":[3.3, 4.4, 5.5], "arrayDoubleNulls":[999, null, 5.5], "arrayVariant":[null, null, 2.2], "arrayNoType":[null], "arrayNestedLong":[null, [null], []], "arrayObject":[{"x": 3},{"x":4}]} +{"timestamp": "2023-01-01T00:00:00", "arrayString": ["b", "c"], "arrayStringNulls": ["d", null, "b"], "arrayLong":[1, 2, 3, 4], "arrayLongNulls":[1, 2, 3], "arrayDouble":[1.1, 3.3], "arrayDoubleNulls":[null, 2.2, null], "arrayVariant":[1, null, 1], "arrayNestedLong":[[1], null, [1, 2, 3]], "arrayObject":[null,{"x":2}]} +{"timestamp": "2023-01-01T00:00:00", "arrayString": ["d", "e"], "arrayStringNulls": ["b", "b"], "arrayLong":[1, 4], "arrayLongNulls":[1], "arrayDouble":[2.2, 3.3, 4.0], "arrayVariant":["a", "b", "c"], "arrayNoType":[], "arrayNestedLong":[[1, 2], [3, 4], [5, 6, 7]], "arrayObject":[{"x": null},{"x":2}]} +{"timestamp": "2023-01-01T00:00:00", "arrayString": null, "arrayLong":[1, 2, 3], "arrayLongNulls":[], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":null, "arrayNoType":[], "arrayObject":[{"x": 1000},{"y":2000}]} +{"timestamp": "2023-01-01T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": null, "arrayLongNulls":[null, 2, 9], "arrayDouble":null, "arrayDoubleNulls":[999, 5.5, null], "arrayVariant":["a", 1, 2.2], "arrayNoType":[null, null], "arrayNestedLong":[[1], [1, 2, null]], 
"arrayObject":[{"a": 1},{"b":2}]} +{"timestamp": "2023-01-01T00:00:00", "arrayStringNulls": ["a", "b"], "arrayLong":null, "arrayLongNulls":[2, 3], "arrayDoubleNulls":[null], "arrayVariant":null, "arrayNoType":[], "arrayNestedLong":null, "arrayObject":[{"x": 1},{"x":2}]} +{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": [], "arrayLong":[1, 2, 3], "arrayLongNulls":[1, null,3], "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[1.1, 2.2, null], "arrayVariant":["a", 1, 2.2], "arrayNoType":[], "arrayNestedLong":[[2, 3], [1, 5]], "arrayObject":[{"x": 1},{"x":2}]} +{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b", "c"], "arrayStringNulls": [null, "b"], "arrayLong":[2, 3], "arrayDouble":[3.3, 4.4, 5.5], "arrayDoubleNulls":[999, null, 5.5], "arrayVariant":[null, null, 2.2], "arrayNoType":[], "arrayNestedLong":[null], "arrayObject":[{"x": 3},{"x":4}]} +{"timestamp": "2023-01-02T00:00:00", "arrayString": ["b", "c"], "arrayStringNulls": ["d", null, "b"], "arrayLong":[1, 2, 3, 4], "arrayLongNulls":[1, 2, 3], "arrayDouble":[1.1, 3.3], "arrayDoubleNulls":[null, 2.2, null], "arrayVariant":[1, null, 1], "arrayNoType":[null], "arrayNestedLong":[[1], null, [1]], "arrayObject":[null,{"x":2}]} +{"timestamp": "2023-01-02T00:00:00", "arrayString": ["d", "e"], "arrayStringNulls": ["b", "b"], "arrayLong":[1, 4], "arrayLongNulls":[null], "arrayDouble":[2.2, 3.3, 4.0], "arrayVariant":["a", "b", "c"], "arrayNoType":[], "arrayNestedLong":[[1, 2], [3, 4], [5, 6, 7]], "arrayObject":[{"x": null},{"x":2}]} +{"timestamp": "2023-01-02T00:00:00", "arrayString": null, "arrayLong":[1, 2, 3], "arrayLongNulls":null, "arrayDouble":[1.1, 2.2, 3.3], "arrayDoubleNulls":[], "arrayNoType":[], "arrayObject":[{"x": 1000},{"y":2000}]} +{"timestamp": "2023-01-02T00:00:00", "arrayString": ["a", "b"], "arrayStringNulls": [null], "arrayLongNulls":[null, 2, 9], "arrayDouble":null, "arrayDoubleNulls":[999, 5.5, null], "arrayVariant":["a", 1, 2.2], "arrayNoType":null, 
"arrayNestedLong":[], "arrayObject":[{"a": 1},{"b":2}]} +{"timestamp": "2023-01-02T00:00:00", "arrayStringNulls": ["a", "b"], "arrayLong":null, "arrayLongNulls":[2, 3], "arrayDoubleNulls":[null, 1.1], "arrayVariant":null, "arrayNoType":[], "arrayNestedLong":null, "arrayObject":[{"x": 1},{"x":2}]} \ No newline at end of file diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java index 22e16f6e7e1f..cea1bdbef7e1 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java @@ -6258,9 +6258,9 @@ public void testScanAllTypesAuto() .add("cstringArray", ColumnType.STRING_ARRAY) .add("cLongArray", ColumnType.LONG_ARRAY) .add("cDoubleArray", ColumnType.DOUBLE_ARRAY) - .add("cEmptyArray", ColumnType.NESTED_DATA) + .add("cEmptyArray", ColumnType.LONG_ARRAY) .add("cEmptyObj", ColumnType.NESTED_DATA) - .add("cNullArray", ColumnType.NESTED_DATA) + .add("cNullArray", ColumnType.LONG_ARRAY) .add("cEmptyObjectArray", ColumnType.NESTED_DATA) .add("cObjectArray", ColumnType.NESTED_DATA) .add("cnt", ColumnType.LONG)