From 3fb6ba22e85b82a654cbfb1b52b5ae7fed7b95e0 Mon Sep 17 00:00:00 2001 From: Clint Wylie Date: Sat, 8 Jun 2024 13:05:19 -0700 Subject: [PATCH] fix expression column capabilities to not report dictionary encoded unless input is string (#16577) --- .../NestedCommonFormatColumnPartSerde.java | 4 - .../druid/segment/virtual/ExpressionPlan.java | 16 ++- .../virtual/ExpressionPlannerTest.java | 126 ++++++++++++++++-- .../virtual/ExpressionVirtualColumnTest.java | 2 +- .../calcite/CalciteNestedDataQueryTest.java | 32 +++++ 5 files changed, 157 insertions(+), 23 deletions(-) diff --git a/processing/src/main/java/org/apache/druid/segment/serde/NestedCommonFormatColumnPartSerde.java b/processing/src/main/java/org/apache/druid/segment/serde/NestedCommonFormatColumnPartSerde.java index 7a0ea7437846..6bf897cd6b2b 100644 --- a/processing/src/main/java/org/apache/druid/segment/serde/NestedCommonFormatColumnPartSerde.java +++ b/processing/src/main/java/org/apache/druid/segment/serde/NestedCommonFormatColumnPartSerde.java @@ -303,10 +303,6 @@ public void read(ByteBuffer buffer, ColumnBuilder builder, ColumnConfig columnCo bitmapSerdeFactory, byteOrder ); - ColumnCapabilitiesImpl capabilitiesBuilder = builder.getCapabilitiesBuilder(); - capabilitiesBuilder.setDictionaryEncoded(true); - capabilitiesBuilder.setDictionaryValuesSorted(true); - capabilitiesBuilder.setDictionaryValuesUnique(true); ColumnType simpleType = supplier.getLogicalType(); ColumnType logicalType = simpleType == null ? ColumnType.NESTED_DATA : simpleType; builder.setType(logicalType); diff --git a/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionPlan.java b/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionPlan.java index fecfe142069b..71af403fa76f 100644 --- a/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionPlan.java +++ b/processing/src/main/java/org/apache/druid/segment/virtual/ExpressionPlan.java @@ -274,12 +274,18 @@ public ColumnCapabilities inferColumnCapabilities(@Nullable ColumnType outputTyp // since we don't know if the expression is 1:1 or if it retains ordering we can only piggy back only // report as dictionary encoded, but it still allows us to use algorithms which work with dictionaryIds // to create a dictionary encoded selector instead of an object selector to defer expression evaluation - // until query time - return ColumnCapabilitiesImpl.copyOf(underlyingCapabilities) + // until query time. However, currently dictionary encodedness is tied to string selectors and sad stuff + // happens if the input type isn't string, so we also limit this to string input types + final boolean useDictionary = underlyingCapabilities.isDictionaryEncoded().isTrue() && + underlyingCapabilities.is(ValueType.STRING); + return ColumnCapabilitiesImpl.createDefault() .setType(ColumnType.STRING) - .setDictionaryValuesSorted(false) - .setDictionaryValuesUnique(false) - .setHasBitmapIndexes(false) + .setDictionaryEncoded(useDictionary) + .setHasBitmapIndexes(underlyingCapabilities.hasBitmapIndexes()) + .setHasMultipleValues(underlyingCapabilities.hasMultipleValues()) + .setHasSpatialIndexes(underlyingCapabilities.hasSpatialIndexes()) + // we aren't sure if the expression might produce null values or not, so always + // set hasNulls to true .setHasNulls(true); } } diff --git a/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionPlannerTest.java b/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionPlannerTest.java index c40c4a9bacd8..9b4d1b84af20 100644 --- a/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionPlannerTest.java +++ b/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionPlannerTest.java @@ -19,8 +19,13 @@ package org.apache.druid.segment.virtual; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.error.DruidException; +import org.apache.druid.math.expr.Expr; +import org.apache.druid.math.expr.ExprEval; +import org.apache.druid.math.expr.ExprMacroTable; import org.apache.druid.math.expr.ExpressionType; import org.apache.druid.math.expr.Parser; import org.apache.druid.query.expression.TestExprMacroTable; @@ -36,11 +41,13 @@ import org.junit.rules.ExpectedException; import javax.annotation.Nullable; +import java.util.List; import java.util.Map; public class ExpressionPlannerTest extends InitializedNullHandlingTest { - public static final ColumnInspector SYNTHETIC_INSPECTOR = new ColumnInspector() + private static ColumnType DICTIONARY_COMPLEX = ColumnType.ofComplex("dictionaryComplex"); + private static final ColumnInspector SYNTHETIC_INSPECTOR = new ColumnInspector() { private final Map capabilitiesMap = ImmutableMap.builder() @@ -141,6 +148,12 @@ public class ExpressionPlannerTest extends InitializedNullHandlingTest "double_array_2", ColumnCapabilitiesImpl.createSimpleArrayColumnCapabilities(ColumnType.DOUBLE_ARRAY) ) + .put( + "dictionary_complex", + ColumnCapabilitiesImpl.createDefault() + .setDictionaryEncoded(true) + .setType(DICTIONARY_COMPLEX) + ) .build(); @Nullable @@ -151,6 +164,8 @@ public ColumnCapabilities getColumnCapabilities(String column) } }; + private static final TestMacroTable MACRO_TABLE = new TestMacroTable(); + @Rule public ExpectedException expectedException = ExpectedException.none(); @@ -369,7 +384,7 @@ public void testScalarStringDictionaryEncoded() Assert.assertFalse(inferred.areDictionaryValuesSorted().isMaybeTrue()); Assert.assertFalse(inferred.areDictionaryValuesUnique().isMaybeTrue()); Assert.assertFalse(inferred.hasMultipleValues().isMaybeTrue()); - Assert.assertFalse(inferred.hasBitmapIndexes()); + Assert.assertTrue(inferred.hasBitmapIndexes()); Assert.assertFalse(inferred.hasSpatialIndexes()); // multiple input columns @@ -463,7 +478,7 @@ public void testMultiValueStringDictionaryEncoded() Assert.assertFalse(inferred.areDictionaryValuesSorted().isMaybeTrue()); Assert.assertFalse(inferred.areDictionaryValuesUnique().isMaybeTrue()); Assert.assertTrue(inferred.hasMultipleValues().isTrue()); - Assert.assertFalse(inferred.hasBitmapIndexes()); + Assert.assertTrue(inferred.hasBitmapIndexes()); Assert.assertFalse(inferred.hasSpatialIndexes()); thePlan = plan("concat(scalar_string, multi_dictionary_string_nonunique)"); @@ -599,7 +614,7 @@ public void testIncompleteString() ) ); Assert.assertFalse( - thePlan.is( + thePlan.any( ExpressionPlan.Trait.SINGLE_INPUT_SCALAR, ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE, ExpressionPlan.Trait.UNKNOWN_INPUTS, @@ -671,7 +686,7 @@ public void testArrayOutput() ) ); Assert.assertFalse( - thePlan.is( + thePlan.any( ExpressionPlan.Trait.SINGLE_INPUT_SCALAR, ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE, ExpressionPlan.Trait.UNKNOWN_INPUTS, @@ -719,7 +734,22 @@ public void testScalarOutputMultiValueInput() // what about a multi-valued input thePlan = plan("array_to_string(array_append(scalar_string, multi_dictionary_string), ',')"); - assertArrayInput(thePlan); + Assert.assertTrue( + thePlan.is( + ExpressionPlan.Trait.NON_SCALAR_INPUTS, + ExpressionPlan.Trait.NEEDS_APPLIED + ) + ); + Assert.assertFalse( + thePlan.any( + ExpressionPlan.Trait.SINGLE_INPUT_SCALAR, + ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE, + ExpressionPlan.Trait.NON_SCALAR_OUTPUT, + ExpressionPlan.Trait.INCOMPLETE_INPUTS, + ExpressionPlan.Trait.UNKNOWN_INPUTS, + ExpressionPlan.Trait.VECTORIZABLE + ) + ); Assert.assertEquals( "array_to_string(map((\"multi_dictionary_string\") -> array_append(\"scalar_string\", \"multi_dictionary_string\"), \"multi_dictionary_string\"), ',')", @@ -789,7 +819,7 @@ public void testArrayConstruction() ) ); Assert.assertFalse( - thePlan.is( + thePlan.any( ExpressionPlan.Trait.SINGLE_INPUT_SCALAR, ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE, ExpressionPlan.Trait.UNKNOWN_INPUTS, @@ -812,15 +842,14 @@ public void testNestedColumnExpression() { ExpressionPlan thePlan = plan("json_object('long1', long1, 'long2', long2)"); Assert.assertFalse( - thePlan.is( + thePlan.any( ExpressionPlan.Trait.NON_SCALAR_OUTPUT, ExpressionPlan.Trait.SINGLE_INPUT_SCALAR, ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE, ExpressionPlan.Trait.UNKNOWN_INPUTS, ExpressionPlan.Trait.INCOMPLETE_INPUTS, ExpressionPlan.Trait.NEEDS_APPLIED, - ExpressionPlan.Trait.NON_SCALAR_INPUTS, - ExpressionPlan.Trait.VECTORIZABLE + ExpressionPlan.Trait.NON_SCALAR_INPUTS ) ); Assert.assertEquals(ExpressionType.NESTED_DATA, thePlan.getOutputType()); @@ -837,9 +866,40 @@ public void testNestedColumnExpression() ); } + @Test + public void testDictionaryComplexStringOutput() + { + ExpressionPlan thePlan = plan("dict_complex_to_string(dictionary_complex)"); + Assert.assertFalse( + thePlan.any( + ExpressionPlan.Trait.NON_SCALAR_OUTPUT, + ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE, + ExpressionPlan.Trait.UNKNOWN_INPUTS, + ExpressionPlan.Trait.INCOMPLETE_INPUTS, + ExpressionPlan.Trait.NEEDS_APPLIED, + ExpressionPlan.Trait.NON_SCALAR_INPUTS + ) + ); + Assert.assertTrue( + thePlan.is( + ExpressionPlan.Trait.SINGLE_INPUT_SCALAR, + ExpressionPlan.Trait.VECTORIZABLE + ) + ); + Assert.assertEquals(ExpressionType.STRING, thePlan.getOutputType()); + ColumnCapabilities inferred = thePlan.inferColumnCapabilities( + ExpressionType.toColumnType(thePlan.getOutputType()) + ); + Assert.assertEquals( + ColumnType.STRING.getType(), + inferred.getType() + ); + Assert.assertFalse(inferred.isDictionaryEncoded().isMaybeTrue()); + } + private static ExpressionPlan plan(String expression) { - return ExpressionPlanner.plan(SYNTHETIC_INSPECTOR, Parser.parse(expression, TestExprMacroTable.INSTANCE)); + return ExpressionPlanner.plan(SYNTHETIC_INSPECTOR, Parser.parse(expression, MACRO_TABLE)); } private static void assertArrayInput(ExpressionPlan thePlan) @@ -850,7 +910,7 @@ private static void assertArrayInput(ExpressionPlan thePlan) ) ); Assert.assertFalse( - thePlan.is( + thePlan.any( ExpressionPlan.Trait.SINGLE_INPUT_SCALAR, ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE, ExpressionPlan.Trait.NON_SCALAR_OUTPUT, @@ -871,7 +931,7 @@ private static void assertArrayInAndOut(ExpressionPlan thePlan) ) ); Assert.assertFalse( - thePlan.is( + thePlan.any( ExpressionPlan.Trait.SINGLE_INPUT_SCALAR, ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE, ExpressionPlan.Trait.INCOMPLETE_INPUTS, @@ -881,4 +941,44 @@ private static void assertArrayInAndOut(ExpressionPlan thePlan) ) ); } + + private static class TestMacroTable extends ExprMacroTable + { + public TestMacroTable() + { + super( + ImmutableList.builder() + .addAll(TestExprMacroTable.INSTANCE.getMacros()) + .add(new ExprMacroTable.ExprMacro() + { + @Override + public Expr apply(List args) + { + return new ExprMacroTable.BaseScalarMacroFunctionExpr(this, args) + { + @Override + public ExprEval eval(ObjectBinding bindings) + { + throw DruidException.defensive("just for planner test"); + } + + @Nullable + @Override + public ExpressionType getOutputType(InputBindingInspector inspector) + { + return ExpressionType.STRING; + } + }; + } + + @Override + public String name() + { + return "dict_complex_to_string"; + } + }) + .build() + ); + } + } } diff --git a/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionVirtualColumnTest.java b/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionVirtualColumnTest.java index 451acb09661e..16e22c010806 100644 --- a/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionVirtualColumnTest.java +++ b/processing/src/test/java/org/apache/druid/segment/virtual/ExpressionVirtualColumnTest.java @@ -115,7 +115,7 @@ public class ExpressionVirtualColumnTest extends InitializedNullHandlingTest ImmutableMap.of( "x", 3L, "y", 4L, - "b", Arrays.asList(new String[]{"3", null, "5"}) + "b", Arrays.asList("3", null, "5") ) ); diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java index a68e4d67cf71..08c25a43d4ff 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteNestedDataQueryTest.java @@ -7558,4 +7558,36 @@ public void testGroupByAutoDouble() .build() ); } + + @Test + public void testToJsonString() + { + testQuery( + "SELECT TO_JSON_STRING(nester) FROM druid.nested GROUP BY 1", + ImmutableList.of( + GroupByQuery.builder() + .setDataSource(DATA_SOURCE) + .setInterval(querySegmentSpec(Filtration.eternity())) + .setGranularity(Granularities.ALL) + .setDimensions( + dimensions( + new DefaultDimensionSpec("v0", "d0", ColumnType.STRING) + ) + ) + .setVirtualColumns( + expressionVirtualColumn("v0", "to_json_string(\"nester\")", ColumnType.STRING) + ) + .setContext(QUERY_CONTEXT_DEFAULT) + .build() + ), + ImmutableList.of( + new Object[]{NullHandling.defaultStringValue()}, + new Object[]{"\"hello\""}, + new Object[]{"2"}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":\"hello\"}}"}, + new Object[]{"{\"array\":[\"a\",\"b\"],\"n\":{\"x\":1}}"} + ), + RowSignature.builder().add("EXPR$0", ColumnType.STRING).build() + ); + } }