Merge remote-tracking branch 'upstream/master' into cursor-rework
clintropolis committed Jul 26, 2024
2 parents 3824754 + 9b76d13 commit d22e1cf
Showing 274 changed files with 2,904 additions and 1,825 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/stale.yml
@@ -52,7 +52,7 @@ jobs:
stale-issue-label: stale
stale-pr-label: stale
exempt-issue-labels: 'Evergreen,Security,Bug,Proposal,Design Review,Improvement,Performance,Refactoring,Apache,Area - Automation/Static Analysis,Area - Batch Indexing,Area - Cache,Area - Deep Storage,Area - Dependencies,Area - Dependency Injection,Area - Dev,Area - Documentation,Area - Extension,Area - Kafka/Kinesis Indexing,Area - Lookups,Area - Metadata,Area - Metrics/Event Emitting,Area - Null Handling,Area - Operations,Area - Query UI,Area - Querying,Area - Router,Area - Segment Balancing/Coordination,Area - Segment Format and Ser/De,Area - SQL,Area - Testing,Area - Web Console,Area - Zookeeper/Curator,Compatibility,Contributions Welcome,Development Blocker,Ease of Use,Error handling,HTTP,Incompatible,Stable API'
exempt-pr-labels: 'Evergreen'
exempt-pr-labels: 'Evergreen,Area - Dependencies'
exempt-milestones: true
exempt-assignees: true
ascending: true
193 changes: 193 additions & 0 deletions GroupByDeserializationBenchmark.java
@@ -0,0 +1,193 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.benchmark;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableMap;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.guice.NestedDataModule;
import org.apache.druid.jackson.AggregatorsModule;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.query.QueryRunnerTestHelper;
import org.apache.druid.query.aggregation.SerializablePairLongString;
import org.apache.druid.query.aggregation.post.ConstantPostAggregator;
import org.apache.druid.query.dimension.DefaultDimensionSpec;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.query.groupby.GroupByQuery;
import org.apache.druid.query.groupby.GroupByQueryConfig;
import org.apache.druid.query.groupby.GroupByQueryQueryToolChest;
import org.apache.druid.query.groupby.ResultRow;
import org.apache.druid.segment.TestHelper;
import org.apache.druid.segment.column.ColumnType;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;

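/**
 * Benchmark for deserializing groupBy {@link ResultRow}s with the toolchest-decorated {@link ObjectMapper}.
 * Measures average deserialization time while varying the number of dimensions, the ratio of primitive (STRING)
 * to complex (json or serializablePairLongString) dimensions, and whether the backward-compatible map-based
 * intermediate result format is enabled.
 */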
@State(Scope.Benchmark)
@Fork(value = 1)
@Warmup(iterations = 3)
@Measurement(iterations = 5)
public class GroupByDeserializationBenchmark
{

static {
NullHandling.initializeForTests();
NestedDataModule.registerHandlersAndSerde();
AggregatorsModule.registerComplexMetricsAndSerde();
}

@Param({"100", "1000"})
private int numDimensions;

@Param({"0", "0.25", "0.5", "0.75", "0.85", "0.95", "0.99", "1.0"})
private double primitiveToComplexDimensionRatio;

@Param({"json", "serializablePairLongString"})
private String complexDimensionType;

@Param({"true", "false"})
private boolean backwardCompatibility;

private GroupByQuery sqlQuery;
private String serializedRow;
private GroupByQueryQueryToolChest groupByQueryQueryToolChest;
private ObjectMapper decoratedMapper;

@Setup(Level.Trial)
public void setup() throws JsonProcessingException
{
final ObjectMapper undecoratedMapper = TestHelper.makeJsonMapper();
undecoratedMapper.registerModules(NestedDataModule.getJacksonModulesList());
undecoratedMapper.registerModule(new AggregatorsModule());
final Pair<GroupByQuery, String> sqlQueryAndResultRow = sqlQueryAndResultRow(
numDimensions,
primitiveToComplexDimensionRatio,
complexDimensionType,
undecoratedMapper
);
sqlQuery = sqlQueryAndResultRow.lhs;
serializedRow = sqlQueryAndResultRow.rhs;

groupByQueryQueryToolChest = new GroupByQueryQueryToolChest(
null,
() -> new GroupByQueryConfig()
{
@Override
public boolean isIntermediateResultAsMapCompat()
{
return backwardCompatibility;
}
},
null,
null
);

decoratedMapper = groupByQueryQueryToolChest.decorateObjectMapper(undecoratedMapper, sqlQuery);
}

@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public void deserializeResultRows(Blackhole blackhole) throws JsonProcessingException
{
blackhole.consume(decoratedMapper.readValue(serializedRow, ResultRow.class));
}

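/**
 * Builds a groupBy query with the requested mix of primitive and complex dimensions, a count aggregator, and a
 * constant post aggregator, and returns it together with a serialized array-based result row matching that query.
 */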
private static Pair<GroupByQuery, String> sqlQueryAndResultRow(
final int numDimensions,
final double primitiveToComplexDimensionRatio,
final String complexDimensionType,
final ObjectMapper mapper
) throws JsonProcessingException
{
final int numPrimitiveDimensions = (int) Math.floor(primitiveToComplexDimensionRatio * numDimensions);
final int numComplexDimensions = numDimensions - numPrimitiveDimensions;

final List<DimensionSpec> dimensions = new ArrayList<>();
final List<Object> rowList = new ArrayList<>();

// Add timestamp
rowList.add(DateTimes.of("2000").getMillis());

for (int i = 0; i < numPrimitiveDimensions; ++i) {
dimensions.add(
new DefaultDimensionSpec(
StringUtils.format("primitive%d", i),
StringUtils.format("primitive%d", i),
ColumnType.STRING
)
);
rowList.add("foo");
}

for (int i = 0; i < numComplexDimensions; ++i) {
dimensions.add(
new DefaultDimensionSpec(
StringUtils.format("complex%d", i),
StringUtils.format("complex%d", i),
ColumnType.ofComplex(complexDimensionType)
)
);

// Serialized version of this object is a valid value for both json and long-string pair dimensions
rowList.add(new SerializablePairLongString(1L, "test"));
}

// Add aggregator
rowList.add(100);

// Add post aggregator
rowList.add(10.0);

GroupByQuery query = GroupByQuery.builder()
.setDataSource("foo")
.setQuerySegmentSpec(QueryRunnerTestHelper.FULL_ON_INTERVAL_SPEC)
.setDimensions(dimensions)
.setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT)
.setPostAggregatorSpecs(Collections.singletonList(new ConstantPostAggregator(
"post",
10
)))
.setContext(ImmutableMap.of(GroupByQueryConfig.CTX_KEY_ARRAY_RESULT_ROWS, true))
.setGranularity(Granularities.DAY)
.build();

return Pair.of(query, mapper.writeValueAsString(rowList));
}
}
71 changes: 32 additions & 39 deletions docs/querying/arrays.md
@@ -71,46 +71,10 @@ The following shows an example `dimensionsSpec` for native ingestion of the data

### SQL-based ingestion

#### `arrayIngestMode`

Arrays can be inserted with [SQL-based ingestion](../multi-stage-query/index.md) when you include the query context
parameter `arrayIngestMode: array`.

When `arrayIngestMode` is `array`, SQL ARRAY types are stored using Druid array columns. This is recommended for new
tables.

When `arrayIngestMode` is `mvd`, SQL `VARCHAR ARRAY` are implicitly wrapped in [`ARRAY_TO_MV`](sql-functions.md#array_to_mv).
This causes them to be stored as [multi-value strings](multi-value-dimensions.md), using the same `STRING` column type
as regular scalar strings. SQL `BIGINT ARRAY` and `DOUBLE ARRAY` cannot be loaded under `arrayIngestMode: mvd`. This
is the default behavior when `arrayIngestMode` is not provided in your query context, although the default behavior
may change to `array` in a future release.

When `arrayIngestMode` is `none`, Druid throws an exception when trying to store any type of arrays. This mode is most
useful when set in the system default query context with `druid.query.default.context.arrayIngestMode = none`, in cases
where the cluster administrator wants SQL query authors to explicitly provide one or the other in their query context.

The following table summarizes the differences in SQL ARRAY handling between `arrayIngestMode: array` and
`arrayIngestMode: mvd`.

| SQL type | Stored type when `arrayIngestMode: array` | Stored type when `arrayIngestMode: mvd` (default) |
|---|---|---|
|`VARCHAR ARRAY`|`ARRAY<STRING>`|[multi-value `STRING`](multi-value-dimensions.md)|
|`BIGINT ARRAY`|`ARRAY<LONG>`|not possible (validation error)|
|`DOUBLE ARRAY`|`ARRAY<DOUBLE>`|not possible (validation error)|

In either mode, you can explicitly wrap string arrays in `ARRAY_TO_MV` to cause them to be stored as
[multi-value strings](multi-value-dimensions.md).

When validating a SQL INSERT or REPLACE statement that contains arrays, Druid checks whether the statement would lead
to mixing string arrays and multi-value strings in the same column. If this condition is detected, the statement fails
validation unless the column is named under the `skipTypeVerification` context parameter. This parameter can be either
a comma-separated list of column names, or a JSON array in string form. This validation is done to prevent accidentally
mixing arrays and multi-value strings in the same column.
Arrays can be inserted with [SQL-based ingestion](../multi-stage-query/index.md).

#### Examples

Set [`arrayIngestMode: array`](#arrayingestmode) in your query context to run the following examples.

```sql
REPLACE INTO "array_example" OVERWRITE ALL
WITH "ext" AS (
@@ -169,6 +133,35 @@ GROUP BY 1,2,3,4,5
PARTITIONED BY DAY
```

#### `arrayIngestMode`

The `arrayIngestMode` query context flag provides backwards-compatible behavior for Druid versions older than 31.

When `arrayIngestMode` is `array`, SQL ARRAY types are stored using Druid array columns. This is recommended for new
tables and is the default in Druid 31 and newer.

When `arrayIngestMode` is `mvd` (legacy), SQL `VARCHAR ARRAY` values are implicitly wrapped in [`ARRAY_TO_MV`](sql-functions.md#array_to_mv).
This causes them to be stored as [multi-value strings](multi-value-dimensions.md), using the same `STRING` column type
as regular scalar strings. SQL `BIGINT ARRAY` and `DOUBLE ARRAY` cannot be loaded under `arrayIngestMode: mvd`. This
mode is not recommended; it is provided only for backwards compatibility and will be removed in a future release.

The following table summarizes the differences in SQL ARRAY handling between `arrayIngestMode: array` and
`arrayIngestMode: mvd`.

| SQL type | Stored type when `arrayIngestMode: array` (default) | Stored type when `arrayIngestMode: mvd` |
|---|---|---|
|`VARCHAR ARRAY`|`ARRAY<STRING>`|[multi-value `STRING`](multi-value-dimensions.md)|
|`BIGINT ARRAY`|`ARRAY<LONG>`|not possible (validation error)|
|`DOUBLE ARRAY`|`ARRAY<DOUBLE>`|not possible (validation error)|

In either mode, you can explicitly wrap string arrays in `ARRAY_TO_MV` to cause them to be stored as
[multi-value strings](multi-value-dimensions.md).
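
For example, the following minimal sketch ingests a string array as a multi-value `STRING` column by wrapping it in
`ARRAY_TO_MV`, regardless of the `arrayIngestMode` setting. The table name `"mvd_example"`, the `"tags"` input column,
and the inline input data are hypothetical stand-ins:

```sql
REPLACE INTO "mvd_example" OVERWRITE ALL
WITH "ext" AS (
  SELECT *
  FROM TABLE(
    EXTERN(
      '{"type":"inline","data":"{\"timestamp\": \"2023-01-01T00:00:00\", \"tags\": [\"a\", \"b\", \"c\"]}"}',
      '{"type":"json"}'
    )
  ) EXTEND ("timestamp" VARCHAR, "tags" VARCHAR ARRAY)
)
SELECT
  TIME_PARSE("timestamp") AS "__time",
  -- ARRAY_TO_MV stores the string array as a multi-value STRING column
  -- instead of an ARRAY<STRING> column.
  ARRAY_TO_MV("tags") AS "tags"
FROM "ext"
PARTITIONED BY DAY
```

Without the `ARRAY_TO_MV` wrapper, the same column would be stored as `ARRAY<STRING>` under the default
`arrayIngestMode: array`.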

When validating a SQL INSERT or REPLACE statement that contains arrays, Druid checks whether the statement would lead
to mixing string arrays and multi-value strings in the same column. If this condition is detected, the statement fails
validation unless the column is named under the `skipTypeVerification` context parameter. This parameter can be either
a comma-separated list of column names, or a JSON array in string form. This validation is done to prevent accidentally
mixing arrays and multi-value strings in the same column.

## Querying arrays

@@ -284,9 +277,9 @@ Avoid confusing string arrays with [multi-value dimensions](multi-value-dimensio

Use care during ingestion to ensure you get the type you want.

To get arrays when performing an ingestion using JSON ingestion specs, such as [native batch](../ingestion/native-batch.md) or streaming ingestion such as with [Apache Kafka](../ingestion/kafka-ingestion.md), use dimension type `auto` or enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), write a query that generates arrays and set the context parameter `"arrayIngestMode": "array"`. Arrays may contain strings or numbers.
To get arrays when performing an ingestion using JSON ingestion specs, such as [native batch](../ingestion/native-batch.md) or streaming ingestion such as with [Apache Kafka](../ingestion/kafka-ingestion.md), use dimension type `auto` or enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), write a query that generates arrays. Arrays may contain strings or numbers.

To get multi-value dimensions when performing an ingestion using JSON ingestion specs, use dimension type `string` and do not enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), wrap arrays in [`ARRAY_TO_MV`](multi-value-dimensions.md#sql-based-ingestion), which ensures you get multi-value dimensions in any `arrayIngestMode`. Multi-value dimensions can only contain strings.
To get multi-value dimensions when performing an ingestion using JSON ingestion specs, use dimension type `string` and do not enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), wrap arrays in [`ARRAY_TO_MV`](multi-value-dimensions.md#sql-based-ingestion), which ensures you get multi-value dimensions. Multi-value dimensions can only contain strings.

You can tell which type you have by checking the `INFORMATION_SCHEMA.COLUMNS` table, using a query like:

4 changes: 2 additions & 2 deletions docs/querying/multi-value-dimensions.md
@@ -507,9 +507,9 @@ Avoid confusing string arrays with [multi-value dimensions](multi-value-dimensio

Use care during ingestion to ensure you get the type you want.

To get arrays when performing an ingestion using JSON ingestion specs, such as [native batch](../ingestion/native-batch.md) or streaming ingestion such as with [Apache Kafka](../ingestion/kafka-ingestion.md), use dimension type `auto` or enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), write a query that generates arrays and set the context parameter [`"arrayIngestMode": "array"`](arrays.md#arrayingestmode). Arrays may contain strings or numbers.
To get arrays when performing an ingestion using JSON ingestion specs, such as [native batch](../ingestion/native-batch.md) or streaming ingestion such as with [Apache Kafka](../ingestion/kafka-ingestion.md), use dimension type `auto` or enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), write a query that generates arrays. Arrays may contain strings or numbers.

To get multi-value dimensions when performing an ingestion using JSON ingestion specs, use dimension type `string` and do not enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), wrap arrays in [`ARRAY_TO_MV`](multi-value-dimensions.md#sql-based-ingestion), which ensures you get multi-value dimensions in any [`arrayIngestMode`](arrays.md#arrayingestmode). Multi-value dimensions can only contain strings.
To get multi-value dimensions when performing an ingestion using JSON ingestion specs, use dimension type `string` and do not enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), wrap arrays in [`ARRAY_TO_MV`](multi-value-dimensions.md#sql-based-ingestion). Multi-value dimensions can only contain strings.

You can tell which type you have by checking the `INFORMATION_SCHEMA.COLUMNS` table, using a query like:
