Merge remote-tracking branch 'upstream/master' into cursor-rework
clintropolis committed Jul 26, 2024
2 parents 3824754 + 9b76d13 commit d22e1cf
Showing 274 changed files with 2,904 additions and 1,825 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/stale.yml
@@ -52,7 +52,7 @@ jobs:
stale-issue-label: stale
stale-pr-label: stale
exempt-issue-labels: 'Evergreen,Security,Bug,Proposal,Design Review,Improvement,Performance,Refactoring,Apache,Area - Automation/Static Analysis,Area - Batch Indexing,Area - Cache,Area - Deep Storage,Area - Dependencies,Area - Dependency Injection,Area - Dev,Area - Documentation,Area - Extension,Area - Kafka/Kinesis Indexing,Area - Lookups,Area - Metadata,Area - Metrics/Event Emitting,Area - Null Handling,Area - Operations,Area - Query UI,Area - Querying,Area - Router,Area - Segment Balancing/Coordination,Area - Segment Format and Ser/De,Area - SQL,Area - Testing,Area - Web Console,Area - Zookeeper/Curator,Compatibility,Contributions Welcome,Development Blocker,Ease of Use,Error handling,HTTP,Incompatible,Stable API'
exempt-pr-labels: 'Evergreen'
exempt-pr-labels: 'Evergreen,Area - Dependencies'
exempt-milestones: true
exempt-assignees: true
ascending: true
193 changes: 193 additions & 0 deletions GroupByDeserializationBenchmark.java
@@ -0,0 +1,193 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.benchmark;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableMap;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.guice.NestedDataModule;
import org.apache.druid.jackson.AggregatorsModule;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.query.QueryRunnerTestHelper;
import org.apache.druid.query.aggregation.SerializablePairLongString;
import org.apache.druid.query.aggregation.post.ConstantPostAggregator;
import org.apache.druid.query.dimension.DefaultDimensionSpec;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.query.groupby.GroupByQuery;
import org.apache.druid.query.groupby.GroupByQueryConfig;
import org.apache.druid.query.groupby.GroupByQueryQueryToolChest;
import org.apache.druid.query.groupby.ResultRow;
import org.apache.druid.segment.TestHelper;
import org.apache.druid.segment.column.ColumnType;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;

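/**
 * Benchmark for deserializing groupBy {@link ResultRow}s with the toolchest-decorated {@link ObjectMapper}.
 * Measures average deserialization time while varying the number of dimensions, the ratio of primitive (STRING)
 * to complex (json or serializablePairLongString) dimensions, and whether the backward-compatible map-based
 * intermediate result format is enabled.
 */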
@State(Scope.Benchmark)
@Fork(value = 1)
@Warmup(iterations = 3)
@Measurement(iterations = 5)
public class GroupByDeserializationBenchmark
{

static {
NullHandling.initializeForTests();
NestedDataModule.registerHandlersAndSerde();
AggregatorsModule.registerComplexMetricsAndSerde();
}

@Param({"100", "1000"})
private int numDimensions;

@Param({"0", "0.25", "0.5", "0.75", "0.85", "0.95", "0.99", "1.0"})
private double primitiveToComplexDimensionRatio;

@Param({"json", "serializablePairLongString"})
private String complexDimensionType;

@Param({"true", "false"})
private boolean backwardCompatibility;

private GroupByQuery sqlQuery;
private String serializedRow;
private GroupByQueryQueryToolChest groupByQueryQueryToolChest;
private ObjectMapper decoratedMapper;

@Setup(Level.Trial)
public void setup() throws JsonProcessingException
{
final ObjectMapper undecoratedMapper = TestHelper.makeJsonMapper();
undecoratedMapper.registerModules(NestedDataModule.getJacksonModulesList());
undecoratedMapper.registerModule(new AggregatorsModule());
final Pair<GroupByQuery, String> sqlQueryAndResultRow = sqlQueryAndResultRow(
numDimensions,
primitiveToComplexDimensionRatio,
complexDimensionType,
undecoratedMapper
);
sqlQuery = sqlQueryAndResultRow.lhs;
serializedRow = sqlQueryAndResultRow.rhs;

groupByQueryQueryToolChest = new GroupByQueryQueryToolChest(
null,
() -> new GroupByQueryConfig()
{
@Override
public boolean isIntermediateResultAsMapCompat()
{
return backwardCompatibility;
}
},
null,
null
);

decoratedMapper = groupByQueryQueryToolChest.decorateObjectMapper(undecoratedMapper, sqlQuery);
}

@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public void deserializeResultRows(Blackhole blackhole) throws JsonProcessingException
{
blackhole.consume(decoratedMapper.readValue(serializedRow, ResultRow.class));
}

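/**
 * Builds a groupBy query with the requested mix of primitive and complex dimensions, a count aggregator, and a
 * constant post aggregator, and returns it together with a serialized array-based result row matching that query.
 */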
private static Pair<GroupByQuery, String> sqlQueryAndResultRow(
final int numDimensions,
final double primitiveToComplexDimensionRatio,
final String complexDimensionType,
final ObjectMapper mapper
) throws JsonProcessingException
{
final int numPrimitiveDimensions = (int) Math.floor(primitiveToComplexDimensionRatio * numDimensions);
final int numComplexDimensions = numDimensions - numPrimitiveDimensions;

final List<DimensionSpec> dimensions = new ArrayList<>();
final List<Object> rowList = new ArrayList<>();

// Add timestamp
rowList.add(DateTimes.of("2000").getMillis());

for (int i = 0; i < numPrimitiveDimensions; ++i) {
dimensions.add(
new DefaultDimensionSpec(
StringUtils.format("primitive%d", i),
StringUtils.format("primitive%d", i),
ColumnType.STRING
)
);
rowList.add("foo");
}

for (int i = 0; i < numComplexDimensions; ++i) {
dimensions.add(
new DefaultDimensionSpec(
StringUtils.format("complex%d", i),
StringUtils.format("complex%d", i),
ColumnType.ofComplex(complexDimensionType)
)
);

// Serialized version of this object is a valid value for both json and long-string pair dimensions
rowList.add(new SerializablePairLongString(1L, "test"));
}

// Add aggregator
rowList.add(100);

// Add post aggregator
rowList.add(10.0);

GroupByQuery query = GroupByQuery.builder()
.setDataSource("foo")
.setQuerySegmentSpec(QueryRunnerTestHelper.FULL_ON_INTERVAL_SPEC)
.setDimensions(dimensions)
.setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT)
.setPostAggregatorSpecs(Collections.singletonList(new ConstantPostAggregator(
"post",
10
)))
.setContext(ImmutableMap.of(GroupByQueryConfig.CTX_KEY_ARRAY_RESULT_ROWS, true))
.setGranularity(Granularities.DAY)
.build();

return Pair.of(query, mapper.writeValueAsString(rowList));
}
}
71 changes: 32 additions & 39 deletions docs/querying/arrays.md
@@ -71,46 +71,10 @@ The following shows an example `dimensionsSpec` for native ingestion of the data

### SQL-based ingestion

#### `arrayIngestMode`

Arrays can be inserted with [SQL-based ingestion](../multi-stage-query/index.md) when you include the query context
parameter `arrayIngestMode: array`.

When `arrayIngestMode` is `array`, SQL ARRAY types are stored using Druid array columns. This is recommended for new
tables.

When `arrayIngestMode` is `mvd`, SQL `VARCHAR ARRAY` are implicitly wrapped in [`ARRAY_TO_MV`](sql-functions.md#array_to_mv).
This causes them to be stored as [multi-value strings](multi-value-dimensions.md), using the same `STRING` column type
as regular scalar strings. SQL `BIGINT ARRAY` and `DOUBLE ARRAY` cannot be loaded under `arrayIngestMode: mvd`. This
is the default behavior when `arrayIngestMode` is not provided in your query context, although the default behavior
may change to `array` in a future release.

When `arrayIngestMode` is `none`, Druid throws an exception when trying to store any type of arrays. This mode is most
useful when set in the system default query context with `druid.query.default.context.arrayIngestMode = none`, in cases
where the cluster administrator wants SQL query authors to explicitly provide one or the other in their query context.

The following table summarizes the differences in SQL ARRAY handling between `arrayIngestMode: array` and
`arrayIngestMode: mvd`.

| SQL type | Stored type when `arrayIngestMode: array` | Stored type when `arrayIngestMode: mvd` (default) |
|---|---|---|
|`VARCHAR ARRAY`|`ARRAY<STRING>`|[multi-value `STRING`](multi-value-dimensions.md)|
|`BIGINT ARRAY`|`ARRAY<LONG>`|not possible (validation error)|
|`DOUBLE ARRAY`|`ARRAY<DOUBLE>`|not possible (validation error)|

In either mode, you can explicitly wrap string arrays in `ARRAY_TO_MV` to cause them to be stored as
[multi-value strings](multi-value-dimensions.md).

When validating a SQL INSERT or REPLACE statement that contains arrays, Druid checks whether the statement would lead
to mixing string arrays and multi-value strings in the same column. If this condition is detected, the statement fails
validation unless the column is named under the `skipTypeVerification` context parameter. This parameter can be either
a comma-separated list of column names, or a JSON array in string form. This validation is done to prevent accidentally
mixing arrays and multi-value strings in the same column.
Arrays can be inserted with [SQL-based ingestion](../multi-stage-query/index.md).

#### Examples

Set [`arrayIngestMode: array`](#arrayingestmode) in your query context to run the following examples.

```sql
REPLACE INTO "array_example" OVERWRITE ALL
WITH "ext" AS (
@@ -169,6 +133,35 @@ GROUP BY 1,2,3,4,5
PARTITIONED BY DAY
```

#### `arrayIngestMode`

The `arrayIngestMode` query context flag provides backwards-compatible behavior for Druid versions older than 31.

When `arrayIngestMode` is `array`, SQL ARRAY types are stored using Druid array columns. This is recommended for new
tables and is the default in Druid 31 and newer.

When `arrayIngestMode` is `mvd` (legacy), SQL `VARCHAR ARRAY` values are implicitly wrapped in [`ARRAY_TO_MV`](sql-functions.md#array_to_mv).
This causes them to be stored as [multi-value strings](multi-value-dimensions.md), using the same `STRING` column type
as regular scalar strings. SQL `BIGINT ARRAY` and `DOUBLE ARRAY` cannot be loaded under `arrayIngestMode: mvd`. This
mode is not recommended; it is provided only for backwards compatibility and will be removed in a future release.

The following table summarizes the differences in SQL ARRAY handling between `arrayIngestMode: array` and
`arrayIngestMode: mvd`.

| SQL type | Stored type when `arrayIngestMode: array` (default) | Stored type when `arrayIngestMode: mvd` |
|---|---|---|
|`VARCHAR ARRAY`|`ARRAY<STRING>`|[multi-value `STRING`](multi-value-dimensions.md)|
|`BIGINT ARRAY`|`ARRAY<LONG>`|not possible (validation error)|
|`DOUBLE ARRAY`|`ARRAY<DOUBLE>`|not possible (validation error)|

In either mode, you can explicitly wrap string arrays in `ARRAY_TO_MV` to cause them to be stored as
[multi-value strings](multi-value-dimensions.md).
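
For example, the following minimal sketch ingests a string array as a multi-value `STRING` column by wrapping it in
`ARRAY_TO_MV`, regardless of the `arrayIngestMode` setting. The table name `"mvd_example"`, the `"tags"` input column,
and the inline input data are hypothetical stand-ins:

```sql
REPLACE INTO "mvd_example" OVERWRITE ALL
WITH "ext" AS (
  SELECT *
  FROM TABLE(
    EXTERN(
      '{"type":"inline","data":"{\"timestamp\": \"2023-01-01T00:00:00\", \"tags\": [\"a\", \"b\", \"c\"]}"}',
      '{"type":"json"}'
    )
  ) EXTEND ("timestamp" VARCHAR, "tags" VARCHAR ARRAY)
)
SELECT
  TIME_PARSE("timestamp") AS "__time",
  -- ARRAY_TO_MV stores the string array as a multi-value STRING column
  -- instead of an ARRAY<STRING> column.
  ARRAY_TO_MV("tags") AS "tags"
FROM "ext"
PARTITIONED BY DAY
```

Without the `ARRAY_TO_MV` wrapper, the same column would be stored as `ARRAY<STRING>` under the default
`arrayIngestMode: array`.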

When validating a SQL INSERT or REPLACE statement that contains arrays, Druid checks whether the statement would lead
to mixing string arrays and multi-value strings in the same column. If this condition is detected, the statement fails
validation unless the column is named under the `skipTypeVerification` context parameter. This parameter can be either
a comma-separated list of column names, or a JSON array in string form. This validation is done to prevent accidentally
mixing arrays and multi-value strings in the same column.

## Querying arrays

@@ -284,9 +277,9 @@ Avoid confusing string arrays with [multi-value dimensions](multi-value-dimensio

Use care during ingestion to ensure you get the type you want.

To get arrays when performing an ingestion using JSON ingestion specs, such as [native batch](../ingestion/native-batch.md) or streaming ingestion such as with [Apache Kafka](../ingestion/kafka-ingestion.md), use dimension type `auto` or enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), write a query that generates arrays and set the context parameter `"arrayIngestMode": "array"`. Arrays may contain strings or numbers.
To get arrays when performing an ingestion using JSON ingestion specs, such as [native batch](../ingestion/native-batch.md) or streaming ingestion such as with [Apache Kafka](../ingestion/kafka-ingestion.md), use dimension type `auto` or enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), write a query that generates arrays. Arrays may contain strings or numbers.

To get multi-value dimensions when performing an ingestion using JSON ingestion specs, use dimension type `string` and do not enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), wrap arrays in [`ARRAY_TO_MV`](multi-value-dimensions.md#sql-based-ingestion), which ensures you get multi-value dimensions in any `arrayIngestMode`. Multi-value dimensions can only contain strings.
To get multi-value dimensions when performing an ingestion using JSON ingestion specs, use dimension type `string` and do not enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), wrap arrays in [`ARRAY_TO_MV`](multi-value-dimensions.md#sql-based-ingestion), which ensures you get multi-value dimensions. Multi-value dimensions can only contain strings.

You can tell which type you have by checking the `INFORMATION_SCHEMA.COLUMNS` table, using a query like:

4 changes: 2 additions & 2 deletions docs/querying/multi-value-dimensions.md
@@ -507,9 +507,9 @@ Avoid confusing string arrays with [multi-value dimensions](multi-value-dimensio

Use care during ingestion to ensure you get the type you want.

To get arrays when performing an ingestion using JSON ingestion specs, such as [native batch](../ingestion/native-batch.md) or streaming ingestion such as with [Apache Kafka](../ingestion/kafka-ingestion.md), use dimension type `auto` or enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), write a query that generates arrays and set the context parameter [`"arrayIngestMode": "array"`](arrays.md#arrayingestmode). Arrays may contain strings or numbers.
To get arrays when performing an ingestion using JSON ingestion specs, such as [native batch](../ingestion/native-batch.md) or streaming ingestion such as with [Apache Kafka](../ingestion/kafka-ingestion.md), use dimension type `auto` or enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), write a query that generates arrays. Arrays may contain strings or numbers.

To get multi-value dimensions when performing an ingestion using JSON ingestion specs, use dimension type `string` and do not enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), wrap arrays in [`ARRAY_TO_MV`](multi-value-dimensions.md#sql-based-ingestion), which ensures you get multi-value dimensions in any [`arrayIngestMode`](arrays.md#arrayingestmode). Multi-value dimensions can only contain strings.
To get multi-value dimensions when performing an ingestion using JSON ingestion specs, use dimension type `string` and do not enable `useSchemaDiscovery`. When performing a [SQL-based ingestion](../multi-stage-query/index.md), wrap arrays in [`ARRAY_TO_MV`](multi-value-dimensions.md#sql-based-ingestion). Multi-value dimensions can only contain strings.

You can tell which type you have by checking the `INFORMATION_SCHEMA.COLUMNS` table, using a query like:
