diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/query/ScanBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/query/ScanBenchmark.java
index b7bfe56ff02c..a33dbc0dcfc3 100644
--- a/benchmarks/src/test/java/org/apache/druid/benchmark/query/ScanBenchmark.java
+++ b/benchmarks/src/test/java/org/apache/druid/benchmark/query/ScanBenchmark.java
@@ -262,12 +262,8 @@ public void setup()
rowsPerSegment
);
- final ScanQueryConfig config = new ScanQueryConfig().setLegacy(false);
factory = new ScanQueryRunnerFactory(
- new ScanQueryQueryToolChest(
- config,
- DefaultGenericQueryMetricsFactory.instance()
- ),
+ new ScanQueryQueryToolChest(DefaultGenericQueryMetricsFactory.instance()),
new ScanQueryEngine(),
new ScanQueryConfig()
);
diff --git a/benchmarks/src/test/java/org/apache/druid/server/coordinator/NewestSegmentFirstPolicyBenchmark.java b/benchmarks/src/test/java/org/apache/druid/server/coordinator/NewestSegmentFirstPolicyBenchmark.java
index 37f7b665927c..98c27c4b2b8c 100644
--- a/benchmarks/src/test/java/org/apache/druid/server/coordinator/NewestSegmentFirstPolicyBenchmark.java
+++ b/benchmarks/src/test/java/org/apache/druid/server/coordinator/NewestSegmentFirstPolicyBenchmark.java
@@ -141,7 +141,7 @@ public void setup()
@Benchmark
public void measureNewestSegmentFirstPolicy(Blackhole blackhole)
{
- final CompactionSegmentIterator iterator = policy.reset(compactionConfigs, dataSources, Collections.emptyMap());
+ final CompactionSegmentIterator iterator = policy.createIterator(compactionConfigs, dataSources, Collections.emptyMap());
for (int i = 0; i < numCompactionTaskSlots && iterator.hasNext(); i++) {
blackhole.consume(iterator.next());
}
diff --git a/docs/api-reference/sql-ingestion-api.md b/docs/api-reference/sql-ingestion-api.md
index 1b587cf4bac1..275be9e63dc8 100644
--- a/docs/api-reference/sql-ingestion-api.md
+++ b/docs/api-reference/sql-ingestion-api.md
@@ -474,7 +474,6 @@ The response shows an example report for a query.
"agent_type",
"timestamp"
],
- "legacy": false,
"context": {
"finalize": false,
"finalizeAggregations": false,
diff --git a/docs/api-reference/tasks-api.md b/docs/api-reference/tasks-api.md
index dbd8fa7d5425..d94be5b0c5fd 100644
--- a/docs/api-reference/tasks-api.md
+++ b/docs/api-reference/tasks-api.md
@@ -914,13 +914,10 @@ Host: http://ROUTER_IP:ROUTER_PORT
### Get task segments
:::info
- This API is deprecated and will be removed in future releases.
+ This API is no longer supported and always returns a 404 response.
+ Use the metric `segment/added/bytes` instead to identify the segment IDs committed by a task.
:::
-Retrieves information about segments generated by the task given the task ID. To hit this endpoint, make sure to enable the audit log config on the Overlord with `druid.indexer.auditLog.enabled = true`.
-
-In addition to enabling audit logs, configure a cleanup strategy to prevent overloading the metadata store with old audit logs which may cause performance issues. To enable automated cleanup of audit logs on the Coordinator, set `druid.coordinator.kill.audit.on`. You may also manually export the audit logs to external storage. For more information, see [Audit records](../operations/clean-metadata-store.md#audit-records).
-
#### URL
`GET` `/druid/indexer/v1/task/{taskId}/segments`
@@ -929,12 +926,14 @@ In addition to enabling audit logs, configure a cleanup strategy to prevent over
-
+
-
-
-*Successfully retrieved task segments*
+```json
+{
+ "error": "Segment IDs committed by a task action are not persisted anymore. Use the metric 'segment/added/bytes' to identify the segments created by a task."
+}
+```
diff --git a/docs/configuration/extensions.md b/docs/configuration/extensions.md
index d396bc29000d..bc7a05a4ae23 100644
--- a/docs/configuration/extensions.md
+++ b/docs/configuration/extensions.md
@@ -80,7 +80,7 @@ All of these community extensions can be downloaded using [pull-deps](../operati
|aliyun-oss-extensions|Aliyun OSS deep storage |[link](../development/extensions-contrib/aliyun-oss-extensions.md)|
|ambari-metrics-emitter|Ambari Metrics Emitter |[link](../development/extensions-contrib/ambari-metrics-emitter.md)|
|druid-cassandra-storage|Apache Cassandra deep storage.|[link](../development/extensions-contrib/cassandra.md)|
-|druid-cloudfiles-extensions|Rackspace Cloudfiles deep storage and firehose.|[link](../development/extensions-contrib/cloudfiles.md)|
+|druid-cloudfiles-extensions|Rackspace Cloudfiles deep storage.|[link](../development/extensions-contrib/cloudfiles.md)|
|druid-compressed-bigdecimal|Compressed Big Decimal Type | [link](../development/extensions-contrib/compressed-big-decimal.md)|
|druid-ddsketch|Support for DDSketch approximate quantiles based on [DDSketch](https://github.com/datadog/sketches-java) | [link](../development/extensions-contrib/ddsketch-quantiles.md)|
|druid-deltalake-extensions|Support for ingesting Delta Lake tables.|[link](../development/extensions-contrib/delta-lake.md)|
diff --git a/docs/configuration/index.md b/docs/configuration/index.md
index c627e9fd7f07..3b3c2711d3b4 100644
--- a/docs/configuration/index.md
+++ b/docs/configuration/index.md
@@ -395,7 +395,6 @@ Metric monitoring is an essential part of Druid operations. The following monito
|`org.apache.druid.java.util.metrics.CgroupCpuSetMonitor`|Reports CPU core/HT and memory node allocations as per the `cpuset` cgroup.|
|`org.apache.druid.java.util.metrics.CgroupDiskMonitor`|Reports disk statistic as per the blkio cgroup.|
|`org.apache.druid.java.util.metrics.CgroupMemoryMonitor`|Reports memory statistic as per the memory cgroup.|
-|`org.apache.druid.server.metrics.EventReceiverFirehoseMonitor`|Reports how many events have been queued in the EventReceiverFirehose.|
|`org.apache.druid.server.metrics.HistoricalMetricsMonitor`|Reports statistics on Historical services. Available only on Historical services.|
|`org.apache.druid.server.metrics.SegmentStatsMonitor` | **EXPERIMENTAL** Reports statistics about segments on Historical services. Available only on Historical services. Not to be used when lazy loading is configured.|
|`org.apache.druid.server.metrics.QueryCountStatsMonitor`|Reports how many queries have been successful/failed/interrupted.|
@@ -607,7 +606,7 @@ the [HDFS input source](../ingestion/input-sources.md#hdfs-input-source).
|Property|Possible values|Description|Default|
|--------|---------------|-----------|-------|
-|`druid.ingestion.hdfs.allowedProtocols`|List of protocols|Allowed protocols for the HDFS input source and HDFS firehose.|`["hdfs"]`|
+|`druid.ingestion.hdfs.allowedProtocols`|List of protocols|Allowed protocols for the HDFS input source.|`["hdfs"]`|
#### HTTP input source
@@ -616,7 +615,7 @@ the [HTTP input source](../ingestion/input-sources.md#http-input-source).
|Property|Possible values|Description|Default|
|--------|---------------|-----------|-------|
-|`druid.ingestion.http.allowedProtocols`|List of protocols|Allowed protocols for the HTTP input source and HTTP firehose.|`["http", "https"]`|
+|`druid.ingestion.http.allowedProtocols`|List of protocols|Allowed protocols for the HTTP input source.|`["http", "https"]`|
### External data access security configuration
@@ -1501,7 +1500,6 @@ Additional Peon configs include:
|`druid.peon.mode`|One of `local` or `remote`. Setting this property to `local` means you intend to run the Peon as a standalone process which is not recommended.|`remote`|
|`druid.indexer.task.baseDir`|Base temporary working directory.|`System.getProperty("java.io.tmpdir")`|
|`druid.indexer.task.baseTaskDir`|Base temporary working directory for tasks.|`${druid.indexer.task.baseDir}/persistent/task`|
-|`druid.indexer.task.batchProcessingMode`| Batch ingestion tasks have three operating modes to control construction and tracking for intermediary segments: `OPEN_SEGMENTS`, `CLOSED_SEGMENTS`, and `CLOSED_SEGMENT_SINKS`. `OPEN_SEGMENTS` uses the streaming ingestion code path and performs a `mmap` on intermediary segments to build a timeline to make these segments available to realtime queries. Batch ingestion doesn't require intermediary segments, so the default mode, `CLOSED_SEGMENTS`, eliminates `mmap` of intermediary segments. `CLOSED_SEGMENTS` mode still tracks the entire set of segments in heap. The `CLOSED_SEGMENTS_SINKS` mode is the most aggressive configuration and should have the smallest memory footprint. It eliminates in-memory tracking and `mmap` of intermediary segments produced during segment creation. `CLOSED_SEGMENTS_SINKS` mode isn't as well tested as other modes so is currently considered experimental. You can use `OPEN_SEGMENTS` mode if problems occur with the 2 newer modes. |`CLOSED_SEGMENTS`|
|`druid.indexer.task.defaultHadoopCoordinates`|Hadoop version to use with HadoopIndexTasks that do not request a particular version.|`org.apache.hadoop:hadoop-client-api:3.3.6`, `org.apache.hadoop:hadoop-client-runtime:3.3.6`|
|`druid.indexer.task.defaultRowFlushBoundary`|Highest row count before persisting to disk. Used for indexing generating tasks.|75000|
|`druid.indexer.task.directoryLockTimeout`|Wait this long for zombie Peons to exit before giving up on their replacements.|PT10M|
diff --git a/docs/development/extensions-contrib/cloudfiles.md b/docs/development/extensions-contrib/cloudfiles.md
index 83a1d0c7e10b..d4e7592ee7f7 100644
--- a/docs/development/extensions-contrib/cloudfiles.md
+++ b/docs/development/extensions-contrib/cloudfiles.md
@@ -40,59 +40,3 @@ To use this Apache Druid extension, [include](../../configuration/extensions.md#
|`druid.cloudfiles.apiKey`||Rackspace Cloud API key.|Must be set.|
|`druid.cloudfiles.provider`|rackspace-cloudfiles-us,rackspace-cloudfiles-uk|Name of the provider depending on the region.|Must be set.|
|`druid.cloudfiles.useServiceNet`|true,false|Whether to use the internal service net.|true|
-
-## Firehose
-
-
-
-#### StaticCloudFilesFirehose
-
-This firehose ingests events, similar to the StaticAzureBlobStoreFirehose, but from Rackspace's Cloud Files.
-
-Data is newline delimited, with one JSON object per line and parsed as per the `InputRowParser` configuration.
-
-The storage account is shared with the one used for Rackspace's Cloud Files deep storage functionality, but blobs can be in a different region and container.
-
-As with the Azure blobstore, it is assumed to be gzipped if the extension ends in .gz
-
-This firehose is _splittable_ and can be used by [native parallel index tasks](../../ingestion/native-batch.md).
-Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object.
-
-Sample spec:
-
-```json
-"firehose" : {
- "type" : "static-cloudfiles",
- "blobs": [
- {
- "region": "DFW"
- "container": "container",
- "path": "/path/to/your/file.json"
- },
- {
- "region": "ORD"
- "container": "anothercontainer",
- "path": "/another/path.json"
- }
- ]
-}
-```
-This firehose provides caching and prefetching features. In IndexTask, a firehose can be read twice if intervals or
-shardSpecs are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scan of objects is slow.
-
-|property|description|default|required?|
-|--------|-----------|-------|---------|
-|type|This should be `static-cloudfiles`.|N/A|yes|
-|blobs|JSON array of Cloud Files blobs.|N/A|yes|
-|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache.|1073741824|no|
-|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|no|
-|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|no|
-|fetchTimeout|Timeout for fetching a Cloud Files object.|60000|no|
-|maxFetchRetry|Maximum retry for fetching a Cloud Files object.|3|no|
-
-Cloud Files Blobs:
-
-|property|description|default|required?|
-|--------|-----------|-------|---------|
-|container|Name of the Cloud Files container|N/A|yes|
-|path|The path where data is located.|N/A|yes|
diff --git a/docs/development/extensions-core/postgresql.md b/docs/development/extensions-core/postgresql.md
index e0d9337b2ccb..919bf372b844 100644
--- a/docs/development/extensions-core/postgresql.md
+++ b/docs/development/extensions-core/postgresql.md
@@ -87,7 +87,7 @@ In most cases, the configuration options map directly to the [postgres JDBC conn
| `druid.metadata.postgres.ssl.sslPasswordCallback` | The classname of the SSL password provider. | none | no |
| `druid.metadata.postgres.dbTableSchema` | druid meta table schema | `public` | no |
-### PostgreSQL Firehose
+### PostgreSQL InputSource
The PostgreSQL extension provides an implementation of an [SQL input source](../../ingestion/input-sources.md) which can be used to ingest data into Druid from a PostgreSQL database.
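+
+A minimal sketch of an SQL input source spec that reads from PostgreSQL is shown below. The connection details, credentials, and query are placeholders rather than values from a real deployment:
+
+```json
+{
+  "type": "sql",
+  "database": {
+    "type": "postgresql",
+    "connectorConfig": {
+      "connectURI": "jdbc:postgresql://host:port/schema",
+      "user": "user",
+      "password": "password"
+    }
+  },
+  "sqls": ["SELECT * FROM some_table"]
+}
+```
+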
diff --git a/docs/development/overview.md b/docs/development/overview.md
index 5ff77af07cf4..11c67ddfa2c4 100644
--- a/docs/development/overview.md
+++ b/docs/development/overview.md
@@ -53,8 +53,17 @@ Most of the coordination logic for (real-time) ingestion is in the Druid indexin
## Real-time Ingestion
-Druid loads data through `FirehoseFactory.java` classes. Firehoses often wrap other firehoses, where, similar to the design of the
-query runners, each firehose adds a layer of logic, and the persist and hand-off logic is in `RealtimePlumber.java`.
+Druid streaming tasks are based on the 'seekable stream' classes such as `SeekableStreamSupervisor.java`,
+`SeekableStreamIndexTask.java`, and `SeekableStreamIndexTaskRunner.java`. The data processing happens through
+`StreamAppenderator.java`, and the persist and hand-off logic is in `StreamAppenderatorDriver.java`.
+
+## Native Batch Ingestion
+
+The main task types for Druid native batch ingestion are based on `AbstractBatchTask.java` and `AbstractBatchSubtask.java`.
+Parallel processing uses `ParallelIndexSupervisorTask.java`, which spawns subtasks to perform various operations such
+as data analysis and partitioning depending on the task specification. Segment generation happens in
+`SinglePhaseSubTask.java`, `PartialHashSegmentGenerateTask.java`, or `PartialRangeSegmentGenerateTask.java` through
+`BatchAppenderator`, and the persist and hand-off logic is in `BatchAppenderatorDriver.java`.
## Hadoop-based Batch Ingestion
diff --git a/docs/ingestion/native-batch-firehose.md b/docs/ingestion/native-batch-firehose.md
index db2b3e8779e6..16e9634ff297 100644
--- a/docs/ingestion/native-batch-firehose.md
+++ b/docs/ingestion/native-batch-firehose.md
@@ -24,319 +24,5 @@ sidebar_label: "Firehose (deprecated)"
-->
:::info
- Firehose ingestion is deprecated. See [Migrate from firehose to input source ingestion](../operations/migrate-from-firehose-ingestion.md) for instructions on migrating from firehose ingestion to using native batch ingestion input sources.
+ Firehose ingestion has been removed in Druid 26.0. See [Migrate from firehose to input source ingestion](../operations/migrate-from-firehose-ingestion.md) for instructions on migrating from firehose ingestion to using native batch ingestion input sources.
:::
-
-There are several firehoses readily available in Druid, some are meant for examples, others can be used directly in a production environment.
-
-## StaticS3Firehose
-
-You need to include the [`druid-s3-extensions`](../development/extensions-core/s3.md) as an extension to use the StaticS3Firehose.
-
-This firehose ingests events from a predefined list of S3 objects.
-This firehose is _splittable_ and can be used by the [Parallel task](./native-batch.md).
-Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object.
-
-Sample spec:
-
-```json
-"firehose" : {
- "type" : "static-s3",
- "uris": ["s3://foo/bar/file.gz", "s3://bar/foo/file2.gz"]
-}
-```
-
-This firehose provides caching and prefetching features. In the Simple task, a firehose can be read twice if intervals or
-shardSpecs are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scan of objects is slow.
-Note that prefetching or caching isn't that useful in the Parallel task.
-
-|property|description|default|required?|
-|--------|-----------|-------|---------|
-|type|This should be `static-s3`.|None|yes|
-|uris|JSON array of URIs where s3 files to be ingested are located.|None|`uris` or `prefixes` must be set|
-|prefixes|JSON array of URI prefixes for the locations of s3 files to be ingested.|None|`uris` or `prefixes` must be set|
-|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|no|
-|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|no|
-|prefetchTriggerBytes|Threshold to trigger prefetching s3 objects.|maxFetchCapacityBytes / 2|no|
-|fetchTimeout|Timeout for fetching an s3 object.|60000|no|
-|maxFetchRetry|Maximum retry for fetching an s3 object.|3|no|
-
-## StaticGoogleBlobStoreFirehose
-
-You need to include the [`druid-google-extensions`](../development/extensions-core/google.md) as an extension to use the StaticGoogleBlobStoreFirehose.
-
-This firehose ingests events, similar to the StaticS3Firehose, but from an Google Cloud Store.
-
-As with the S3 blobstore, it is assumed to be gzipped if the extension ends in .gz
-
-This firehose is _splittable_ and can be used by the [Parallel task](./native-batch.md).
-Since each split represents an object in this firehose, each worker task of `index_parallel` will read an object.
-
-Sample spec:
-
-```json
-"firehose" : {
- "type" : "static-google-blobstore",
- "blobs": [
- {
- "bucket": "foo",
- "path": "/path/to/your/file.json"
- },
- {
- "bucket": "bar",
- "path": "/another/path.json"
- }
- ]
-}
-```
-
-This firehose provides caching and prefetching features. In the Simple task, a firehose can be read twice if intervals or
-shardSpecs are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scan of objects is slow.
-Note that prefetching or caching isn't that useful in the Parallel task.
-
-|property|description|default|required?|
-|--------|-----------|-------|---------|
-|type|This should be `static-google-blobstore`.|None|yes|
-|blobs|JSON array of Google Blobs.|None|yes|
-|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|no|
-|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|no|
-|prefetchTriggerBytes|Threshold to trigger prefetching Google Blobs.|maxFetchCapacityBytes / 2|no|
-|fetchTimeout|Timeout for fetching a Google Blob.|60000|no|
-|maxFetchRetry|Maximum retry for fetching a Google Blob.|3|no|
-
-Google Blobs:
-
-|property|description|default|required?|
-|--------|-----------|-------|---------|
-|bucket|Name of the Google Cloud bucket|None|yes|
-|path|The path where data is located.|None|yes|
-
-## HDFSFirehose
-
-You need to include the [`druid-hdfs-storage`](../development/extensions-core/hdfs.md) as an extension to use the HDFSFirehose.
-
-This firehose ingests events from a predefined list of files from the HDFS storage.
-This firehose is _splittable_ and can be used by the [Parallel task](./native-batch.md).
-Since each split represents an HDFS file, each worker task of `index_parallel` will read files.
-
-Sample spec:
-
-```json
-"firehose" : {
- "type" : "hdfs",
- "paths": "/foo/bar,/foo/baz"
-}
-```
-
-This firehose provides caching and prefetching features. During native batch indexing, a firehose can be read twice if
-`intervals` are not specified, and, in this case, caching can be useful. Prefetching is preferred when direct scanning
-of files is slow.
-Note that prefetching or caching isn't that useful in the Parallel task.
-
-|Property|Description|Default|
-|--------|-----------|-------|
-|type|This should be `hdfs`.|none (required)|
-|paths|HDFS paths. Can be either a JSON array or comma-separated string of paths. Wildcards like `*` are supported in these paths.|none (required)|
-|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|
-|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|
-|prefetchTriggerBytes|Threshold to trigger prefetching files.|maxFetchCapacityBytes / 2|
-|fetchTimeout|Timeout for fetching each file.|60000|
-|maxFetchRetry|Maximum number of retries for fetching each file.|3|
-
-You can also ingest from other storage using the HDFS firehose if the HDFS client supports that storage.
-However, if you want to ingest from cloud storage, consider using the service-specific input source for your data storage.
-If you want to use a non-hdfs protocol with the HDFS firehose, you need to include the protocol you want
-in `druid.ingestion.hdfs.allowedProtocols`. See [HDFS firehose security configuration](../configuration/index.md#hdfs-input-source) for more details.
-
-## LocalFirehose
-
-This Firehose can be used to read the data from files on local disk, and is mainly intended for proof-of-concept testing, and works with `string` typed parsers.
-This Firehose is _splittable_ and can be used by [native parallel index tasks](native-batch.md).
-Since each split represents a file in this Firehose, each worker task of `index_parallel` will read a file.
-A sample local Firehose spec is shown below:
-
-```json
-{
- "type": "local",
- "filter" : "*.csv",
- "baseDir": "/data/directory"
-}
-```
-
-|property|description|required?|
-|--------|-----------|---------|
-|type|This should be "local".|yes|
-|filter|A wildcard filter for files. See [here](http://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/filefilter/WildcardFileFilter) for more information.|yes|
-|baseDir|directory to search recursively for files to be ingested. |yes|
-
-
-
-## HttpFirehose
-
-This Firehose can be used to read the data from remote sites via HTTP, and works with `string` typed parsers.
-This Firehose is _splittable_ and can be used by [native parallel index tasks](native-batch.md).
-Since each split represents a file in this Firehose, each worker task of `index_parallel` will read a file.
-A sample HTTP Firehose spec is shown below:
-
-```json
-{
- "type": "http",
- "uris": ["http://example.com/uri1", "http://example2.com/uri2"]
-}
-```
-
-You can only use protocols listed in the `druid.ingestion.http.allowedProtocols` property as HTTP firehose input sources.
-The `http` and `https` protocols are allowed by default. See [HTTP firehose security configuration](../configuration/index.md#http-input-source) for more details.
-
-The below configurations can be optionally used if the URIs specified in the spec require a Basic Authentication Header.
-Omitting these fields from your spec will result in HTTP requests with no Basic Authentication Header.
-
-|property|description|default|
-|--------|-----------|-------|
-|httpAuthenticationUsername|Username to use for authentication with specified URIs|None|
-|httpAuthenticationPassword|PasswordProvider to use with specified URIs|None|
-
-Example with authentication fields using the DefaultPassword provider (this requires the password to be in the ingestion spec):
-
-```json
-{
- "type": "http",
- "uris": ["http://example.com/uri1", "http://example2.com/uri2"],
- "httpAuthenticationUsername": "username",
- "httpAuthenticationPassword": "password123"
-}
-```
-
-You can also use the other existing Druid PasswordProviders. Here is an example using the EnvironmentVariablePasswordProvider:
-
-```json
-{
- "type": "http",
- "uris": ["http://example.com/uri1", "http://example2.com/uri2"],
- "httpAuthenticationUsername": "username",
- "httpAuthenticationPassword": {
- "type": "environment",
- "variable": "HTTP_FIREHOSE_PW"
- }
-}
-```
-
-The below configurations can optionally be used for tuning the Firehose performance.
-Note that prefetching or caching isn't that useful in the Parallel task.
-
-|property|description|default|
-|--------|-----------|-------|
-|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|
-|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|
-|prefetchTriggerBytes|Threshold to trigger prefetching HTTP objects.|maxFetchCapacityBytes / 2|
-|fetchTimeout|Timeout for fetching an HTTP object.|60000|
-|maxFetchRetry|Maximum retries for fetching an HTTP object.|3|
-
-
-
-## IngestSegmentFirehose
-
-This Firehose can be used to read the data from existing druid segments, potentially using a new schema and changing the name, dimensions, metrics, rollup, etc. of the segment.
-This Firehose is _splittable_ and can be used by [native parallel index tasks](native-batch.md).
-This firehose will accept any type of parser, but will only utilize the list of dimensions and the timestamp specification.
- A sample ingest Firehose spec is shown below:
-
-```json
-{
- "type": "ingestSegment",
- "dataSource": "wikipedia",
- "interval": "2013-01-01/2013-01-02"
-}
-```
-
-|property|description|required?|
-|--------|-----------|---------|
-|type|This should be "ingestSegment".|yes|
-|dataSource|A String defining the data source to fetch rows from, very similar to a table in a relational database|yes|
-|interval|A String representing the ISO-8601 interval. This defines the time range to fetch the data over.|yes|
-|dimensions|The list of dimensions to select. If left empty, no dimensions are returned. If left null or not defined, all dimensions are returned. |no|
-|metrics|The list of metrics to select. If left empty, no metrics are returned. If left null or not defined, all metrics are selected.|no|
-|filter| See [Filters](../querying/filters.md)|no|
-|maxInputSegmentBytesPerTask|Deprecated. Use [Segments Split Hint Spec](./native-batch.md#segments-split-hint-spec) instead. When used with the native parallel index task, the maximum number of bytes of input segments to process in a single task. If a single segment is larger than this number, it will be processed by itself in a single task (input segments are never split across tasks). Defaults to 150MB.|no|
-
-
-
-## SqlFirehose
-
-This Firehose can be used to ingest events residing in an RDBMS. The database connection information is provided as part of the ingestion spec.
-For each query, the results are fetched locally and indexed.
-If there are multiple queries from which data needs to be indexed, queries are prefetched in the background, up to `maxFetchCapacityBytes` bytes.
-This Firehose is _splittable_ and can be used by [native parallel index tasks](native-batch.md).
-This firehose will accept any type of parser, but will only utilize the list of dimensions and the timestamp specification. See the extension documentation for more detailed ingestion examples.
-
-Requires one of the following extensions:
- * [MySQL Metadata Store](../development/extensions-core/mysql.md).
- * [PostgreSQL Metadata Store](../development/extensions-core/postgresql.md).
-
-
-```json
-{
- "type": "sql",
- "database": {
- "type": "mysql",
- "connectorConfig": {
- "connectURI": "jdbc:mysql://host:port/schema",
- "user": "user",
- "password": "password"
- }
- },
- "sqls": ["SELECT * FROM table1", "SELECT * FROM table2"]
-}
-```
-
-|property|description|default|required?|
-|--------|-----------|-------|---------|
-|type|This should be "sql".||Yes|
-|database|Specifies the database connection details. The database type corresponds to the extension that supplies the `connectorConfig` support. The specified extension must be loaded into Druid:<br/>[mysql-metadata-storage](../development/extensions-core/mysql.md) for `mysql`<br/>[postgresql-metadata-storage](../development/extensions-core/postgresql.md) extension for `postgresql`.<br/>You can selectively allow JDBC properties in `connectURI`. See [JDBC connections security config](../configuration/index.md#jdbc-connections-to-external-databases) for more details.||Yes|
-|maxCacheCapacityBytes|Maximum size of the cache space in bytes. 0 means disabling cache. Cached files are not removed until the ingestion task completes.|1073741824|No|
-|maxFetchCapacityBytes|Maximum size of the fetch space in bytes. 0 means disabling prefetch. Prefetched files are removed immediately once they are read.|1073741824|No|
-|prefetchTriggerBytes|Threshold to trigger prefetching SQL result objects.|maxFetchCapacityBytes / 2|No|
-|fetchTimeout|Timeout for fetching the result set.|60000|No|
-|foldCase|Toggle case folding of database column names. This may be enabled in cases where the database returns case insensitive column names in query results.|false|No|
-|sqls|List of SQL queries where each SQL query would retrieve the data to be indexed.||Yes|
-
-### Database
-
-|property|description|default|required?|
-|--------|-----------|-------|---------|
-|type|The type of database to query. Valid values are `mysql` and `postgresql`_||Yes|
-|connectorConfig|Specify the database connection properties via `connectURI`, `user` and `password`||Yes|
-
-## InlineFirehose
-
-This Firehose can be used to read the data inlined in its own spec.
-It can be used for demos or for quickly testing out parsing and schema, and works with `string` typed parsers.
-A sample inline Firehose spec is shown below:
-
-```json
-{
- "type": "inline",
- "data": "0,values,formatted\n1,as,CSV"
-}
-```
-
-|property|description|required?|
-|--------|-----------|---------|
-|type|This should be "inline".|yes|
-|data|Inlined data to ingest.|yes|
-
-## CombiningFirehose
-
-This Firehose can be used to combine and merge data from a list of different Firehoses.
-
-```json
-{
- "type": "combining",
- "delegates": [ { firehose1 }, { firehose2 }, ... ]
-}
-```
-
-|property|description|required?|
-|--------|-----------|---------|
-|type|This should be "combining"|yes|
-|delegates|List of Firehoses to combine data from|yes|
\ No newline at end of file
diff --git a/docs/operations/clean-metadata-store.md b/docs/operations/clean-metadata-store.md
index 49f2555c9d27..80e3494a53d1 100644
--- a/docs/operations/clean-metadata-store.md
+++ b/docs/operations/clean-metadata-store.md
@@ -44,7 +44,7 @@ This applies to all metadata entities in this topic except compaction configurat
You can configure the retention period for each metadata type, when available, through the record's `durationToRetain` property.
Certain records may require additional conditions be satisfied before clean up occurs.
-See the [example](#example) for how you can customize the automated metadata cleanup for a specific use case.
+See the [example](#example-configuration-for-automated-metadata-cleanup) for how you can customize the automated metadata cleanup for a specific use case.
## Automated cleanup strategies
@@ -62,13 +62,12 @@ You can configure cleanup for each entity separately, as described in this secti
Define the properties in the `coordinator/runtime.properties` file.
The cleanup of one entity may depend on the cleanup of another entity as follows:
-- You have to configure a [kill task for segment records](#kill-task) before you can configure automated cleanup for [rules](#rules-records) or [compaction configuration](#compaction-configuration-records).
+- You have to configure a [kill task for segment records](#segment-records-and-segments-in-deep-storage-kill-task) before you can configure automated cleanup for [rules](#rules-records) or [compaction configuration](#compaction-configuration-records).
- You have to schedule the metadata management tasks to run at the same or higher frequency as your most frequent cleanup job. For example, if your most frequent cleanup job is every hour, set the metadata store management period to one hour or less: `druid.coordinator.period.metadataStoreManagementPeriod=P1H`.
For details on configuration properties, see [Metadata management](../configuration/index.md#metadata-management).
-If you want to skip the details, check out the [example](#example) for configuring automated metadata cleanup.
+If you want to skip the details, check out the [example](#example-configuration-for-automated-metadata-cleanup) for configuring automated metadata cleanup.
-
### Segment records and segments in deep storage (kill task)
:::info
@@ -110,7 +109,7 @@ Supervisor cleanup uses the following configuration:
### Rules records
-Rule records become eligible for deletion when all segments for the datasource have been killed by the kill task and the `durationToRetain` time has passed since their creation. Automated cleanup for rules requires a [kill task](#kill-task).
+Rule records become eligible for deletion when all segments for the datasource have been killed by the kill task and the `durationToRetain` time has passed since their creation. Automated cleanup for rules requires a [kill task](#segment-records-and-segments-in-deep-storage-kill-task).
Rule cleanup uses the following configuration:
- `druid.coordinator.kill.rule.on`: When `true`, enables cleanup for rules records.
@@ -129,7 +128,7 @@ To prevent the configuration from being prematurely removed, wait for the dataso
Unlike other metadata records, compaction configuration records do not have a retention period set by `durationToRetain`. Druid deletes compaction configuration records at every cleanup cycle for inactive datasources, which do not have segments either used or unused.
-Compaction configuration records in the `druid_config` table become eligible for deletion after all segments for the datasource have been killed by the kill task. Automated cleanup for compaction configuration requires a [kill task](#kill-task).
+Compaction configuration records in the `druid_config` table become eligible for deletion after all segments for the datasource have been killed by the kill task. Automated cleanup for compaction configuration requires a [kill task](#segment-records-and-segments-in-deep-storage-kill-task).
Compaction configuration cleanup uses the following configuration:
- `druid.coordinator.kill.compaction.on`: When `true`, enables cleanup for compaction configuration records.
@@ -153,7 +152,7 @@ Datasource cleanup uses the following configuration:
You can configure the Overlord to periodically delete indexer task logs and associated metadata. During cleanup, the Overlord removes the following:
* Indexer task logs from deep storage.
-* Indexer task log metadata from the tasks and tasklogs tables in [metadata storage](../configuration/index.md#metadata-storage) (named `druid_tasks` and `druid_tasklogs` by default). Druid no longer uses the tasklogs table, and the table is always empty.
+* Indexer task log metadata from the tasks table in [metadata storage](../configuration/index.md#metadata-storage) (named `druid_tasks` by default).
To configure cleanup of task logs by the Overlord, set the following properties in the `overlord/runtime.properties` file.
@@ -188,7 +187,6 @@ druid.coordinator.kill.rule.on=false
druid.coordinator.kill.datasource.on=false
```
-
## Example configuration for automated metadata cleanup
Consider a scenario where you have scripts to create and delete hundreds of datasources and related entities a day. You do not want to fill your metadata store with leftover records. The datasources and related entities tend to persist for only one or two days. Therefore, you want to run a cleanup job that identifies and removes leftover records that are at least four days old after a seven day buffer period in case you want to recover the data. The exception is for audit logs, which you need to retain for 30 days:
diff --git a/docs/operations/metrics.md b/docs/operations/metrics.md
index 1d37169684e1..b5be94ba4ba7 100644
--- a/docs/operations/metrics.md
+++ b/docs/operations/metrics.md
@@ -428,15 +428,6 @@ These metrics are available only when `druid.zk.service.enabled = true`.
|`zk/connected`|Indicator of connection status. `1` for connected, `0` for disconnected. Emitted once per monitor period.|None|1|
|`zk/reconnect/time`|Amount of time, in milliseconds, that a server was disconnected from ZooKeeper before reconnecting. Emitted on reconnection. Not emitted if connection to ZooKeeper is permanently lost, because in this case, there is no reconnection.|None|Not present|
-### EventReceiverFirehose
-
-The following metric is only available if the `EventReceiverFirehoseMonitor` module is included.
-
-|Metric|Description|Dimensions|Normal value|
-|------|-----------|----------|------------|
-|`ingest/events/buffered`|Number of events queued in the `EventReceiverFirehose` buffer.|`serviceName`, `dataSource`, `taskId`, `taskType`, `bufferCapacity`|Equal to the current number of events in the buffer queue.|
-|`ingest/bytes/received`|Number of bytes received by the `EventReceiverFirehose`.|`serviceName`, `dataSource`, `taskId`, `taskType`|Varies|
-
## Sys [Deprecated]
> SysMonitor is now deprecated and will be removed in future releases.
diff --git a/docs/operations/migrate-from-firehose-ingestion.md b/docs/operations/migrate-from-firehose-ingestion.md
index f470324b7f4e..540685a717f1 100644
--- a/docs/operations/migrate-from-firehose-ingestion.md
+++ b/docs/operations/migrate-from-firehose-ingestion.md
@@ -23,9 +23,7 @@ sidebar_label: "Migrate from firehose"
~ under the License.
-->
-Apache deprecated support for Druid firehoses in version 0.17. Support for firehose ingestion will be removed in version 26.0.
-
-If you're using a firehose for batch ingestion, we strongly recommend that you follow the instructions on this page to transition to using native batch ingestion input sources as soon as possible.
+Apache deprecated support for Druid firehoses in version 0.17. Support for firehose ingestion was removed in version 26.0.
Firehose ingestion doesn't work with newer Druid versions, so you must be using an ingestion spec with a defined input source before you upgrade.
diff --git a/docs/operations/request-logging.md b/docs/operations/request-logging.md
index 3bccfd3ef9aa..6ce65c0421e2 100644
--- a/docs/operations/request-logging.md
+++ b/docs/operations/request-logging.md
@@ -217,7 +217,6 @@ The following shows an example log emitter output:
"user",
"v0"
],
- "legacy": false,
"context":
{
"populateCache": false,
diff --git a/docs/operations/security-overview.md b/docs/operations/security-overview.md
index 279a1327b97d..96389c8a72c7 100644
--- a/docs/operations/security-overview.md
+++ b/docs/operations/security-overview.md
@@ -52,7 +52,7 @@ The following recommendations apply to the network where Druid runs:
The following recommendation applies to Druid's authorization and authentication model:
* Only grant `WRITE` permissions to any `DATASOURCE` to trusted users. Druid's trust model assumes those users have the same privileges as the operating system user that runs the web console process. Additionally, users with `WRITE` permissions can make changes to datasources and they have access to both task and supervisor update (POST) APIs which may affect ingestion.
* Only grant `STATE READ`, `STATE WRITE`, `CONFIG WRITE`, and `DATASOURCE WRITE` permissions to highly-trusted users. These permissions allow users to access resources on behalf of the Druid server process regardless of the datasource.
-* If your Druid client application allows less-trusted users to control the input source or firehose of an ingestion task, validate the URLs from the users. It is possible to point unchecked URLs to other locations and resources within your network or local file system.
+* If your Druid client application allows less-trusted users to control the input source of an ingestion task, validate the URLs from the users. It is possible to point unchecked URLs to other locations and resources within your network or local file system.
## Enable TLS
diff --git a/docs/querying/lookups.md b/docs/querying/lookups.md
index a22fbf03928c..05176f229b1f 100644
--- a/docs/querying/lookups.md
+++ b/docs/querying/lookups.md
@@ -50,6 +50,10 @@ Other lookup types are available as extensions, including:
- Globally cached lookups from local files, remote URIs, or JDBC through [lookups-cached-global](./lookups-cached-global.md).
- Globally cached lookups from a Kafka topic through [kafka-extraction-namespace](./kafka-extraction-namespace.md).
+:::info
+[Multi-value dimensions](multi-value-dimensions.md) (MVDs) are not supported as keys in lookups. For example, to map the MVD `["A", "B", "C"]` to the value `x` in your lookup, flatten the MVD and map each element of the MVD to the value. Your lookup will have separate key-value pairs for each element of the MVD: `"A": "x"`, `"B": "x"`, and `"C": "x"`.
+:::
+
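+For illustration, a simple `map` lookup flattened in this way might look like the following sketch; the lookup still needs to be registered through your usual lookup configuration:
+
+```json
+{
+  "type": "map",
+  "map": {
+    "A": "x",
+    "B": "x",
+    "C": "x"
+  }
+}
+```
+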
Query Syntax
------------
diff --git a/docs/querying/scan-query.md b/docs/querying/scan-query.md
index d758450715ec..503664633ba3 100644
--- a/docs/querying/scan-query.md
+++ b/docs/querying/scan-query.md
@@ -66,7 +66,6 @@ The following are the main parameters for Scan queries:
|limit|How many rows to return. If not specified, all rows will be returned.|no|
|offset|Skip this many rows when returning results. Skipped rows will still need to be generated internally and then discarded, meaning that raising offsets to high values can cause queries to use additional resources.<br/><br/>Together, "limit" and "offset" can be used to implement pagination. However, note that if the underlying datasource is modified in between page fetches in ways that affect overall query results, then the different pages will not necessarily align with each other.|no|
|order|The ordering of returned rows based on timestamp. "ascending", "descending", and "none" (default) are supported. Currently, "ascending" and "descending" are only supported for queries where the `__time` column is included in the `columns` field and the requirements outlined in the [time ordering](#time-ordering) section are met.|none|
-|legacy|Return results consistent with the legacy "scan-query" contrib extension. Defaults to the value set by `druid.query.scan.legacy`, which in turn defaults to false. See [Legacy mode](#legacy-mode) for details.|no|
|context|An additional JSON Object which can be used to specify certain flags (see the `query context properties` section below).|no|
## Example results
@@ -159,14 +158,14 @@ The format of the result when resultFormat equals `compactedList`:
## Time ordering
-The Scan query currently supports ordering based on timestamp for non-legacy queries. Note that using time ordering
-will yield results that do not indicate which segment rows are from (`segmentId` will show up as `null`). Furthermore,
-time ordering is only supported where the result set limit is less than `druid.query.scan.maxRowsQueuedForOrdering`
-rows **or** all segments scanned have fewer than `druid.query.scan.maxSegmentPartitionsOrderedInMemory` partitions. Also,
-time ordering is not supported for queries issued directly to historicals unless a list of segments is specified. The
-reasoning behind these limitations is that the implementation of time ordering uses two strategies that can consume too
-much heap memory if left unbounded. These strategies (listed below) are chosen on a per-Historical basis depending on
-query result set limit and the number of segments being scanned.
+The Scan query currently supports ordering based on timestamp. Note that using time ordering will yield results that
+do not indicate which segment rows are from (`segmentId` will show up as `null`). Furthermore, time ordering is only
+supported where the result set limit is less than `druid.query.scan.maxRowsQueuedForOrdering` rows **or** all segments
+scanned have fewer than `druid.query.scan.maxSegmentPartitionsOrderedInMemory` partitions. Also, time ordering is not
+supported for queries issued directly to historicals unless a list of segments is specified. The reasoning behind
+these limitations is that the implementation of time ordering uses two strategies that can consume too much heap memory
+if left unbounded. These strategies (listed below) are chosen on a per-Historical basis depending on query result set
+limit and the number of segments being scanned.
1. Priority Queue: Each segment on a Historical is opened sequentially. Every row is added to a bounded priority
queue which is ordered by timestamp. For every row above the result set limit, the row with the earliest (if descending)
@@ -187,21 +186,6 @@ configurable and can be tuned based on hardware specs and number of dimensions b
can also be overridden using the `maxRowsQueuedForOrdering` and `maxSegmentPartitionsOrderedInMemory` properties in
the query context (see the Query Context Properties section).
-## Legacy mode
-
-The Scan query supports a legacy mode designed for protocol compatibility with the former scan-query contrib extension.
-In legacy mode you can expect the following behavior changes:
-
-- The `__time` column is returned as `"timestamp"` rather than `"__time"`. This will take precedence over any other column
-you may have that is named `"timestamp"`.
-- The `__time` column is included in the list of columns even if you do not specifically ask for it.
-- Timestamps are returned as ISO8601 time strings rather than integers (milliseconds since 1970-01-01 00:00:00 UTC).
-
-Legacy mode can be triggered either by passing `"legacy" : true` in your query JSON, or by setting
-`druid.query.scan.legacy = true` on your Druid processes. If you were previously using the scan-query contrib extension,
-the best way to migrate is to activate legacy mode during a rolling upgrade, then switch it off after the upgrade
-is complete.
-
## Configuration Properties
Configuration properties:
@@ -210,7 +194,6 @@ Configuration properties:
|--------|-----------|------|-------|
|druid.query.scan.maxRowsQueuedForOrdering|The maximum number of rows returned when time ordering is used|An integer in [1, 2147483647]|100000|
|druid.query.scan.maxSegmentPartitionsOrderedInMemory|The maximum number of segments scanned per historical when time ordering is used|An integer in [1, 2147483647]|50|
-|druid.query.scan.legacy|Whether legacy mode should be turned on for Scan queries|true or false|false|
## Query context properties
@@ -228,3 +211,7 @@ Sample query context JSON object:
"maxSegmentPartitionsOrderedInMemory": 100
}
```
+
+## Legacy mode
+
+In older versions of Druid, the scan query supported a legacy mode designed for protocol compatibility with the former scan-query contrib extension, which was used prior to Druid 0.11. This mode has been removed.
diff --git a/docs/querying/sql-functions.md b/docs/querying/sql-functions.md
index 7151c23b9181..b04664ac90e3 100644
--- a/docs/querying/sql-functions.md
+++ b/docs/querying/sql-functions.md
@@ -27,18 +27,46 @@ sidebar_label: "All functions"
Apache Druid supports two query languages: Druid SQL and [native queries](querying.md).
This document describes the SQL language.
:::
+
+This page provides a reference of Apache Druid® SQL functions in alphabetical order. For more details on a function, refer to the following:
+* [Aggregation functions](sql-aggregations.md)
+* [Array functions](sql-array-functions.md)
+* [JSON functions](sql-json-functions.md)
+* [Multi-value string functions](sql-multivalue-string-functions.md)
+* [Scalar functions](sql-scalar.md)
+* [Window functions](sql-window-functions.md)
+
+The examples on this page use the following example datasources:
+* `flight-carriers` using `FlightCarrierOnTime (1 month)`
+* `taxi-trips` using `NYC Taxi cabs (3 files)`
+## ABS
-This page provides a reference of all Druid SQL functions in alphabetical order.
-Click the linked function type for documentation on a particular function.
+Calculates the absolute value of a numeric expression.
-## ABS
+* **Syntax:** `ABS(<NUMERIC>)`
+* **Function type:** Scalar, numeric
-`ABS(<NUMERIC>)`
+Example
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+The following example applies the ABS function to the `ArrDelay` column from the `flight-carriers` datasource.
-Calculates the absolute value of a numeric expression.
+```sql
+SELECT
+ "ArrDelay" AS "arrival_delay",
+ ABS("ArrDelay") AS "absolute_arrival_delay"
+FROM "flight-carriers"
+WHERE "ArrDelay" < 0
+LIMIT 1
+```
+Returns the following:
+
+| `arrival_delay` | `absolute_arrival_delay` |
+| -- | -- |
+| `-27` | `27` |
+
+
+[Learn more](sql-scalar.md#numeric-functions)
## ACOS
@@ -64,6 +92,7 @@ Returns any value of the specified expression.
Counts distinct values of a regular column or a prebuilt sketch column.
+## APPROX_COUNT_DISTINCT_BUILTIN
`APPROX_COUNT_DISTINCT_BUILTIN(expr)`
**Function type:** [Aggregation](sql-aggregations.md)
@@ -419,11 +448,29 @@ Rounds up a timestamp by a given time unit.
## CEIL (numeric)
-`CEIL(<NUMERIC>)`
+Calculates the smallest integer value greater than or equal to the numeric expression.
+* **Syntax:** `CEIL(<NUMERIC>)`
+* **Function type:** Scalar, numeric
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+Example
-Calculates the smallest integer value greater than or equal to the numeric expression.
+The following example applies the CEIL function to the `fare_amount` column from the `taxi-trips` datasource.
+
+```sql
+SELECT
+ "fare_amount" AS "fare_amount",
+ CEIL("fare_amount") AS "ceiling_fare_amount"
+FROM "taxi-trips"
+LIMIT 1
+```
+Returns the following:
+
+| `fare_amount` | `ceiling_fare_amount` |
+| -- | -- |
+| `21.25` | `22` |
+
+
+[Learn more](sql-scalar.md#numeric-functions)
## CHAR_LENGTH
@@ -697,11 +744,26 @@ Returns the value of a numeric or string expression corresponding to the earlies
## EXP
-`EXP(<NUMERIC>)`
+Calculates _e_ raised to the power of the numeric expression.
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `EXP(<NUMERIC>)`
+* **Function type:** Scalar, numeric
-Calculates _e_ raised to the power of the numeric expression.
+Example
+
+The following example calculates _e_ to the power of 1.
+
+```sql
+SELECT EXP(1) AS "exponential"
+```
+Returns the following:
+
+| `exponential` |
+| -- |
+| `2.7182818284590455` |
+
+
+[Learn more](sql-scalar.md#numeric-functions)
## EXTRACT
@@ -729,11 +791,30 @@ Rounds down a timestamp by a given time unit.
## FLOOR (numeric)
-`FLOOR(<NUMERIC>)`
+Calculates the largest integer less than or equal to the numeric expression.
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `FLOOR(<NUMERIC>)`
+* **Function type:** Scalar, numeric
+
+Example
+
+The following example applies the FLOOR function to the `fare_amount` column from the `taxi-trips` datasource.
+
+```sql
+SELECT
+ "fare_amount" AS "fare_amount",
+ FLOOR("fare_amount") AS "floor_fare_amount"
+FROM "taxi-trips"
+LIMIT 1
+```
+Returns the following:
-Calculates the largest integer value less than or equal to the numeric expression.
+| `fare_amount` | `floor_fare_amount` |
+| -- | -- |
+| `21.25` | `21` |
+
+
+[Learn more](sql-scalar.md#numeric-functions)
## GREATEST
@@ -961,19 +1042,57 @@ Returns the length of the expression in UTF-16 encoding.
## LN
-`LN(expr)`
+Calculates the natural logarithm of the numeric expression.
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `LN(<NUMERIC>)`
+* **Function type:** Scalar, numeric
-Calculates the natural logarithm of the numeric expression.
+Example
+
+The following example applies the LN function to the `max_temperature` column from the `taxi-trips` datasource.
+
+```sql
+SELECT
+ "max_temperature" AS "max_temperature",
+ LN("max_temperature") AS "natural_log_max_temp"
+FROM "taxi-trips"
+LIMIT 1
+```
+Returns the following:
+
+| `max_temperature` | `natural_log_max_temp` |
+| -- | -- |
+| `76` | `4.330733340286331` |
+
+
+[Learn more](sql-scalar.md#numeric-functions)
## LOG10
-`LOG10(expr)`
+Calculates the base-10 logarithm of the numeric expression.
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `LOG10(<NUMERIC>)`
+* **Function type:** Scalar, numeric
+
+Example
+
+The following example applies the LOG10 function to the `max_temperature` column from the `taxi-trips` datasource.
+
+```sql
+SELECT
+ "max_temperature" AS "max_temperature",
+ LOG10("max_temperature") AS "log10_max_temp"
+FROM "taxi-trips"
+LIMIT 1
+```
+Returns the following:
+
+| `max_temperature` | `log10_max_temp` |
+| -- | -- |
+| `76` | `1.8808135922807914` |
+
-Calculates the base-10 of the numeric expression.
+[Learn more](sql-scalar.md#numeric-functions)
## LOOKUP
@@ -1033,11 +1152,26 @@ Returns the minimum value of a set of values.
## MOD
-`MOD(x, y)`
+Calculates x modulo y, or the remainder of x divided by y, where x and y are numeric expressions.
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `MOD(x, y)`
+* **Function type:** Scalar, numeric
+
+Example
+
+The following calculates 78 MOD 10.
+
+```sql
+SELECT MOD(78, 10) as "modulo"
+```
+Returns the following:
+
+| `modulo` |
+| -- |
+| `8` |
+
-Calculates x modulo y, or the remainder of x divided by y.
+[Learn more](sql-scalar.md#numeric-functions)
## MV_APPEND
@@ -1217,11 +1351,26 @@ Returns the one-based index position of a substring within an expression, option
## POWER
-`POWER(expr, power)`
+Calculates a numerical expression raised to the specified power.
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `POWER(base, exponent)`
+* **Function type:** Scalar, numeric
-Calculates a numerical expression raised to the specified power.
+Example
+
+The following example raises 5 to the power of 2.
+
+```sql
+SELECT POWER(5, 2) AS "power"
+```
+Returns the following:
+
+| `power` |
+| -- |
+| `25` |
+
+
+[Learn more](sql-scalar.md#numeric-functions)
## RADIANS
@@ -1298,11 +1447,31 @@ Returns the rightmost number of characters from an expression.
## ROUND
-`ROUND(expr[, digits])`
+Calculates the rounded value for a numerical expression.
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `ROUND(expr[, digits])`
+* **Function type:** Scalar, numeric
-Calculates the rounded value for a numerical expression.
+Example
+
+The following applies the ROUND function to 0 decimal points on the `pickup_longitude` column from the `taxi-trips` datasource.
+
+```sql
+SELECT
+ "pickup_longitude" AS "pickup_longitude",
+ ROUND("pickup_longitude", 0) as "rounded_pickup_longitude"
+FROM "taxi-trips"
+WHERE "pickup_longitude" IS NOT NULL
+LIMIT 1
+```
+Returns the following:
+
+| `pickup_longitude` | `rounded_pickup_longitude` |
+| -- | -- |
+| `-73.9377670288086` | `-74` |
+
+
+[Learn more](sql-scalar.md#numeric-functions)
## ROW_NUMBER
@@ -1346,11 +1515,26 @@ Calculates the trigonometric sine of an angle expressed in radians.
## SQRT
-`SQRT(expr)`
+Calculates the square root of a numeric expression.
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `SQRT(<NUMERIC>)`
+* **Function type:** Scalar, numeric
-Calculates the square root of a numeric expression.
+Example
+
+The following example calculates the square root of 25.
+
+```sql
+SELECT SQRT(25) AS "square_root"
+```
+Returns the following:
+
+| `square_root` |
+| -- |
+| `5` |
+
+
+[Learn more](sql-scalar.md#numeric-functions)
## STDDEV
@@ -1620,20 +1804,41 @@ Trims the leading or trailing characters of an expression.
## TRUNC
-`TRUNC(expr[, digits])`
+Alias for [`TRUNCATE`](#truncate).
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `TRUNC(expr[, digits])`
+* **Function type:** Scalar, numeric
-Alias for [`TRUNCATE`](#truncate).
+[Learn more](sql-scalar.md#numeric-functions)
## TRUNCATE
-`TRUNCATE(expr[, digits])`
+Truncates a numerical expression to a specific number of decimal digits.
-**Function type:** [Scalar, numeric](sql-scalar.md#numeric-functions)
+* **Syntax:** `TRUNCATE(expr[, digits])`
+* **Function type:** Scalar, numeric
-Truncates a numerical expression to a specific number of decimal digits.
+Example
+
+The following applies the TRUNCATE function to 1 decimal place on the `pickup_longitude` column from the `taxi-trips` datasource.
+```sql
+SELECT
+ "pickup_longitude" as "pickup_longitude",
+ TRUNCATE("pickup_longitude", 1) as "truncate_pickup_longitude"
+FROM "taxi-trips"
+WHERE "pickup_longitude" IS NOT NULL
+LIMIT 1
+```
+Returns the following:
+
+| `pickup_longitude` | `truncate_pickup_longitude` |
+| -- | -- |
+| `-73.9377670288086` | `-73.9` |
+
+
+
+[Learn more](sql-scalar.md#numeric-functions)
## TRY_PARSE_JSON
@@ -1683,3 +1888,4 @@ Calculates the sample variance of a set of values.
Alias for [`VAR_SAMP`](#var_samp).
+
diff --git a/docs/querying/sql-translation.md b/docs/querying/sql-translation.md
index 056d63ece57c..e430caa8bf09 100644
--- a/docs/querying/sql-translation.md
+++ b/docs/querying/sql-translation.md
@@ -342,7 +342,6 @@ The above EXPLAIN PLAN returns the following result:
"regionName",
"v0"
],
- "legacy": false,
"context": {
"finalizeAggregations": false,
"forceExpressionVirtualColumns": true,
@@ -562,7 +561,6 @@ The above EXPLAIN PLAN query returns the following result:
"regionName",
"v0"
],
- "legacy": false,
"context": {
"finalizeAggregations": false,
"groupByEnableMultiValueUnnesting": false,
diff --git a/docs/querying/sql-window-functions.md b/docs/querying/sql-window-functions.md
index d64538779f07..7c2c3aef53e9 100644
--- a/docs/querying/sql-window-functions.md
+++ b/docs/querying/sql-window-functions.md
@@ -246,11 +246,8 @@ Druid has guardrail logic to prevent you from executing window function queries
For example:
- You cannot set expressions as bounds for window frames.
-- You cannot use two FOLLOWING expressions in the window frame. For example: `ROWS BETWEEN 2 ROWS FOLLOWING and 3 ROWS FOLLOWING`.
- You can only use RANGE frames when both endpoints are unbounded or the current row, as shown below.
-If you write a query that violates one of these conditions, Druid throws an error: "The query contains a window frame which may return incorrect results. To disregard this warning, set `windowingStrictValidation` to false in the query context."
-
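+
+For example, the following frame uses only unbounded and current-row endpoints, so it satisfies this guardrail. The query is a sketch that assumes the `wikipedia` quickstart datasource with its `channel`, `__time`, and `delta` columns:
+
+```sql
+SELECT
+  "channel",
+  "__time",
+  SUM("delta") OVER (
+    PARTITION BY "channel"
+    ORDER BY "__time"
+    -- Allowed: both endpoints are either unbounded or the current row.
+    RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+  ) AS "running_delta"
+FROM "wikipedia"
+```
+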
## Window function reference
|Function|Notes|
diff --git a/docs/release-info/migr-ansi-sql-null.md b/docs/release-info/migr-ansi-sql-null.md
new file mode 100644
index 000000000000..71c7f30ffb00
--- /dev/null
+++ b/docs/release-info/migr-ansi-sql-null.md
@@ -0,0 +1,386 @@
+---
+id: migr-ansi-sql-null
+title: "Migration guide: SQL compliant mode"
+sidebar_label: SQL compliant mode
+---
+
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+In Apache Druid 28.0.0, the default [null handling](../querying/sql-data-types.md#null-values) mode changed to be compliant with the ANSI SQL standard.
+This guide provides strategies for Druid operators who rely on legacy Druid null handling behavior in their applications to transition to SQL compliant mode.
+Legacy mode is planned for removal from Druid.
+
+## SQL compliant null handling
+
+As of Druid 28.0.0, Druid writes segments in an ANSI SQL compatible null handling mode by default.
+This means that Druid stores null values distinctly from empty strings for string dimensions and distinctly from 0 for numeric dimensions.
+
+This can impact your application behavior because the ANSI SQL standard defines any comparison to null to be unknown.
+According to this three-valued logic, a filter such as `x <> 'some value'` matches only rows where `x` is a non-null value different from `'some value'`.
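+
+The following query illustrates this behavior; it is a minimal sketch that assumes a hypothetical datasource named `my_table` with a nullable `string_example` column:
+
+```sql
+-- Rows with a null "string_example" are not returned,
+-- because null <> 'my_string' evaluates to unknown rather than true.
+SELECT *
+FROM "my_table"
+WHERE "string_example" <> 'my_string'
+```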
+
+The default Druid configurations for 28.0.0 and later that enable ANSI SQL compatible null handling mode are the following:
+
+* `druid.generic.useDefaultValueForNull=false`
+* `druid.expressions.useStrictBooleans=true`
+* `druid.generic.useThreeValueLogicForNativeFilters=true`
+
+Follow the [Null handling tutorial](../tutorials/tutorial-sql-null.md) to learn how the default null handling works in Druid.
+
+## Legacy null handling and two-valued logic
+
+Prior to Druid 28.0.0, Druid defaulted to a legacy mode which stored default values instead of nulls.
+In legacy mode, Druid created segments with the following characteristics at ingestion time:
+
+- String columns couldn't distinguish an empty string, `''`, from null.
+ Therefore, Druid treated them both as interchangeable values.
+- Numeric columns couldn't represent null valued rows.
+ Therefore, Druid stored `0` instead of `null`.
+
+The Druid configurations for the deprecated legacy mode were the following:
+
+* `druid.generic.useDefaultValueForNull=true`
+* `druid.expressions.useStrictBooleans=false`
+* `druid.generic.useThreeValueLogicForNativeFilters=false`
+
+These configurations are deprecated and scheduled for removal.
+After the configurations are removed, Druid will ignore them if they exist in your configuration files and use the default SQL compliant mode.
+
+## Migrate to SQL compliant mode
+
+If your business logic relies on the behavior of legacy mode, you have the following options to operate Druid in an ANSI SQL compatible null handling mode:
+
+- Modify incoming data to either [avoid nulls](#replace-null-values-at-ingestion-time) or [avoid empty strings](#coerce-empty-strings-to-null-at-ingestion-time) to achieve the same query behavior as legacy mode. This means modifying your ingestion SQL queries and ingestion specs to handle nulls or empty strings.
+  For example, you can replace null in a string column with an empty string, or null in a numeric column with 0.
+  Your existing queries then behave as if Druid were in legacy mode.
+ If you do not care about preserving null values, this is a good option for you.
+
+- Preserve null values and [update all of your SQL queries to be ANSI SQL compliant](#rewrite-your-queries-to-be-sql-compliant).
+ This means you can preserve the incoming data with nulls intact.
+ However, you must rewrite any affected client-side queries to be ANSI SQL compliant.
+ If you have a requirement to preserve null values, choose this option.
+
+### Replace null values at ingestion time
+
+If you don't need to preserve null values within Druid, you can use a transform at ingestion time to replace nulls with other values.
+
+Consider the following input data:
+
+```json
+{"time":"2024-01-01T00:00:00.000Z","string_example":"my_string","number_example":99}
+{"time":"2024-01-02T00:00:00.000Z","string_example":"","number_example":0}
+{"time":"2024-01-03T00:00:00.000Z","string_example":null,"number_example":null}
+```
+
+The following example illustrates how to use COALESCE and NVL at ingestion time to avoid null values in Druid:
+
+
+
+
+
+```sql
+REPLACE INTO "no_nulls_example" OVERWRITE ALL
+WITH "ext" AS (
+ SELECT *
+ FROM TABLE(
+ EXTERN(
+ '{"type":"inline","data":"{\"time\":\"2024-01-01T00:00:00.000Z\",\"string_example\":\"my_string\",\"number_example\":99}\n{\"time\":\"2024-01-02T00:00:00.000Z\",\"string_example\":\"\",\"number_example\":0}\n{\"time\":\"2024-01-03T00:00:00.000Z\",\"string_example\":null,\"number_example\":null}"}',
+ '{"type":"json"}'
+ )
+ ) EXTEND ("time" VARCHAR, "string_example" VARCHAR, "number_example" BIGINT)
+)
+SELECT
+ TIME_PARSE("time") AS "__time",
+ -- Replace any null string values with an empty string
+ COALESCE("string_example",'') AS string_example,
+ -- Replace any null numeric values with 0
+ NVL("number_example",0) AS number_example
+FROM "ext"
+PARTITIONED BY MONTH
+```
+
+
+
+
+```json
+{
+ "type": "index_parallel",
+ "spec": {
+ "ioConfig": {
+ "type": "index_parallel",
+ "inputSource": {
+ "type": "inline",
+ "data": "{\"time\":\"2024-01-01T00:00:00.000Z\",\"string_example\":\"my_string\",\"number_example\":99}\n{\"time\":\"2024-01-02T00:00:00.000Z\",\"string_example\":\"\",\"number_example\":0}\n{\"time\":\"2024-01-03T00:00:00.000Z\",\"string_example\":null,\"number_example\":null}"
+ },
+ "inputFormat": {
+ "type": "json"
+ }
+ },
+ "tuningConfig": {
+ "type": "index_parallel",
+ "partitionsSpec": {
+ "type": "dynamic"
+ }
+ },
+ "dataSchema": {
+ "dataSource": "inline_data_native",
+ "timestampSpec": {
+ "column": "time",
+ "format": "iso"
+ },
+ "dimensionsSpec": {
+ "dimensions": [
+ "string_example",
+ {
+ "type": "long",
+ "name": "number_example"
+ }
+ ]
+ },
+ "granularitySpec": {
+ "queryGranularity": "none",
+ "rollup": false,
+ "segmentGranularity": "MONTH"
+ },
+ "transformSpec": {
+ "transforms": [
+ {
+ "type": "expression",
+ "name": "string_example",
+ "expression": "COALESCE(\"string_example\",'')"
+ },
+ {
+ "type": "expression",
+ "name": "number_example",
+ "expression": "NVL(\"number_example\",0)"
+ }
+ ]
+ }
+ }
+ }
+}
+```
+
+
+
+
+Druid ingests the data with no null values as follows:
+
+| `__time` | `string_example` | `number_example`|
+| -- | -- | -- |
+| `2024-01-01T00:00:00.000Z`| `my_string`| 99 |
+| `2024-01-02T00:00:00.000Z`| `empty`| 0 |
+| `2024-01-03T00:00:00.000Z`| `empty`| 0 |
+
+### Coerce empty strings to null at ingestion time
+
+In legacy mode, Druid recognized empty strings as nulls for equality comparison.
+If your queries rely on empty strings to represent nulls, you can coerce empty strings to null at ingestion time using NULLIF.
+
+For example, consider the following sample input data:
+
+```json
+{"time":"2024-01-01T00:00:00.000Z","string_example":"my_string"}
+{"time":"2024-01-02T00:00:00.000Z","string_example":""}
+{"time":"2024-01-03T00:00:00.000Z","string_example":null}
+```
+
+In legacy mode, Druid wrote an empty string for the third record.
+Therefore the following query returned 2:
+
+```sql
+SELECT count(*)
+FROM "null_string"
+WHERE "string_example" IS NULL
+```
+
+In SQL compliant mode, Druid differentiates between empty strings and nulls, so the same query would return 1.
+The following example shows how to coerce empty strings into null to accommodate IS NULL comparisons:
+
+
+
+
+
+```sql
+REPLACE INTO "null_string" OVERWRITE ALL
+WITH "ext" AS (
+ SELECT *
+ FROM TABLE(
+ EXTERN(
+ '{"type":"inline","data":"{\"time\":\"2024-01-01T00:00:00.000Z\",\"string_example\":\"my_string\"}\n{\"time\":\"2024-01-02T00:00:00.000Z\",\"string_example\":\"\"}\n{\"time\":\"2024-01-03T00:00:00.000Z\",\"string_example\":null}"}',
+ '{"type":"json"}'
+ )
+ ) EXTEND ("time" VARCHAR, "string_example" VARCHAR)
+)
+SELECT
+ TIME_PARSE("time") AS "__time",
+ NULLIF("string_example",'') AS "string_example"
+FROM "ext"
+PARTITIONED BY MONTH
+```
+
+
+
+
+
+```json
+{
+ "type": "index_parallel",
+ "spec": {
+ "ioConfig": {
+ "type": "index_parallel",
+ "inputSource": {
+ "type": "inline",
+ "data": "{\"time\":\"2024-01-01T00:00:00.000Z\",\"string_example\":\"my_string\"}\n{\"time\":\"2024-01-02T00:00:00.000Z\",\"string_example\":\"\"}\n{\"time\":\"2024-01-03T00:00:00.000Z\",\"string_example\":null}"
+ },
+ "inputFormat": {
+ "type": "json"
+ }
+ },
+ "tuningConfig": {
+ "type": "index_parallel",
+ "partitionsSpec": {
+ "type": "dynamic"
+ }
+ },
+ "dataSchema": {
+ "dataSource": "null_string",
+ "timestampSpec": {
+ "column": "time",
+ "format": "iso"
+ },
+ "transformSpec": {
+ "transforms": [
+ {
+ "type": "expression",
+ "expression": "case_searched((\"string_example\" == ''),null,\"string_example\")",
+ "name": "string_example"
+ }
+ ]
+ },
+ "dimensionsSpec": {
+ "dimensions": [
+ "string_example"
+ ]
+ },
+ "granularitySpec": {
+ "queryGranularity": "none",
+ "rollup": false,
+ "segmentGranularity": "month"
+ }
+ }
+ }
+}
+```
+
+
+
+
+Druid ingests the data with no empty strings as follows:
+
+| `__time` | `string_example` |
+| -- | -- |
+| `2024-01-01T00:00:00.000Z`| `my_string`|
+| `2024-01-02T00:00:00.000Z`| `null`|
+| `2024-01-03T00:00:00.000Z`| `null`|
+
+Therefore `SELECT count(*) FROM "null_string" WHERE "string_example" IS NULL` returns 2.
+
+### Rewrite your queries to be SQL compliant
+
+If you want to maintain null values in your data within Druid, you can use the following ANSI SQL compliant querying strategies to achieve the same results as legacy null handling:
+
+- Modify inequality queries to include null values.
+ For example, `x <> 'some value'` becomes `(x <> 'some value' OR x IS NULL)`.
+- Use COALESCE or NVL to replace nulls with a value.
+  For example, `x + 1` becomes `NVL(x, 0) + 1`.
+
+Consider the following Druid datasource `null_example`:
+
+| `__time` | `string_example` | `number_example`|
+| -- | -- | -- |
+| `2024-01-01T00:00:00.000Z`| `my_string`| 99 |
+| `2024-01-02T00:00:00.000Z`| `empty`| 0 |
+| `2024-01-03T00:00:00.000Z`| `null`| null |
+
+Druid excludes null strings from equality comparisons. For example:
+
+```sql
+SELECT COUNT(*) AS count_example
+FROM "null_example"
+WHERE "string_example"<> 'my_string'
+```
+
+Druid returns 1 because null is considered unknown: neither equal nor unequal to the value.
+
+To count null values in the result, you can use an OR operator:
+
+```sql
+SELECT COUNT(*) AS count_example
+FROM "null_example"
+WHERE ("string_example"<> 'my_string') OR "string_example" IS NULL
+```
+
+Druid returns 2.
+To achieve the same result, you can use IS DISTINCT FROM for null-safe comparison:
+
+```sql
+SELECT COUNT(*) as count_example
+FROM "null_example"
+WHERE "string_example" IS DISTINCT FROM 'my_string'
+```
+
+Similarly, arithmetic operators on null return null. For example:
+
+```sql
+SELECT "number_example" + 1 AS additon_example
+FROM "null_example"
+```
+
+Druid returns the following because null plus any value is null under the ANSI SQL standard:
+
+| `addition_example`|
+| -- |
+| 100 |
+| 1 |
+| null |
+
+Use NVL to avoid nulls with arithmetic. For example:
+
+```sql
+SELECT NVL("number_example",0) + 1 AS additon_example
+FROM "null_example"
+```
+
+Druid returns the following:
+
+| `addition_example` |
+| -- |
+| 100 |
+| 1 |
+| 1 |
+
+## Learn more
+
+See the following topics for more information:
+ - [Null handling tutorial](../tutorials/tutorial-sql-null.md) to learn how the default null handling works in Druid.
+ - [Null values](../querying/sql-data-types.md#null-values) for a description of Druid's behavior with null values.
+ - [Handling null values](../design/segments.md#handling-null-values) for details about how Druid stores null values.
\ No newline at end of file
diff --git a/docs/release-info/migration-guide.md b/docs/release-info/migration-guide.md
index 92053b83a11c..760b691d143d 100644
--- a/docs/release-info/migration-guide.md
+++ b/docs/release-info/migration-guide.md
@@ -30,12 +30,18 @@ The guides in this section outline breaking changes introduced in Druid 25.0.0 a
## Migrate from multi-value dimensions to arrays
-Druid now supports SQL-compliant array types. Whenever possible, you should use the array type over multi-value dimensions. See [Migration guide: MVDs to arrays](migr-mvd-array.md).
+Druid now supports SQL-compliant array types. Whenever possible, you should use the array type over multi-value dimensions. See [Migration guide: MVDs to arrays](./migr-mvd-array.md).
## Migrate to front-coded dictionary encoding
-Druid encodes string columns into dictionaries for better compression. Front-coded dictionary encoding reduces storage and improves performance by optimizing for strings that share similar beginning substrings. See [Migration guide: front-coded dictionaries](migr-front-coded-dict.md) for more information.
+Druid encodes string columns into dictionaries for better compression. Front-coded dictionary encoding reduces storage and improves performance by optimizing for strings that share similar beginning substrings. See [Migration guide: front-coded dictionaries](./migr-front-coded-dict.md) for more information.
## Migrate from `maxSubqueryRows` to `maxSubqueryBytes`
-Druid allows you to set a byte-based limit on subquery size to prevent Brokers from running out of memory when handling large subqueries. The byte-based subquery limit overrides Druid's row-based subquery limit. We recommend that you move towards using byte-based limits starting in Druid 30.0.0. See [Migration guide: subquery limit](migr-subquery-limit.md) for more information.
+Druid allows you to set a byte-based limit on subquery size to prevent Brokers from running out of memory when handling large subqueries. The byte-based subquery limit overrides Druid's row-based subquery limit. We recommend that you move towards using byte-based limits starting in Druid 30.0.0. See [Migration guide: subquery limit](./migr-subquery-limit.md) for more information.
+
+## Migrate to SQL compliant null handling mode
+
+By default, the Druid [null handling](../querying/sql-data-types.md#null-values) mode is now compliant with ANSI SQL.
+This guide provides strategies for Druid operators and users who rely on the legacy Druid null handling behavior in their applications to transition to ANSI SQL compliant mode. See [Migration guide: SQL compliant mode](./migr-ansi-sql-null.md) for more information.
\ No newline at end of file
diff --git a/docs/tutorials/tutorial-sql-null.md b/docs/tutorials/tutorial-sql-null.md
index 37cf23a71522..b91e8019bc25 100644
--- a/docs/tutorials/tutorial-sql-null.md
+++ b/docs/tutorials/tutorial-sql-null.md
@@ -118,7 +118,7 @@ Druid returns the following:
|`another_value`|1|1|
|`some_value`|1|1|
-Also note that GROUP BY expressions yields distinct entries for `null` and the empty string.
+Also note that GROUP BY expressions yield distinct entries for `null` and the empty string.
### Filter for empty strings in addition to null
diff --git a/docs/tutorials/tutorial-unnest-arrays.md b/docs/tutorials/tutorial-unnest-arrays.md
index 86b5407e0e7c..e9fdeb158c1e 100644
--- a/docs/tutorials/tutorial-unnest-arrays.md
+++ b/docs/tutorials/tutorial-unnest-arrays.md
@@ -304,7 +304,6 @@ The following native Scan query returns the rows of the datasource and unnests t
"m2",
"unnest-dim3"
],
- "legacy": false,
"granularity": {
"type": "all"
},
@@ -480,7 +479,6 @@ This query joins the `nested_data` table with itself and outputs the unnested da
"m2",
"v0"
],
- "legacy": false,
"context": {
"sqlOuterLimit": 1001,
"useNativeQueryExplain": true
@@ -523,7 +521,6 @@ This query joins the `nested_data` table with itself and outputs the unnested da
"m2",
"unnest-dim3"
],
- "legacy": false,
"context": {
"sqlOuterLimit": 1001,
"useNativeQueryExplain": true
@@ -572,7 +569,6 @@ The following query returns the columns `dim45` and `m1`. The `dim45` column is
"dim45",
"m1"
],
- "legacy": false,
"granularity": {
"type": "all"
},
@@ -632,7 +628,6 @@ The following Scan query unnests the column `dim3` into `d3` and a virtual colum
"dim5",
"d45"
],
- "legacy": false,
"context": {
"queryId": "2618b9ce-6c0d-414e-b88d-16fb59b9c481",
"sqlOuterLimit": 1001,
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/common/DruidK8sConstants.java b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/common/DruidK8sConstants.java
index 6c195ed15151..568f8ed5a117 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/common/DruidK8sConstants.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/common/DruidK8sConstants.java
@@ -28,6 +28,7 @@ public class DruidK8sConstants
public static final String TASK_TYPE = "task.type";
public static final String TASK_GROUP_ID = "task.group.id";
public static final String TASK_DATASOURCE = "task.datasource";
+ public static final String TASK_JOB_TEMPLATE = "task.jobTemplate";
public static final int PORT = 8100;
public static final int TLS_PORT = 8091;
public static final int DEFAULT_CPU_MILLICORES = 1000;
@@ -42,6 +43,7 @@ public class DruidK8sConstants
public static final String DRUID_HOSTNAME_ENV = "HOSTNAME";
public static final String LABEL_KEY = "druid.k8s.peons";
public static final String DRUID_LABEL_PREFIX = "druid.";
+ public static final String BASE_TEMPLATE_NAME = "base";
public static final long MAX_ENV_VARIABLE_KBS = 130048; // 127 KB
static final Predicate IS_TRANSIENT = e -> e instanceof KubernetesResourceNotFoundException;
}
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/PodTemplateSelectStrategy.java b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/PodTemplateSelectStrategy.java
index ae7869707fca..885a4e3e59a9 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/PodTemplateSelectStrategy.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/PodTemplateSelectStrategy.java
@@ -23,6 +23,7 @@
import com.fasterxml.jackson.annotation.JsonTypeInfo;
import io.fabric8.kubernetes.api.model.PodTemplate;
import org.apache.druid.indexing.common.task.Task;
+import org.apache.druid.k8s.overlord.taskadapter.PodTemplateWithName;
import javax.validation.constraints.NotNull;
import java.util.Map;
@@ -42,7 +43,7 @@ public interface PodTemplateSelectStrategy
* allows for customized resource allocation and management tailored to the task's specific requirements.
*
* @param task The task for which the Pod template is determined.
- * @return The pod template that should be used to run the task.
+ * @return The PodTemplateWithName POJO that contains the name of the template selected and the template itself.
*/
- @NotNull PodTemplate getPodTemplateForTask(Task task, Map templates);
+ @NotNull PodTemplateWithName getPodTemplateForTask(Task task, Map templates);
}
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/SelectorBasedPodTemplateSelectStrategy.java b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/SelectorBasedPodTemplateSelectStrategy.java
index 4c2d01b5218b..bf3082a79b1e 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/SelectorBasedPodTemplateSelectStrategy.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/SelectorBasedPodTemplateSelectStrategy.java
@@ -24,6 +24,8 @@
import com.google.common.base.Preconditions;
import io.fabric8.kubernetes.api.model.PodTemplate;
import org.apache.druid.indexing.common.task.Task;
+import org.apache.druid.k8s.overlord.common.DruidK8sConstants;
+import org.apache.druid.k8s.overlord.taskadapter.PodTemplateWithName;
import java.util.List;
import java.util.Map;
@@ -53,15 +55,18 @@ public SelectorBasedPodTemplateSelectStrategy(
* @return the template if a selector matches, otherwise fallback to base template
*/
@Override
- public PodTemplate getPodTemplateForTask(Task task, Map templates)
+ public PodTemplateWithName getPodTemplateForTask(Task task, Map templates)
{
String templateKey = selectors.stream()
.filter(selector -> selector.evaluate(task))
.findFirst()
.map(Selector::getSelectionKey)
- .orElse("base");
+ .orElse(DruidK8sConstants.BASE_TEMPLATE_NAME);
- return templates.getOrDefault(templateKey, templates.get("base"));
+ if (!templates.containsKey(templateKey)) {
+ templateKey = DruidK8sConstants.BASE_TEMPLATE_NAME;
+ }
+ return new PodTemplateWithName(templateKey, templates.get(templateKey));
}
@JsonProperty
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/TaskTypePodTemplateSelectStrategy.java b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/TaskTypePodTemplateSelectStrategy.java
index b374e0b6ff40..bda7788e3c41 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/TaskTypePodTemplateSelectStrategy.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/execution/TaskTypePodTemplateSelectStrategy.java
@@ -22,6 +22,8 @@
import com.fasterxml.jackson.annotation.JsonCreator;
import io.fabric8.kubernetes.api.model.PodTemplate;
import org.apache.druid.indexing.common.task.Task;
+import org.apache.druid.k8s.overlord.common.DruidK8sConstants;
+import org.apache.druid.k8s.overlord.taskadapter.PodTemplateWithName;
import java.util.Map;
@@ -40,9 +42,10 @@ public TaskTypePodTemplateSelectStrategy()
}
@Override
- public PodTemplate getPodTemplateForTask(Task task, Map templates)
+ public PodTemplateWithName getPodTemplateForTask(Task task, Map templates)
{
- return templates.getOrDefault(task.getType(), templates.get("base"));
+ String templateKey = templates.containsKey(task.getType()) ? task.getType() : DruidK8sConstants.BASE_TEMPLATE_NAME;
+ return new PodTemplateWithName(templateKey, templates.get(templateKey));
}
@Override
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapter.java b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapter.java
index 19cc788b3eea..e8aaf1bbab35 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapter.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapter.java
@@ -138,19 +138,21 @@ public Job fromTask(Task task) throws IOException
podTemplateSelectStrategy = dynamicConfig.getPodTemplateSelectStrategy();
}
- PodTemplate podTemplate = podTemplateSelectStrategy.getPodTemplateForTask(task, templates);
+ PodTemplateWithName podTemplateWithName = podTemplateSelectStrategy.getPodTemplateForTask(task, templates);
return new JobBuilder()
.withNewMetadata()
.withName(new K8sTaskId(task).getK8sJobName())
.addToLabels(getJobLabels(taskRunnerConfig, task))
.addToAnnotations(getJobAnnotations(taskRunnerConfig, task))
+ .addToAnnotations(DruidK8sConstants.TASK_JOB_TEMPLATE, podTemplateWithName.getName())
.endMetadata()
.withNewSpec()
- .withTemplate(podTemplate.getTemplate())
+ .withTemplate(podTemplateWithName.getPodTemplate().getTemplate())
.editTemplate()
.editOrNewMetadata()
.addToAnnotations(getPodTemplateAnnotations(task))
+ .addToAnnotations(DruidK8sConstants.TASK_JOB_TEMPLATE, podTemplateWithName.getName())
.addToLabels(getPodLabels(taskRunnerConfig, task))
.endMetadata()
.editSpec()
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateWithName.java b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateWithName.java
new file mode 100644
index 000000000000..eeebd9fdb1fd
--- /dev/null
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/main/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateWithName.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.k8s.overlord.taskadapter;
+
+import io.fabric8.kubernetes.api.model.PodTemplate;
+
+import javax.annotation.Nonnull;
+import java.util.Objects;
+
+public class PodTemplateWithName
+{
+ private final String name;
+ private final PodTemplate podTemplate;
+
+ public PodTemplateWithName(String name, PodTemplate podTemplate)
+ {
+ this.name = name;
+ this.podTemplate = podTemplate;
+ }
+
+ @Nonnull
+ public String getName()
+ {
+ return name;
+ }
+
+ @Nonnull
+ public PodTemplate getPodTemplate()
+ {
+ return podTemplate;
+ }
+
+ @Override
+ public boolean equals(Object o)
+ {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ PodTemplateWithName that = (PodTemplateWithName) o;
+ return Objects.equals(name, that.name) &&
+ Objects.equals(podTemplate, that.podTemplate);
+ }
+
+ @Override
+ public int hashCode()
+ {
+ return Objects.hash(name, podTemplate);
+ }
+
+ @Override
+ public String toString()
+ {
+ return "PodTemplateWithName{" +
+ "name='" + name + '\'' +
+ ", podTemplate=" + podTemplate +
+ '}';
+ }
+}
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/K8sTestUtils.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/K8sTestUtils.java
index 04b332aac850..b3fda99b222e 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/K8sTestUtils.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/K8sTestUtils.java
@@ -79,7 +79,6 @@ public static Task getTask()
null
),
new IndexTask.IndexIOConfig(
- null,
new LocalInputSource(new File("lol"), "rofl"),
new NoopInputFormat(),
true,
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/PodTemplateWithNameTest.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/PodTemplateWithNameTest.java
new file mode 100644
index 000000000000..58259606c86b
--- /dev/null
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/common/PodTemplateWithNameTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.k8s.overlord.common;
+
+import io.fabric8.kubernetes.api.model.PodTemplate;
+import io.fabric8.kubernetes.api.model.PodTemplateBuilder;
+import org.apache.druid.k8s.overlord.taskadapter.PodTemplateWithName;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+
+public class PodTemplateWithNameTest
+{
+ @Test
+ void testEqualityToMakeCoverageHappy()
+ {
+ PodTemplateWithName podTemplateWithName = new PodTemplateWithName(
+ "name",
+ new PodTemplateBuilder().build()
+ );
+ PodTemplateWithName podTemplateWithName2 = podTemplateWithName;
+
+ assertEquals(podTemplateWithName, podTemplateWithName2);
+ assertNotEquals(podTemplateWithName, null);
+ assertNotEquals(podTemplateWithName, "string");
+ assertEquals(podTemplateWithName.hashCode(), podTemplateWithName2.hashCode());
+ }
+
+ @Test
+ void testGettersToMakeCoverageHappy()
+ {
+ String name = "name";
+ PodTemplate podTemplate = new PodTemplateBuilder().build();
+ PodTemplateWithName podTemplateWithName = new PodTemplateWithName(
+ name,
+ podTemplate
+ );
+
+ assertEquals(name, podTemplateWithName.getName());
+ assertEquals(podTemplate, podTemplateWithName.getPodTemplate());
+ }
+}
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/execution/SelectorBasedPodTemplateSelectStrategyTest.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/execution/SelectorBasedPodTemplateSelectStrategyTest.java
index a82bb0768553..81589c751283 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/execution/SelectorBasedPodTemplateSelectStrategyTest.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/execution/SelectorBasedPodTemplateSelectStrategyTest.java
@@ -27,6 +27,7 @@
import io.fabric8.kubernetes.api.model.PodTemplate;
import org.apache.druid.indexing.common.task.NoopTask;
import org.apache.druid.indexing.common.task.Task;
+import org.apache.druid.k8s.overlord.taskadapter.PodTemplateWithName;
import org.apache.druid.segment.TestHelper;
import org.junit.Assert;
import org.junit.Before;
@@ -97,7 +98,10 @@ public void testGetPodTemplate_ForTask_emptySelectorsFallbackToBaseTemplate()
List emptySelectors = Collections.emptyList();
SelectorBasedPodTemplateSelectStrategy strategy = new SelectorBasedPodTemplateSelectStrategy(emptySelectors);
Task task = NoopTask.create();
- Assert.assertEquals("base", strategy.getPodTemplateForTask(task, templates).getMetadata().getName());
+ PodTemplateWithName podTemplateWithName = strategy.getPodTemplateForTask(task, templates);
+ Assert.assertEquals("base", podTemplateWithName.getName());
+ Assert.assertEquals("base", podTemplateWithName.getPodTemplate().getMetadata().getName());
+
}
@Test
@@ -107,7 +111,9 @@ public void testGetPodTemplate_ForTask_noMatchSelectorsFallbackToBaseTemplateIfN
List selectors = Collections.singletonList(noMatchSelector);
SelectorBasedPodTemplateSelectStrategy strategy = new SelectorBasedPodTemplateSelectStrategy(selectors);
Task task = NoopTask.create();
- Assert.assertEquals("base", strategy.getPodTemplateForTask(task, templates).getMetadata().getName());
+ PodTemplateWithName podTemplateWithName = strategy.getPodTemplateForTask(task, templates);
+ Assert.assertEquals("base", podTemplateWithName.getName());
+ Assert.assertEquals("base", podTemplateWithName.getPodTemplate().getMetadata().getName());
}
@Test
@@ -124,7 +130,9 @@ public void testGetPodTemplate_ForTask_withMatchSelectors()
);
SelectorBasedPodTemplateSelectStrategy strategy = new SelectorBasedPodTemplateSelectStrategy(selectors);
Task task = NoopTask.create();
- Assert.assertEquals("match", strategy.getPodTemplateForTask(task, templates).getMetadata().getName());
+ PodTemplateWithName podTemplateWithName = strategy.getPodTemplateForTask(task, templates);
+ Assert.assertEquals("match", podTemplateWithName.getName());
+ Assert.assertEquals("match", podTemplateWithName.getPodTemplate().getMetadata().getName());
}
@Test
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/DruidPeonClientIntegrationTest.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/DruidPeonClientIntegrationTest.java
index 241b4d9fc68f..e2d97accc97f 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/DruidPeonClientIntegrationTest.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/DruidPeonClientIntegrationTest.java
@@ -19,7 +19,6 @@
package org.apache.druid.k8s.overlord.taskadapter;
-import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import io.fabric8.kubernetes.api.model.Pod;
@@ -27,7 +26,6 @@
import io.fabric8.kubernetes.api.model.batch.v1.Job;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
-import org.apache.druid.guice.FirehoseModule;
import org.apache.druid.indexing.common.TestUtils;
import org.apache.druid.indexing.common.config.TaskConfig;
import org.apache.druid.indexing.common.config.TaskConfigBuilder;
@@ -83,9 +81,6 @@ public void setup()
{
TestUtils utils = new TestUtils();
jsonMapper = utils.getTestObjectMapper();
- for (Module jacksonModule : new FirehoseModule().getJacksonModules()) {
- jsonMapper.registerModule(jacksonModule);
- }
jsonMapper.registerSubtypes(
new NamedType(ParallelIndexTuningConfig.class, "index_parallel"),
new NamedType(IndexTask.IndexTuningConfig.class, "index")
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/K8sTaskAdapterTest.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/K8sTaskAdapterTest.java
index 102565efc35a..f8e7186a0263 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/K8sTaskAdapterTest.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/K8sTaskAdapterTest.java
@@ -19,7 +19,6 @@
package org.apache.druid.k8s.overlord.taskadapter;
-import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import com.google.common.base.Joiner;
@@ -44,7 +43,6 @@
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.druid.error.DruidException;
-import org.apache.druid.guice.FirehoseModule;
import org.apache.druid.indexing.common.TestUtils;
import org.apache.druid.indexing.common.config.TaskConfig;
import org.apache.druid.indexing.common.config.TaskConfigBuilder;
@@ -101,9 +99,6 @@ public K8sTaskAdapterTest()
{
TestUtils utils = new TestUtils();
jsonMapper = utils.getTestObjectMapper();
- for (Module jacksonModule : new FirehoseModule().getJacksonModules()) {
- jsonMapper.registerModule(jacksonModule);
- }
jsonMapper.registerSubtypes(
new NamedType(ParallelIndexTuningConfig.class, "index_parallel"),
new NamedType(IndexTask.IndexTuningConfig.class, "index")
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/MultiContainerTaskAdapterTest.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/MultiContainerTaskAdapterTest.java
index 58993b9a6a00..9308835d967f 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/MultiContainerTaskAdapterTest.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/MultiContainerTaskAdapterTest.java
@@ -19,7 +19,6 @@
package org.apache.druid.k8s.overlord.taskadapter;
-import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import com.google.common.collect.ImmutableList;
@@ -28,7 +27,6 @@
import io.fabric8.kubernetes.api.model.batch.v1.Job;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.server.mock.EnableKubernetesMockClient;
-import org.apache.druid.guice.FirehoseModule;
import org.apache.druid.indexing.common.TestUtils;
import org.apache.druid.indexing.common.config.TaskConfig;
import org.apache.druid.indexing.common.config.TaskConfigBuilder;
@@ -67,9 +65,6 @@ public void setup()
{
TestUtils utils = new TestUtils();
jsonMapper = utils.getTestObjectMapper();
- for (Module jacksonModule : new FirehoseModule().getJacksonModules()) {
- jsonMapper.registerModule(jacksonModule);
- }
jsonMapper.registerSubtypes(
new NamedType(ParallelIndexTuningConfig.class, "index_parallel"),
new NamedType(IndexTask.IndexTuningConfig.class, "index")
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapterTest.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapterTest.java
index 4aad419007e8..ac2aaa705581 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapterTest.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/PodTemplateTaskAdapterTest.java
@@ -163,7 +163,7 @@ public void test_fromTask_withBasePodTemplateInRuntimeProperites() throws IOExce
Task task = new NoopTask("id", "id", "datasource", 0, 0, null);
Job actual = adapter.fromTask(task);
- Job expected = K8sTestUtils.fileToResource("expectedNoopJob.yaml", Job.class);
+ Job expected = K8sTestUtils.fileToResource("expectedNoopJobBase.yaml", Job.class);
assertJobSpecsEqual(actual, expected);
}
@@ -197,7 +197,7 @@ public void test_fromTask_withBasePodTemplateInRuntimeProperites_andTlsEnabled()
Task task = new NoopTask("id", "id", "datasource", 0, 0, null);
Job actual = adapter.fromTask(task);
- Job expected = K8sTestUtils.fileToResource("expectedNoopJobTlsEnabled.yaml", Job.class);
+ Job expected = K8sTestUtils.fileToResource("expectedNoopJobTlsEnabledBase.yaml", Job.class);
assertJobSpecsEqual(actual, expected);
}
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/SingleContainerTaskAdapterTest.java b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/SingleContainerTaskAdapterTest.java
index afc5299927fa..4cf9c3e4cbbc 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/SingleContainerTaskAdapterTest.java
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/java/org/apache/druid/k8s/overlord/taskadapter/SingleContainerTaskAdapterTest.java
@@ -19,14 +19,12 @@
package org.apache.druid.k8s.overlord.taskadapter;
-import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.api.model.batch.v1.Job;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.server.mock.EnableKubernetesMockClient;
-import org.apache.druid.guice.FirehoseModule;
import org.apache.druid.indexing.common.TestUtils;
import org.apache.druid.indexing.common.config.TaskConfig;
import org.apache.druid.indexing.common.config.TaskConfigBuilder;
@@ -66,9 +64,6 @@ public void setup()
{
TestUtils utils = new TestUtils();
jsonMapper = utils.getTestObjectMapper();
- for (Module jacksonModule : new FirehoseModule().getJacksonModules()) {
- jsonMapper.registerModule(jacksonModule);
- }
jsonMapper.registerSubtypes(
new NamedType(ParallelIndexTuningConfig.class, "index_parallel"),
new NamedType(IndexTask.IndexTuningConfig.class, "index")
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJob.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJob.yaml
index 2cef837f3972..ddae7c0567f2 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJob.yaml
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJob.yaml
@@ -8,11 +8,13 @@ metadata:
druid.task.type: "noop"
druid.task.group.id: "id"
druid.task.datasource: "datasource"
+
annotations:
task.id: "id"
task.type: "noop"
task.group.id: "id"
task.datasource: "datasource"
+ task.jobTemplate: noop
spec:
activeDeadlineSeconds: 14400
backoffLimit: 0
@@ -32,6 +34,7 @@ spec:
task.type: "noop"
task.group.id: "id"
task.datasource: "datasource"
+ task.jobTemplate: noop
spec:
containers:
- command:
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobBase.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobBase.yaml
new file mode 100644
index 000000000000..532c3dd53e82
--- /dev/null
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobBase.yaml
@@ -0,0 +1,55 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: "id-3e70afe5cd823dfc7dd308eea616426b"
+ labels:
+ druid.k8s.peons: "true"
+ druid.task.id: "id"
+ druid.task.type: "noop"
+ druid.task.group.id: "id"
+ druid.task.datasource: "datasource"
+
+ annotations:
+ task.id: "id"
+ task.type: "noop"
+ task.group.id: "id"
+ task.datasource: "datasource"
+ task.jobTemplate: base
+spec:
+ activeDeadlineSeconds: 14400
+ backoffLimit: 0
+ ttlSecondsAfterFinished: 172800
+ template:
+ metadata:
+ labels:
+ druid.k8s.peons: "true"
+ druid.task.id: "id"
+ druid.task.type: "noop"
+ druid.task.group.id: "id"
+ druid.task.datasource: "datasource"
+ annotations:
+ task: "H4sIAAAAAAAAAD2MvQ4CIRCE32VqijsTG1qLi7W+wArEbHICrmC8EN7dJf40k/lmJtNQthxgEVPKMGCvXsXgKqnm4x89FTqlKm6MBzw+YCA1nvmm8W4/TQYuxRJeBbZ17cJ3ZhvoSbzShVcu2zLOf9cS7pUl+ANlclrCzr2/AQUK0FqZAAAA"
+ tls.enabled: "false"
+ task.id: "id"
+ task.type: "noop"
+ task.group.id: "id"
+ task.datasource: "datasource"
+ task.jobTemplate: base
+ spec:
+ containers:
+ - command:
+ - sleep
+ - "3600"
+ env:
+ - name: "TASK_DIR"
+ value: "/tmp"
+ - name: "TASK_ID"
+ value: "id"
+ - name: "LOAD_BROADCAST_SEGMENTS"
+ value: "false"
+ - name: "TASK_JSON"
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.annotations['task']"
+ image: one
+ name: primary
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobLongIds.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobLongIds.yaml
index cf16c49c5db1..d6c316dcdde8 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobLongIds.yaml
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobLongIds.yaml
@@ -13,6 +13,7 @@ metadata:
task.type: "noop"
task.group.id: "api-issued_kill_wikipedia3_omjobnbc_1000-01-01T00:00:00.000Z_2023-05-14T00:00:00.000Z_2023-05-15T17:03:01.220Z"
task.datasource: "data_source"
+ task.jobTemplate: noop
spec:
activeDeadlineSeconds: 14400
backoffLimit: 0
@@ -32,6 +33,7 @@ spec:
task.type: "noop"
task.group.id: "api-issued_kill_wikipedia3_omjobnbc_1000-01-01T00:00:00.000Z_2023-05-14T00:00:00.000Z_2023-05-15T17:03:01.220Z"
task.datasource: "data_source"
+ task.jobTemplate: noop
spec:
containers:
- command:
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobNoTaskJson.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobNoTaskJson.yaml
index d72d0ef37b03..90ae99709598 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobNoTaskJson.yaml
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobNoTaskJson.yaml
@@ -13,6 +13,7 @@ metadata:
task.type: "noop"
task.group.id: "id"
task.datasource: "datasource"
+ task.jobTemplate: noop
spec:
activeDeadlineSeconds: 14400
backoffLimit: 0
@@ -31,6 +32,7 @@ spec:
task.type: "noop"
task.group.id: "id"
task.datasource: "datasource"
+ task.jobTemplate: noop
spec:
containers:
- command:
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabled.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabled.yaml
index a230ac913a60..724054454142 100644
--- a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabled.yaml
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabled.yaml
@@ -13,6 +13,7 @@ metadata:
task.type: "noop"
task.group.id: "id"
task.datasource: "datasource"
+ task.jobTemplate: noop
spec:
activeDeadlineSeconds: 14400
backoffLimit: 0
@@ -32,6 +33,7 @@ spec:
task.type: "noop"
task.group.id: "id"
task.datasource: "datasource"
+ task.jobTemplate: noop
spec:
containers:
- command:
diff --git a/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabledBase.yaml b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabledBase.yaml
new file mode 100644
index 000000000000..0e52beac9e32
--- /dev/null
+++ b/extensions-contrib/kubernetes-overlord-extensions/src/test/resources/expectedNoopJobTlsEnabledBase.yaml
@@ -0,0 +1,54 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: "id-3e70afe5cd823dfc7dd308eea616426b"
+ labels:
+ druid.k8s.peons: "true"
+ druid.task.id: "id"
+ druid.task.type: "noop"
+ druid.task.group.id: "id"
+ druid.task.datasource: "datasource"
+ annotations:
+ task.id: "id"
+ task.type: "noop"
+ task.group.id: "id"
+ task.datasource: "datasource"
+ task.jobTemplate: base
+spec:
+ activeDeadlineSeconds: 14400
+ backoffLimit: 0
+ ttlSecondsAfterFinished: 172800
+ template:
+ metadata:
+ labels:
+ druid.k8s.peons: "true"
+ druid.task.id: "id"
+ druid.task.type: "noop"
+ druid.task.group.id: "id"
+ druid.task.datasource: "datasource"
+ annotations:
+ task: "H4sIAAAAAAAAAD2MvQ4CIRCE32VqijsTG1qLi7W+wArEbHICrmC8EN7dJf40k/lmJtNQthxgEVPKMGCvXsXgKqnm4x89FTqlKm6MBzw+YCA1nvmm8W4/TQYuxRJeBbZ17cJ3ZhvoSbzShVcu2zLOf9cS7pUl+ANlclrCzr2/AQUK0FqZAAAA"
+ tls.enabled: "true"
+ task.id: "id"
+ task.type: "noop"
+ task.group.id: "id"
+ task.datasource: "datasource"
+ task.jobTemplate: base
+ spec:
+ containers:
+ - command:
+ - sleep
+ - "3600"
+ env:
+ - name: "TASK_DIR"
+ value: "/tmp"
+ - name: "TASK_ID"
+ value: "id"
+ - name: "LOAD_BROADCAST_SEGMENTS"
+ value: "false"
+ - name: "TASK_JSON"
+ valueFrom:
+ fieldRef:
+ fieldPath: "metadata.annotations['task']"
+ image: one
+ name: primary
diff --git a/extensions-contrib/materialized-view-maintenance/src/main/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpec.java b/extensions-contrib/materialized-view-maintenance/src/main/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpec.java
index 7d59cb63ea5b..01039375259e 100644
--- a/extensions-contrib/materialized-view-maintenance/src/main/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpec.java
+++ b/extensions-contrib/materialized-view-maintenance/src/main/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpec.java
@@ -47,7 +47,7 @@
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec;
-import org.apache.druid.segment.realtime.firehose.ChatHandlerProvider;
+import org.apache.druid.segment.realtime.ChatHandlerProvider;
import org.apache.druid.segment.transform.TransformSpec;
import org.apache.druid.server.security.AuthorizerMapper;
import org.apache.druid.timeline.DataSegment;
diff --git a/extensions-contrib/materialized-view-maintenance/src/test/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpecTest.java b/extensions-contrib/materialized-view-maintenance/src/test/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpecTest.java
index 14bd59871253..65e71f626e8e 100644
--- a/extensions-contrib/materialized-view-maintenance/src/test/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpecTest.java
+++ b/extensions-contrib/materialized-view-maintenance/src/test/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorSpecTest.java
@@ -40,8 +40,8 @@
import org.apache.druid.query.aggregation.LongSumAggregatorFactory;
import org.apache.druid.query.expression.LookupEnabledTestExprMacroTable;
import org.apache.druid.segment.TestHelper;
-import org.apache.druid.segment.realtime.firehose.ChatHandlerProvider;
-import org.apache.druid.segment.realtime.firehose.NoopChatHandlerProvider;
+import org.apache.druid.segment.realtime.ChatHandlerProvider;
+import org.apache.druid.segment.realtime.NoopChatHandlerProvider;
import org.apache.druid.server.security.AuthorizerMapper;
import org.easymock.EasyMock;
import org.hamcrest.CoreMatchers;
diff --git a/extensions-contrib/materialized-view-maintenance/src/test/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorTest.java b/extensions-contrib/materialized-view-maintenance/src/test/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorTest.java
index 38f15a840dc2..ff1759195869 100644
--- a/extensions-contrib/materialized-view-maintenance/src/test/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorTest.java
+++ b/extensions-contrib/materialized-view-maintenance/src/test/java/org/apache/druid/indexing/materializedview/MaterializedViewSupervisorTest.java
@@ -52,7 +52,7 @@
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.druid.segment.metadata.CentralizedDatasourceSchemaConfig;
import org.apache.druid.segment.metadata.SegmentSchemaManager;
-import org.apache.druid.segment.realtime.firehose.ChatHandlerProvider;
+import org.apache.druid.segment.realtime.ChatHandlerProvider;
import org.apache.druid.segment.transform.TransformSpec;
import org.apache.druid.server.security.AuthorizerMapper;
import org.apache.druid.timeline.DataSegment;
diff --git a/extensions-contrib/prometheus-emitter/src/main/resources/defaultMetrics.json b/extensions-contrib/prometheus-emitter/src/main/resources/defaultMetrics.json
index 7e625c9edb3d..9cb01646f209 100644
--- a/extensions-contrib/prometheus-emitter/src/main/resources/defaultMetrics.json
+++ b/extensions-contrib/prometheus-emitter/src/main/resources/defaultMetrics.json
@@ -201,9 +201,6 @@
"zk/connected" : { "dimensions" : [], "type" : "gauge", "help": "Indicator of connection status to zookeeper."},
"zk/reconnect/time" : { "dimensions" : [], "type" : "timer", "conversionFactor": 1000.0, "help": "Amount of time, in seconds, that a server was disconnected from ZooKeeper before reconnecting." },
- "ingest/events/buffered" : { "dimensions" : ["serviceName", "dataSource", "bufferCapacity"], "type" : "gauge", "help": "Number of events queued in the EventReceiverFirehose's buffer"},
- "ingest/bytes/received" : { "dimensions" : ["serviceName", "dataSource"], "type" : "gauge", "help": "Number of bytes received by the EventReceiverFirehose."},
-
"sys/swap/free" : { "dimensions" : [], "type" : "gauge", "help": "Free swap"},
"sys/swap/max" : { "dimensions" : [], "type" : "gauge", "help": "Max swap"},
"sys/swap/pageIn" : { "dimensions" : [], "type" : "gauge", "help": "Paged in swap"},
diff --git a/extensions-core/azure-extensions/pom.xml b/extensions-core/azure-extensions/pom.xml
index 88a1198b61e1..7f0f767af851 100644
--- a/extensions-core/azure-extensions/pom.xml
+++ b/extensions-core/azure-extensions/pom.xml
@@ -204,10 +204,6 @@
org.jacocojacoco-maven-plugin
-
-
- org/apache/druid/firehose/azure/**/*
- BUNDLE
diff --git a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchMergeAggregatorFactory.java b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchMergeAggregatorFactory.java
index 833df8ab1a55..7d0bb79c71e1 100644
--- a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchMergeAggregatorFactory.java
+++ b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchMergeAggregatorFactory.java
@@ -24,6 +24,7 @@
import org.apache.datasketches.hll.HllSketch;
import org.apache.datasketches.hll.TgtHllType;
import org.apache.datasketches.hll.Union;
+import org.apache.druid.error.DruidException;
import org.apache.druid.java.util.common.StringEncoding;
import org.apache.druid.query.aggregation.Aggregator;
import org.apache.druid.query.aggregation.AggregatorFactory;
@@ -34,7 +35,9 @@
import org.apache.druid.segment.ColumnInspector;
import org.apache.druid.segment.ColumnSelectorFactory;
import org.apache.druid.segment.ColumnValueSelector;
+import org.apache.druid.segment.column.ColumnCapabilities;
import org.apache.druid.segment.column.ColumnType;
+import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.vector.VectorColumnSelectorFactory;
import javax.annotation.Nullable;
@@ -107,6 +110,8 @@ protected byte getCacheTypeId()
@Override
public Aggregator factorize(final ColumnSelectorFactory columnSelectorFactory)
{
+ validateInputs(columnSelectorFactory.getColumnCapabilities(getFieldName()));
+
final ColumnValueSelector selector = columnSelectorFactory.makeColumnValueSelector(getFieldName());
return new HllSketchMergeAggregator(selector, getLgK(), TgtHllType.valueOf(getTgtHllType()));
}
@@ -115,6 +120,8 @@ public Aggregator factorize(final ColumnSelectorFactory columnSelectorFactory)
@Override
public BufferAggregator factorizeBuffered(final ColumnSelectorFactory columnSelectorFactory)
{
+ validateInputs(columnSelectorFactory.getColumnCapabilities(getFieldName()));
+
final ColumnValueSelector selector = columnSelectorFactory.makeColumnValueSelector(getFieldName());
return new HllSketchMergeBufferAggregator(
selector,
@@ -133,6 +140,7 @@ public boolean canVectorize(ColumnInspector columnInspector)
@Override
public VectorAggregator factorizeVector(VectorColumnSelectorFactory selectorFactory)
{
+ validateInputs(selectorFactory.getColumnCapabilities(getFieldName()));
return new HllSketchMergeVectorAggregator(
selectorFactory,
getFieldName(),
@@ -142,6 +150,34 @@ public VectorAggregator factorizeVector(VectorColumnSelectorFactory selectorFact
);
}
+ /**
+ * Validates whether the aggregator supports the input column type.
+ * Supported column types are the complex types HLLSketch, HLLSketchBuild, and HLLSketchMerge, as well as UNKNOWN_COMPLEX.
+ *
+ * @param capabilities capabilities of the input column, or null if the column is unknown to the selector factory
+ */
+ private void validateInputs(@Nullable ColumnCapabilities capabilities)
+ {
+ if (capabilities != null) {
+ final ColumnType type = capabilities.toColumnType();
+ boolean isSupportedComplexType = ValueType.COMPLEX.equals(type.getType()) &&
+ (
+ HllSketchModule.TYPE_NAME.equals(type.getComplexTypeName()) ||
+ HllSketchModule.BUILD_TYPE_NAME.equals(type.getComplexTypeName()) ||
+ HllSketchModule.MERGE_TYPE_NAME.equals(type.getComplexTypeName()) ||
+ type.getComplexTypeName() == null
+ );
+ if (!isSupportedComplexType) {
+ throw DruidException.forPersona(DruidException.Persona.USER)
+ .ofCategory(DruidException.Category.UNSUPPORTED)
+ .build(
+ "Using aggregator [%s] is not supported for complex columns with type [%s].",
+ getIntermediateType().getComplexTypeName(),
+ type
+ );
+ }
+ }
+ }
+
@Override
public int getMaxIntermediateSize()
{
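To make the new validation rule easier to review, the standalone sketch below restates the acceptance check performed by `validateInputs()`. The class and method names are invented for illustration; only the complex type names (those registered by `HllSketchModule`) and the `ColumnType`/`ValueType` API come from the change itself.

```java
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.ValueType;

// Illustrative only: a standalone re-statement of the acceptance rule enforced by the new
// validateInputs() above. The type names are the ones registered by HllSketchModule.
public class HllMergeInputCheckSketch
{
  static boolean isSupported(ColumnType type)
  {
    return ValueType.COMPLEX.equals(type.getType())
           && (type.getComplexTypeName() == null             // UNKNOWN_COMPLEX
               || "HLLSketch".equals(type.getComplexTypeName())
               || "HLLSketchBuild".equals(type.getComplexTypeName())
               || "HLLSketchMerge".equals(type.getComplexTypeName()));
  }

  public static void main(String[] args)
  {
    System.out.println(isSupported(ColumnType.ofComplex("HLLSketchBuild"))); // true
    System.out.println(isSupported(ColumnType.ofComplex("thetaSketch")));    // false: the factory would throw a DruidException
    System.out.println(isSupported(ColumnType.UNKNOWN_COMPLEX));             // true
    System.out.println(isSupported(ColumnType.LONG));                        // false: not a COMPLEX type at all
  }
}
```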
diff --git a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/sql/HllSketchApproxCountDistinctSqlAggregator.java b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/sql/HllSketchApproxCountDistinctSqlAggregator.java
index 757674d6aa68..7d303d272740 100644
--- a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/sql/HllSketchApproxCountDistinctSqlAggregator.java
+++ b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/sql/HllSketchApproxCountDistinctSqlAggregator.java
@@ -21,28 +21,68 @@
import org.apache.calcite.sql.SqlAggFunction;
import org.apache.calcite.sql.SqlFunctionCategory;
+import org.apache.calcite.sql.type.CastedLiteralOperandTypeCheckers;
import org.apache.calcite.sql.type.InferTypes;
+import org.apache.calcite.sql.type.OperandTypes;
+import org.apache.calcite.sql.type.SqlSingleOperandTypeChecker;
import org.apache.calcite.sql.type.SqlTypeFamily;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.druid.java.util.common.StringEncoding;
+import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.aggregation.AggregatorFactory;
+import org.apache.druid.query.aggregation.datasketches.hll.HllSketchBuildAggregatorFactory;
+import org.apache.druid.query.aggregation.datasketches.hll.HllSketchMergeAggregatorFactory;
import org.apache.druid.query.aggregation.post.FinalizingFieldAccessPostAggregator;
import org.apache.druid.sql.calcite.aggregation.Aggregation;
import org.apache.druid.sql.calcite.aggregation.SqlAggregator;
import org.apache.druid.sql.calcite.expression.OperatorConversions;
+import org.apache.druid.sql.calcite.table.RowSignatures;
import java.util.Collections;
+/**
+ * Approximate count distinct aggregator using HLL sketches.
+ * Supported column types: String, Numeric, HLLSketchMerge, HLLSketchBuild.
+ */
public class HllSketchApproxCountDistinctSqlAggregator extends HllSketchBaseSqlAggregator implements SqlAggregator
{
public static final String NAME = "APPROX_COUNT_DISTINCT_DS_HLL";
+
+ private static final SqlSingleOperandTypeChecker AGGREGATED_COLUMN_TYPE_CHECKER = OperandTypes.or(
+ OperandTypes.STRING,
+ OperandTypes.NUMERIC,
+ RowSignatures.complexTypeChecker(HllSketchMergeAggregatorFactory.TYPE),
+ RowSignatures.complexTypeChecker(HllSketchBuildAggregatorFactory.TYPE)
+ );
+
private static final SqlAggFunction FUNCTION_INSTANCE =
OperatorConversions.aggregatorBuilder(NAME)
- .operandNames("column", "lgK", "tgtHllType")
- .operandTypes(SqlTypeFamily.ANY, SqlTypeFamily.NUMERIC, SqlTypeFamily.STRING)
.operandTypeInference(InferTypes.VARCHAR_1024)
- .requiredOperandCount(1)
- .literalOperands(1, 2)
+ .operandTypeChecker(
+ OperandTypes.or(
+ // APPROX_COUNT_DISTINCT_DS_HLL(column)
+ AGGREGATED_COLUMN_TYPE_CHECKER,
+ // APPROX_COUNT_DISTINCT_DS_HLL(column, lgk)
+ OperandTypes.and(
+ OperandTypes.sequence(
+ StringUtils.format("'%s(column, lgk)'", NAME),
+ AGGREGATED_COLUMN_TYPE_CHECKER,
+ CastedLiteralOperandTypeCheckers.POSITIVE_INTEGER_LITERAL
+ ),
+ OperandTypes.family(SqlTypeFamily.ANY, SqlTypeFamily.EXACT_NUMERIC)
+ ),
+ // APPROX_COUNT_DISTINCT_DS_HLL(column, lgk, tgtHllType)
+ OperandTypes.and(
+ OperandTypes.sequence(
+ StringUtils.format("'%s(column, lgk, tgtHllType)'", NAME),
+ AGGREGATED_COLUMN_TYPE_CHECKER,
+ CastedLiteralOperandTypeCheckers.POSITIVE_INTEGER_LITERAL,
+ OperandTypes.STRING
+ ),
+ OperandTypes.family(SqlTypeFamily.ANY, SqlTypeFamily.EXACT_NUMERIC, SqlTypeFamily.STRING)
+ )
+ )
+ )
.returnTypeNonNull(SqlTypeName.BIGINT)
.functionCategory(SqlFunctionCategory.NUMERIC)
.build();
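As a quick reference for the operand type checker wired up above, the function now accepts exactly three call shapes. The snippet below is illustrative only: the table `events` and column `user_id` are made-up names, and the queries are shown as plain strings rather than a runnable Druid setup.

```java
// Hypothetical query strings showing the signatures accepted by APPROX_COUNT_DISTINCT_DS_HLL
// after this change. Per the checkers above, lgK must be a positive integer literal and
// tgtHllType a string literal; a first argument that is neither string, numeric, nor an
// HLL sketch complex column now fails validation at planning time.
public class HllApproxCountDistinctSignatures
{
  public static void main(String[] args)
  {
    String oneArg    = "SELECT APPROX_COUNT_DISTINCT_DS_HLL(user_id) FROM events";
    String twoArgs   = "SELECT APPROX_COUNT_DISTINCT_DS_HLL(user_id, 12) FROM events";
    String threeArgs = "SELECT APPROX_COUNT_DISTINCT_DS_HLL(user_id, 12, 'HLL_8') FROM events";
    System.out.println(String.join(System.lineSeparator(), oneArg, twoArgs, threeArgs));
  }
}
```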
diff --git a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/sql/HllSketchBaseSqlAggregator.java b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/sql/HllSketchBaseSqlAggregator.java
index d221b72ac1c6..15221c0f6f81 100644
--- a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/sql/HllSketchBaseSqlAggregator.java
+++ b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/sql/HllSketchBaseSqlAggregator.java
@@ -31,6 +31,7 @@
import org.apache.druid.query.aggregation.datasketches.hll.HllSketchAggregatorFactory;
import org.apache.druid.query.aggregation.datasketches.hll.HllSketchBuildAggregatorFactory;
import org.apache.druid.query.aggregation.datasketches.hll.HllSketchMergeAggregatorFactory;
+import org.apache.druid.query.aggregation.datasketches.hll.HllSketchModule;
import org.apache.druid.query.dimension.DefaultDimensionSpec;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.segment.column.ColumnType;
@@ -40,6 +41,7 @@
import org.apache.druid.sql.calcite.expression.DruidExpression;
import org.apache.druid.sql.calcite.expression.Expressions;
import org.apache.druid.sql.calcite.planner.Calcites;
+import org.apache.druid.sql.calcite.planner.PlannerConfig;
import org.apache.druid.sql.calcite.planner.PlannerContext;
import org.apache.druid.sql.calcite.rel.InputAccessor;
import org.apache.druid.sql.calcite.rel.VirtualColumnRegistry;
@@ -115,7 +117,7 @@ public Aggregation toDruidAggregation(
if (columnArg.isDirectColumnAccess()
&& inputAccessor.getInputRowSignature()
.getColumnType(columnArg.getDirectColumn())
- .map(type -> type.is(ValueType.COMPLEX))
+ .map(this::isValidComplexInputType)
.orElse(false)) {
aggregatorFactory = new HllSketchMergeAggregatorFactory(
aggregatorName,
@@ -154,6 +156,15 @@ public Aggregation toDruidAggregation(
}
if (inputType.is(ValueType.COMPLEX)) {
+ if (!isValidComplexInputType(inputType)) {
+ plannerContext.setPlanningError(
+ "Using APPROX_COUNT_DISTINCT() or enabling approximation with COUNT(DISTINCT) is not supported for"
+ + " column type [%s]. You can disable approximation by setting [%s: false] in the query context.",
+ columnArg.getDruidType(),
+ PlannerConfig.CTX_KEY_USE_APPROXIMATE_COUNT_DISTINCT
+ );
+ return null;
+ }
aggregatorFactory = new HllSketchMergeAggregatorFactory(
aggregatorName,
dimensionSpec.getOutputName(),
@@ -192,4 +203,11 @@ protected abstract Aggregation toAggregation(
boolean finalizeAggregations,
AggregatorFactory aggregatorFactory
);
+
+ private boolean isValidComplexInputType(ColumnType columnType)
+ {
+ return HllSketchMergeAggregatorFactory.TYPE.equals(columnType) ||
+ HllSketchModule.TYPE_NAME.equals(columnType.getComplexTypeName()) ||
+ HllSketchModule.BUILD_TYPE_NAME.equals(columnType.getComplexTypeName());
+ }
}
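For context on the new planning error above: the suggested workaround is the existing `useApproximateCountDistinct` context flag (the value of `PlannerConfig.CTX_KEY_USE_APPROXIMATE_COUNT_DISTINCT`). A minimal sketch of setting it, with an invented query string, might look like this:

```java
import java.util.Map;

// Minimal sketch: a query context that disables approximate COUNT(DISTINCT), as suggested by
// the planning error message added above. The SQL string and column name are illustrative only.
public class DisableApproxCountDistinctExample
{
  public static void main(String[] args)
  {
    Map<String, Object> context = Map.of("useApproximateCountDistinct", false);
    String sql = "SELECT COUNT(DISTINCT unsupported_complex_col) FROM events";
    System.out.println("context=" + context + " sql=" + sql);
  }
}
```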
diff --git a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/theta/SketchAggregatorFactory.java b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/theta/SketchAggregatorFactory.java
index 211373e873b0..b24e382ec0aa 100644
--- a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/theta/SketchAggregatorFactory.java
+++ b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/theta/SketchAggregatorFactory.java
@@ -27,7 +27,7 @@
import org.apache.datasketches.theta.SetOperation;
import org.apache.datasketches.theta.Union;
import org.apache.datasketches.thetacommon.ThetaUtil;
-import org.apache.druid.error.InvalidInput;
+import org.apache.druid.error.DruidException;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.aggregation.AggregateCombiner;
import org.apache.druid.query.aggregation.Aggregator;
@@ -41,6 +41,7 @@
import org.apache.druid.segment.ColumnSelectorFactory;
import org.apache.druid.segment.ColumnValueSelector;
import org.apache.druid.segment.column.ColumnCapabilities;
+import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.vector.VectorColumnSelectorFactory;
import javax.annotation.Nullable;
@@ -80,10 +81,7 @@ public SketchAggregatorFactory(String name, String fieldName, Integer size, byte
@Override
public Aggregator factorize(ColumnSelectorFactory metricFactory)
{
- ColumnCapabilities capabilities = metricFactory.getColumnCapabilities(fieldName);
- if (capabilities != null && capabilities.isArray()) {
- throw InvalidInput.exception("ARRAY types are not supported for theta sketch");
- }
+ validateInputs(metricFactory.getColumnCapabilities(fieldName));
BaseObjectColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName);
return new SketchAggregator(selector, size);
}
@@ -91,10 +89,7 @@ public Aggregator factorize(ColumnSelectorFactory metricFactory)
@Override
public AggregatorAndSize factorizeWithSize(ColumnSelectorFactory metricFactory)
{
- ColumnCapabilities capabilities = metricFactory.getColumnCapabilities(fieldName);
- if (capabilities != null && capabilities.isArray()) {
- throw InvalidInput.exception("ARRAY types are not supported for theta sketch");
- }
+ validateInputs(metricFactory.getColumnCapabilities(fieldName));
BaseObjectColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName);
final SketchAggregator aggregator = new SketchAggregator(selector, size);
return new AggregatorAndSize(aggregator, aggregator.getInitialSizeBytes());
@@ -104,10 +99,7 @@ public AggregatorAndSize factorizeWithSize(ColumnSelectorFactory metricFactory)
@Override
public BufferAggregator factorizeBuffered(ColumnSelectorFactory metricFactory)
{
- ColumnCapabilities capabilities = metricFactory.getColumnCapabilities(fieldName);
- if (capabilities != null && capabilities.isArray()) {
- throw InvalidInput.exception("ARRAY types are not supported for theta sketch");
- }
+ validateInputs(metricFactory.getColumnCapabilities(fieldName));
BaseObjectColumnValueSelector selector = metricFactory.makeColumnValueSelector(fieldName);
return new SketchBufferAggregator(selector, size, getMaxIntermediateSizeWithNulls());
}
@@ -115,9 +107,41 @@ public BufferAggregator factorizeBuffered(ColumnSelectorFactory metricFactory)
@Override
public VectorAggregator factorizeVector(VectorColumnSelectorFactory selectorFactory)
{
+ validateInputs(selectorFactory.getColumnCapabilities(fieldName));
return new SketchVectorAggregator(selectorFactory, fieldName, size, getMaxIntermediateSizeWithNulls());
}
+ /**
+ * Validates whether the aggregator supports the input column type.
+ * Unsupported column types are:
+ *