Merge remote-tracking branch 'apache/master' into windowing-fixes-turned-to-null
kgyrtkirk committed Oct 12, 2023
2 parents ed4187d + 61ea9e0 commit 15dba6f
Showing 182 changed files with 5,803 additions and 1,685 deletions.
648 changes: 619 additions & 29 deletions docs/api-reference/dynamic-configuration-api.md

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions docs/api-reference/sql-api.md
@@ -367,8 +367,6 @@ The following table shows examples of how Druid returns the column names and dat
 
 ## Query from deep storage
 
-> Query from deep storage is an [experimental feature](../development/experimental.md).
-
 You can use the `sql/statements` endpoint to query segments that exist only in deep storage and are not loaded onto your Historical processes, as determined by your load rules.
 
 Note that at least one segment of a datasource must be available on a Historical process so that the Broker can plan your query. A quick way to check is whether the datasource is visible in the Druid console.
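
As an editorial illustration (not part of this diff): querying deep storage means an HTTP POST to the `sql/statements` endpoint (path `/druid/v2/sql/statements` assumed from the Druid API reference) with a hypothetical datasource and query such as:

```json
{
  "query": "SELECT channel, COUNT(*) AS events FROM wikipedia GROUP BY channel"
}
```

The response carries a query ID that you poll for completion and then page through for results, which is how segments living only in deep storage can be read without loading them onto Historical processes.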
3 changes: 2 additions & 1 deletion docs/configuration/index.md
@@ -798,8 +798,9 @@ Support for 64-bit floating point columns was released in Druid 0.11.0, so if yo
 Prior to version 0.13.0, Druid string columns treated `''` and `null` values as interchangeable, and numeric columns were unable to represent `null` values, coercing `null` to `0`. Druid 0.13.0 introduced a mode that enables SQL compatible null handling, allowing string columns to distinguish empty strings from nulls, and numeric columns to contain null rows.
 
 |Property|Description|Default|
-|---|---|---|
+|--------|-----------|-------|
 |`druid.generic.useDefaultValueForNull`|Set to `false` to store and query data in SQL compatible mode. When set to `true` (legacy mode), `null` values are stored as `''` for string columns and `0` for numeric columns.|`false`|
+|`druid.generic.useThreeValueLogicForNativeFilters`|Set to `true` to use SQL compatible three-value logic when processing native Druid filters, when `druid.generic.useDefaultValueForNull=false` and `druid.expressions.useStrictBooleans=true`. When set to `false`, Druid uses two-value logic for filter processing even when those two properties are set. See [boolean handling](../querying/sql-data-types.md#boolean-logic) for more details.|`true`|
 |`druid.generic.ignoreNullsForStringCardinality`|When set to `true`, `null` values are ignored by the built-in cardinality aggregator over string columns. Set to `false` to include `null` values when estimating cardinality of string columns with the built-in cardinality aggregator. This setting takes effect only when `druid.generic.useDefaultValueForNull` is set to `true` and is ignored in SQL compatibility mode. Additionally, empty strings (equivalent to null) are not counted when this is set to `true`.|`false`|
 This mode does have a storage size and query performance cost; see the [segment documentation](../design/segments.md#handling-null-values) for more details.
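
As a sketch assembled from the table above (editorial, not part of the diff), a `common.runtime.properties` fragment stating the SQL-compatible defaults explicitly would be:

```properties
# SQL-compatible null handling: NULL is distinct from '' and 0
druid.generic.useDefaultValueForNull=false
# Three-valued logic for native filters (effective with the other two settings)
druid.generic.useThreeValueLogicForNativeFilters=true
# Strict boolean expressions: booleans are 1/0, NULL propagates
druid.expressions.useStrictBooleans=true
```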

2 changes: 1 addition & 1 deletion docs/multi-stage-query/reference.md
@@ -248,7 +248,7 @@ The following table lists the context parameters for the MSQ task engine:
 | `selectDestination` | SELECT<br /><br /> Controls where the final result of the select query is written. <br />Use `taskReport` (the default) to write select results to the task report. <b>This is not scalable, since the task report grows very large for large results.</b> <br/>Use `durableStorage` to write results to a durable storage location. <b>For large result sets, it's recommended to use `durableStorage`.</b> To configure durable storage, see the [durable storage](#durable-storage) section. | `taskReport` |
 | `waitTillSegmentsLoad` | INSERT, REPLACE<br /><br /> If set to `true`, the ingest query waits for the generated segments to be loaded before exiting; otherwise it exits without waiting. The task and live reports contain information about the status of segment loading if this flag is set. This ensures that any queries made after the ingestion exits include results from the ingestion. The drawback is that the controller task stalls until the segments are loaded. | `false` |
 | `includeSegmentSource` | SELECT, INSERT, REPLACE<br /><br /> Controls the sources that are queried for results, in addition to the segments present on deep storage. Can be `NONE` or `REALTIME`. If this value is `NONE`, only non-realtime (published and used) segments are downloaded from deep storage. If this value is `REALTIME`, results also include data from realtime tasks. | `NONE` |
-
+| `rowsPerPage` | SELECT<br /><br />The number of rows per page to target. The actual number of rows per page may be somewhat higher or lower than this number. In most cases, use the default.<br /> This property takes effect only when `selectDestination` is set to `durableStorage`. | 100000 |
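
For illustration (editorial, not part of this commit), these parameters travel in the query's `context` object; a hypothetical SELECT aimed at durable storage with a smaller page size could be submitted as:

```json
{
  "query": "SELECT * FROM wikipedia",
  "context": {
    "selectDestination": "durableStorage",
    "rowsPerPage": 50000
  }
}
```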
## Joins

Joins in multi-stage queries use one of two algorithms based on what you set the [context parameter](#context-parameters) `sqlJoinAlgorithm` to:
2 changes: 2 additions & 0 deletions docs/querying/filters.md
@@ -33,6 +33,8 @@ sidebar_label: "Filters"
 A filter is a JSON object indicating which rows of data should be included in the computation for a query. It’s essentially the equivalent of the WHERE clause in SQL.
 Filters are commonly applied on dimensions, but can be applied on aggregated metrics; for example, see [Filtered aggregator](./aggregations.md#filtered-aggregator) and [Having filters](./having.md).
 
+By default, Druid uses SQL compatible three-value logic when filtering. See [Boolean logic](./sql-data-types.md#boolean-logic) for more details.
+
 Apache Druid supports the following types of filters.
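
An editorial aside on the note just added: three-value logic is most visible under negation. With a hypothetical `status` dimension, the `not` filter below matches neither rows where `status` is `'active'` nor rows where it is NULL, because comparing NULL to `'active'` yields UNKNOWN:

```json
{
  "type": "not",
  "field": {
    "type": "selector",
    "dimension": "status",
    "value": "active"
  }
}
```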

## Selector filter
2 changes: 0 additions & 2 deletions docs/querying/query-from-deep-storage.md
@@ -22,8 +22,6 @@ title: "Query from deep storage"
 ~ under the License.
 -->
 
-> Query from deep storage is an [experimental feature](../development/experimental.md).
-
 Druid can query segments that are only stored in deep storage. Running a query from deep storage is slower than running queries against segments loaded on Historical processes, but it's a great tool for data that you access infrequently or for which the low-latency results of typical Druid queries are not necessary. Queries from deep storage can increase the surface area of data available to query without requiring you to scale your Historical processes to accommodate more segments.

## Keep segments in deep storage only
12 changes: 6 additions & 6 deletions docs/querying/sql-data-types.md
@@ -152,14 +152,14 @@ values are treated as zeroes. This was the default prior to Druid 28.0.0.
 
 ## Boolean logic
 
-The [`druid.expressions.useStrictBooleans`](../configuration/index.md#expression-processing-configurations)
-runtime property controls Druid's boolean logic mode. For the most SQL compliant behavior, set this to `true` (the default).
+By default, Druid uses [SQL three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic#SQL) for filter processing
+and boolean expression evaluation. This behavior relies on three settings:
 
-When `druid.expressions.useStrictBooleans = true`, Druid uses three-valued logic for
-[expressions](math-expr.md) evaluation, such as `expression` virtual columns or `expression` filters.
-However, even in this mode, Druid uses two-valued logic for filter types other than `expression`.
+* [`druid.generic.useDefaultValueForNull`](../configuration/index.md#sql-compatible-null-handling) must be set to `false` (the default). This runtime property allows NULL values to exist in numeric columns and expressions, and allows string columns to distinguish between NULL and the empty string.
+* [`druid.expressions.useStrictBooleans`](../configuration/index.md#expression-processing-configurations) must be set to `true` (the default). This runtime property controls Druid's boolean logic mode for expressions and coerces all expression boolean values to be represented as `1` for true and `0` for false.
+* [`druid.generic.useThreeValueLogicForNativeFilters`](../configuration/index.md#sql-compatible-null-handling) must be set to `true` (the default). This runtime property decouples three-valued logic handling from `druid.generic.useDefaultValueForNull` and `druid.expressions.useStrictBooleans`, for backwards compatibility with older versions of Druid that did not fully support SQL compatible null handling.
 
-When `druid.expressions.useStrictBooleans = false` (legacy mode), Druid uses two-valued logic.
+If any of these settings is configured with a non-default value, Druid uses two-valued logic for non-expression filters. Expression-based filters are controlled independently by `druid.expressions.useStrictBooleans`; when it is set to `false`, Druid uses two-valued logic for expressions.
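
A quick editorial example of the difference, using a hypothetical table and column: under three-valued logic the filter below excludes rows where `tags` is NULL, because both comparisons evaluate to UNKNOWN; under legacy two-valued behavior those rows would match the `<>` branch.

```sql
-- Rows with tags = NULL satisfy neither predicate under three-valued logic
SELECT COUNT(*) FROM my_table
WHERE tags = 'a' OR tags <> 'a';
```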

## Nested columns

3 changes: 0 additions & 3 deletions docs/tutorials/tutorial-query-deep-storage.md
@@ -23,9 +23,6 @@ sidebar_label: "Query from deep storage"
 ~ under the License.
 -->
 
-
-> Query from deep storage is an [experimental feature](../development/experimental.md).
-
 Query from deep storage allows you to query segments that are stored only in deep storage, which lowers costs compared to loading everything onto Historical processes. The tradeoff is that queries from deep storage may take longer to complete.
 
 This tutorial walks you through loading example data, configuring load rules so that not all the segments get loaded onto Historical processes, and querying data from deep storage.
@@ -19,7 +19,7 @@
 
 package org.apache.druid.segment;
 
-import com.google.common.base.Predicate;
+import org.apache.druid.query.filter.DruidPredicateFactory;
 import org.apache.druid.query.filter.ValueMatcher;
 import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
 import org.apache.druid.segment.data.IndexedInts;
@@ -55,7 +55,7 @@ public ValueMatcher makeValueMatcher(@Nullable String value)
     return new ValueMatcher()
     {
       @Override
-      public boolean matches()
+      public boolean matches(boolean includeUnknown)
       {
         // Map column doesn't match any string
         return false;
@@ -70,12 +70,12 @@ public void inspectRuntimeShape(RuntimeShapeInspector inspector)
   }
 
   @Override
-  public ValueMatcher makeValueMatcher(Predicate<String> predicate)
+  public ValueMatcher makeValueMatcher(DruidPredicateFactory predicateFactory)
   {
     return new ValueMatcher()
     {
       @Override
-      public boolean matches()
+      public boolean matches(boolean includeUnknown)
       {
         return false;
       }
@@ -21,6 +21,7 @@
 
 import com.google.common.base.Preconditions;
 import com.google.common.base.Predicate;
+import org.apache.druid.query.filter.DruidPredicateFactory;
 import org.apache.druid.query.filter.ValueMatcher;
 import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
 import org.apache.druid.segment.data.IndexedInts;
@@ -68,9 +69,10 @@ public ValueMatcher makeValueMatcher(@Nullable String value)
     return new ValueMatcher()
     {
       @Override
-      public boolean matches()
+      public boolean matches(boolean includeUnknown)
       {
-        return Objects.equals(value, getObject());
+        final Object rowValue = getObject();
+        return (includeUnknown && rowValue == null) || Objects.equals(value, rowValue);
       }
 
       @Override
@@ -84,14 +86,17 @@ public void inspectRuntimeShape(RuntimeShapeInspector inspector)
   }
 
   @Override
-  public ValueMatcher makeValueMatcher(Predicate<String> predicate)
+  public ValueMatcher makeValueMatcher(DruidPredicateFactory predicateFactory)
   {
+    final Predicate<String> predicate = predicateFactory.makeStringPredicate();
     return new ValueMatcher()
     {
      @Override
-      public boolean matches()
+      public boolean matches(boolean includeUnknown)
      {
-        return predicate.apply((String) getObject());
+        final String rowValue = (String) getObject();
+        final boolean matchNull = includeUnknown && predicateFactory.isNullInputUnknown();
+        return (matchNull && rowValue == null) || predicate.apply(rowValue);
      }
 
      @Override
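
To summarize the pattern these hunks introduce (an editorial sketch, not Druid code): `matches(includeUnknown)` folds three-valued logic into a boolean by letting the caller decide whether UNKNOWN (here, a null input to a predicate that treats null as unknown) counts as a match.

```java
import java.util.function.Predicate;

final class ThreeValuedMatchSketch
{
  /**
   * Illustrative only. includeUnknown mirrors ValueMatcher.matches(boolean);
   * nullInputUnknown mirrors DruidPredicateFactory.isNullInputUnknown().
   */
  static boolean matches(
      String rowValue,              // may be null
      Predicate<String> predicate,
      boolean includeUnknown,
      boolean nullInputUnknown
  )
  {
    if (rowValue == null && nullInputUnknown) {
      // The predicate's result is UNKNOWN: count it as a match only when the
      // caller (for example, a negated filter under three-valued logic) asked
      // for unknown results to be included.
      return includeUnknown;
    }
    return predicate.test(rowValue);
  }
}
```

A negated filter can then request unknowns from its child and invert the result, keeping UNKNOWN rows excluded on both sides of the negation; the exact wiring lives in Druid's filter implementations.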
