
Commit

Merge remote-tracking branch 'upstream/branch-24.06' into update-action-version
YanxuanLiu committed May 10, 2024
2 parents 280c07e + 4dee2f8 commit 027412b
Showing 347 changed files with 6,048 additions and 1,098 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/blossom-ci.yml
@@ -35,21 +35,17 @@ jobs:
# This job only runs for pull request comments
if: contains( '\
abellina,\
andygrove,\
anfeng,\
firestarman,\
GaryShen2008,\
jbrennan333, \
jlowe,\
krajendrannv,\
kuhushukla,\
mythrocks,\
nartal1,\
nvdbaranec,\
NvTimLiu,\
razajafri,\
revans2,\
rongou,\
rwlee,\
sameerz,\
tgravescs,\
@@ -73,6 +69,7 @@ jobs:
yinqingh,\
parthosa,\
liurenjie1024,\
binmahone,\
', format('{0},', github.actor)) && github.event.comment.body == 'build'
steps:
- name: Check if comment is issued by authorized person
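The `if: contains(...)` condition above gates the CI job on an allow-list of GitHub handles: the actor's name is suffixed with a comma via `format('{0},', github.actor)` and looked up in the comma-terminated list, so a handle that is merely a prefix of an authorized one does not match. A minimal Scala sketch of the same membership test (the object name and the abbreviated list are illustrative, not taken from the workflow):

```scala
// Illustrative only: mirrors the workflow's contains(list, actor + ",") check.
object AuthorizedCheckSketch {
  // Abbreviated stand-in for the allow-list in blossom-ci.yml.
  private val authorized = Seq("abellina", "andygrove", "jlowe", "binmahone")

  def isAuthorized(actor: String): Boolean =
    authorized.map(_ + ",").mkString.contains(actor + ",")

  def main(args: Array[String]): Unit = {
    println(isAuthorized("binmahone")) // true: exact handle followed by ","
    println(isAuthorized("andy"))      // false: "andy," never appears in the list
  }
}
```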
481 changes: 176 additions & 305 deletions CHANGELOG.md

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions NOTICE
@@ -48,6 +48,17 @@ The Apache Software Foundation (http://www.apache.org/).

--------------------------------------------------------------------------------

This project includes software from the Apache Gluten project
(www.github.com/apache/incubator-gluten/).

Apache Gluten (Incubating)
Copyright (2024) The Apache Software Foundation

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

--------------------------------------------------------------------------------

This project includes code from Kite, developed at Cloudera, Inc. with
the following copyright notice:

4 changes: 2 additions & 2 deletions README.md
@@ -59,8 +59,8 @@ access to any of the memory that RMM is holding.
The Qualification and Profiling tools have been moved to
[nvidia/spark-rapids-tools](https://github.com/NVIDIA/spark-rapids-tools) repo.

Please refer to [Qualification tool documentation](https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-qualification-tool.html)
and [Profiling tool documentation](https://docs.nvidia.com/spark-rapids/user-guide/latest/spark-profiling-tool.html)
Please refer to [Qualification tool documentation](https://docs.nvidia.com/spark-rapids/user-guide/latest/qualification/overview.html)
and [Profiling tool documentation](https://docs.nvidia.com/spark-rapids/user-guide/latest/profiling/overview.html)
for more details on how to use the tools.

## Dependency for External Projects
17 changes: 17 additions & 0 deletions aggregator/pom.xml
@@ -728,6 +728,23 @@
      </dependency>
    </dependencies>
  </profile>
  <profile>
    <id>release343</id>
    <activation>
      <property>
        <name>buildver</name>
        <value>343</value>
      </property>
    </activation>
    <dependencies>
      <dependency>
        <groupId>com.nvidia</groupId>
        <artifactId>rapids-4-spark-delta-24x_${scala.binary.version}</artifactId>
        <version>${project.version}</version>
        <classifier>${spark.version.classifier}</classifier>
      </dependency>
    </dependencies>
  </profile>
  <profile>
    <id>release350</id>
    <activation>
4 changes: 2 additions & 2 deletions build/build-info
@@ -1,7 +1,7 @@
#!/usr/bin/env bash

#
# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ set -e
echo_build_properties() {
  echo version=$1
  echo cudf_version=$2
  echo user=$USER
  echo user=$(whoami)
  echo revision=$(git rev-parse HEAD)
  echo branch=$(git rev-parse --abbrev-ref HEAD)
  echo date=$(date -u +%Y-%m-%dT%H:%M:%SZ)
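The change from `$USER` to `$(whoami)` makes the user field robust when the environment variable is unset (common in containers and CI). A small Scala sketch of that distinction, for illustration only (the object name is made up):

```scala
import scala.sys.process._

// Illustration of why the script switched from $USER to whoami: the environment
// variable may be absent, while `whoami` asks the OS for the effective user.
object BuildUserSketch {
  def main(args: Array[String]): Unit = {
    val fromEnv = sys.env.get("USER") // None when $USER is not exported
    val fromOs  = "whoami".!!.trim    // resolved by the OS, independent of the environment
    println(s"user(env)=$fromEnv user(whoami)=$fromOs")
  }
}
```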
@@ -609,6 +609,15 @@ abstract class DataGen(var conf: ColumnConf,
    this
  }

  def setNullProbabilityRecursively(probability: Double): DataGen = {
    this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability))
    children.foreach {
      case (_, dataGen) =>
        dataGen.setNullProbabilityRecursively(probability)
    }
    this
  }

  /**
   * Set a specific location to seed mapping for the value generation.
   */
@@ -672,6 +681,7 @@ abstract class DataGen(var conf: ColumnConf,
* Get the default value generator for this specific data gen.
*/
protected def getValGen: GeneratorFunction
def children: Seq[(String, DataGen)]

/**
* Get the final ready to use GeneratorFunction for the data generator.
@@ -823,6 +833,8 @@ class BooleanGen(conf: ColumnConf,
override def dataType: DataType = BooleanType

override protected def getValGen: GeneratorFunction = BooleanGenFunc()

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -878,6 +890,8 @@ class ByteGen(conf: ColumnConf,
extends DataGen(conf, defaultValueRange) {
override def getValGen: GeneratorFunction = ByteGenFunc()
override def dataType: DataType = ByteType

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -935,6 +949,8 @@ class ShortGen(conf: ColumnConf,
override def getValGen: GeneratorFunction = ShortGenFunc()

override def dataType: DataType = ShortType

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -991,6 +1007,8 @@ class IntGen(conf: ColumnConf,
override def getValGen: GeneratorFunction = IntGenFunc()

override def dataType: DataType = IntegerType

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -1045,6 +1063,8 @@ class LongGen(conf: ColumnConf,
override def getValGen: GeneratorFunction = LongGenFunc()

override def dataType: DataType = LongType

override def children: Seq[(String, DataGen)] = Seq.empty
}

case class Decimal32GenFunc(
@@ -1284,6 +1304,8 @@ class DecimalGen(dt: DecimalType,
val max = DecimalGen.genMaxUnscaled(dt.precision)
DecimalGenFunc(dt.precision, dt.scale, -max, max)
}

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -1341,6 +1363,8 @@ class TimestampGen(conf: ColumnConf,
override protected def getValGen: GeneratorFunction = TimestampGenFunc()

override def dataType: DataType = TimestampType

override def children: Seq[(String, DataGen)] = Seq.empty
}

object BigDataGenConsts {
@@ -1418,6 +1442,8 @@ class DateGen(conf: ColumnConf,
override protected def getValGen: GeneratorFunction = DateGenFunc()

override def dataType: DataType = DateType

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -1440,6 +1466,8 @@ class DoubleGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)])
override def dataType: DataType = DoubleType

override protected def getValGen: GeneratorFunction = DoubleGenFunc()

override def children: Seq[(String, DataGen)] = Seq.empty
}

/**
@@ -1462,6 +1490,8 @@ class FloatGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)])
override def dataType: DataType = FloatType

override protected def getValGen: GeneratorFunction = FloatGenFunc()

override def children: Seq[(String, DataGen)] = Seq.empty
}

trait JSONType {
@@ -1648,6 +1678,8 @@ class StringGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)])
override def dataType: DataType = StringType

override protected def getValGen: GeneratorFunction = ASCIIGenFunc()

override def children: Seq[(String, DataGen)] = Seq.empty
}

case class StructGenFunc(childGens: Array[GeneratorFunction]) extends GeneratorFunction {
@@ -1752,6 +1784,8 @@ class ArrayGen(child: DataGen,
None
}
}

override def children: Seq[(String, DataGen)] = Seq(("data", child))
}

case class MapGenFunc(
@@ -1816,6 +1850,8 @@ class MapGen(key: DataGen,
None
}
}

override def children: Seq[(String, DataGen)] = Seq(("key", key), ("value", value))
}


@@ -1864,6 +1900,11 @@ class ColumnGen(val dataGen: DataGen) {
    this
  }

  def setNullProbabilityRecursively(probability: Double): ColumnGen = {
    dataGen.setNullProbabilityRecursively(probability)
    this
  }

  def setNullGen(f: NullGeneratorFunction): ColumnGen = {
    dataGen.setNullGen(f)
    this
@@ -1973,6 +2014,14 @@ class TableGen(val columns: Seq[(String, ColumnGen)], numRows: Long) {
    this
  }

  def setNullProbabilityRecursively(probability: Double): TableGen = {
    columns.foreach {
      case (_, columnGen) =>
        columnGen.setNullProbabilityRecursively(probability)
    }
    this
  }

/**
* Convert this table into a `DataFrame` that can be
* written out or used directly. Writing it out to parquet
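The diff above adds a `children` hook to every `DataGen` and a `setNullProbabilityRecursively` setter that walks it, so a null probability set on a `TableGen` cascades through its columns into nested array/map/struct generators. A self-contained toy sketch of that recursion pattern (toy types, not the real `DataGen`/`ColumnGen`/`TableGen` classes):

```scala
// Toy model of the recursion added in this diff: set the probability on the
// current node, then walk `children` so nested generators inherit it.
trait ToyGen {
  var nullProbability: Double = 0.0
  def children: Seq[(String, ToyGen)] = Seq.empty
  def setNullProbabilityRecursively(p: Double): ToyGen = {
    nullProbability = p
    children.foreach { case (_, child) => child.setNullProbabilityRecursively(p) }
    this
  }
}

class ToyLeaf extends ToyGen
class ToyArray(element: ToyGen) extends ToyGen {
  override def children: Seq[(String, ToyGen)] = Seq(("data", element))
}

object NullProbabilitySketch {
  def main(args: Array[String]): Unit = {
    val leaf = new ToyLeaf
    val arr  = new ToyArray(leaf)
    arr.setNullProbabilityRecursively(0.1)
    println(leaf.nullProbability) // 0.1 -- propagated through children
  }
}
```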
@@ -34,8 +34,10 @@
{"spark": "341"}
{"spark": "341db"}
{"spark": "342"}
{"spark": "343"}
{"spark": "350"}
{"spark": "351"}
{"spark": "400"}
spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.tests.datagen

11 changes: 6 additions & 5 deletions docs/additional-functionality/advanced_configs.md
@@ -129,12 +129,12 @@ Name | Description | Default Value | Applicable at
<a name="sql.json.read.decimal.enabled"></a>spark.rapids.sql.json.read.decimal.enabled|When reading a quoted string as a decimal Spark supports reading non-ascii unicode digits, and the RAPIDS Accelerator does not.|true|Runtime
<a name="sql.json.read.double.enabled"></a>spark.rapids.sql.json.read.double.enabled|JSON reading is not 100% compatible when reading doubles.|true|Runtime
<a name="sql.json.read.float.enabled"></a>spark.rapids.sql.json.read.float.enabled|JSON reading is not 100% compatible when reading floats.|true|Runtime
<a name="sql.json.read.mixedTypesAsString.enabled"></a>spark.rapids.sql.json.read.mixedTypesAsString.enabled|JSON reading is not 100% compatible when reading mixed types as string.|false|Runtime
<a name="sql.mode"></a>spark.rapids.sql.mode|Set the mode for the Rapids Accelerator. The supported modes are explainOnly and executeOnGPU. This config can not be changed at runtime, you must restart the application for it to take affect. The default mode is executeOnGPU, which means the RAPIDS Accelerator plugin convert the Spark operations and execute them on the GPU when possible. The explainOnly mode allows running queries on the CPU and the RAPIDS Accelerator will evaluate the queries as if it was going to run on the GPU. The explanations of what would have run on the GPU and why are output in log messages. When using explainOnly mode, the default explain output is ALL, this can be changed by setting spark.rapids.sql.explain. See that config for more details.|executeongpu|Startup
<a name="sql.optimizer.joinReorder.enabled"></a>spark.rapids.sql.optimizer.joinReorder.enabled|When enabled, joins may be reordered for improved query performance|true|Runtime
<a name="sql.python.gpu.enabled"></a>spark.rapids.sql.python.gpu.enabled|This is an experimental feature and is likely to change in the future. Enable (true) or disable (false) support for scheduling Python Pandas UDFs with GPU resources. When enabled, pandas UDFs are assumed to share the same GPU that the RAPIDs accelerator uses and will honor the python GPU configs|false|Runtime
<a name="sql.reader.chunked"></a>spark.rapids.sql.reader.chunked|Enable a chunked reader where possible. A chunked reader allows reading highly compressed data that could not be read otherwise, but at the expense of more GPU memory, and in some cases more GPU computation.|true|Runtime
<a name="sql.reader.chunked.subPage"></a>spark.rapids.sql.reader.chunked.subPage|Enable a chunked reader where possible for reading data that is smaller than the typical row group/page limit. Currently this only works for parquet.|true|Runtime
<a name="sql.reader.chunked"></a>spark.rapids.sql.reader.chunked|Enable a chunked reader where possible. A chunked reader allows reading highly compressed data that could not be read otherwise, but at the expense of more GPU memory, and in some cases more GPU computation. Currently this only supports ORC and Parquet formats.|true|Runtime
<a name="sql.reader.chunked.limitMemoryUsage"></a>spark.rapids.sql.reader.chunked.limitMemoryUsage|Enable a soft limit on the internal memory usage of the chunked reader (if being used). Such limit is calculated as the multiplication of 'spark.rapids.sql.batchSizeBytes' and 'spark.rapids.sql.reader.chunked.memoryUsageRatio'.For example, if batchSizeBytes is set to 1GB and memoryUsageRatio is 4, the chunked reader will try to keep its memory usage under 4GB.|None|Runtime
<a name="sql.reader.chunked.subPage"></a>spark.rapids.sql.reader.chunked.subPage|Enable a chunked reader where possible for reading data that is smaller than the typical row group/page limit. Currently deprecated and replaced by 'spark.rapids.sql.reader.chunked.limitMemoryUsage'.|None|Runtime
<a name="sql.reader.multithreaded.combine.sizeBytes"></a>spark.rapids.sql.reader.multithreaded.combine.sizeBytes|The target size in bytes to combine multiple small files together when using the MULTITHREADED parquet or orc reader. With combine disabled, the MULTITHREADED reader reads the files in parallel and sends individual files down to the GPU, but that can be inefficient for small files. When combine is enabled, files that are ready within spark.rapids.sql.reader.multithreaded.combine.waitTime together, up to this threshold size, are combined before sending down to GPU. This can be disabled by setting it to 0. Note that combine also will not go over the spark.rapids.sql.reader.batchSizeRows or spark.rapids.sql.reader.batchSizeBytes limits.|67108864|Runtime
<a name="sql.reader.multithreaded.combine.waitTime"></a>spark.rapids.sql.reader.multithreaded.combine.waitTime|When using the multithreaded parquet or orc reader with combine mode, how long to wait, in milliseconds, for more files to finish if haven't met the size threshold. Note that this will wait this amount of time from when the last file was available, so total wait time could be larger then this.|200|Runtime
<a name="sql.reader.multithreaded.read.keepOrder"></a>spark.rapids.sql.reader.multithreaded.read.keepOrder|When using the MULTITHREADED reader, if this is set to true we read the files in the same order Spark does, otherwise the order may not be the same. Now it is supported only for parquet and orc.|true|Runtime
@@ -184,6 +184,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.ArrayContains"></a>spark.rapids.sql.expression.ArrayContains|`array_contains`|Returns a boolean if the array contains the passed in key|true|None|
<a name="sql.expression.ArrayExcept"></a>spark.rapids.sql.expression.ArrayExcept|`array_except`|Returns an array of the elements in array1 but not in array2, without duplicates|true|This is not 100% compatible with the Spark version because the GPU implementation treats -0.0 and 0.0 as equal, but the CPU implementation currently does not (see SPARK-39845). Also, Apache Spark 3.1.3 fixed issue SPARK-36741 where NaNs in these set like operators were not treated as being equal. We have chosen to break with compatibility for the older versions of Spark in this instance and handle NaNs the same as 3.1.3+|
<a name="sql.expression.ArrayExists"></a>spark.rapids.sql.expression.ArrayExists|`exists`|Return true if any element satisfies the predicate LambdaFunction|true|None|
<a name="sql.expression.ArrayFilter"></a>spark.rapids.sql.expression.ArrayFilter|`filter`|Filter an input array using a given predicate|true|None|
<a name="sql.expression.ArrayIntersect"></a>spark.rapids.sql.expression.ArrayIntersect|`array_intersect`|Returns an array of the elements in the intersection of array1 and array2, without duplicates|true|This is not 100% compatible with the Spark version because the GPU implementation treats -0.0 and 0.0 as equal, but the CPU implementation currently does not (see SPARK-39845). Also, Apache Spark 3.1.3 fixed issue SPARK-36741 where NaNs in these set like operators were not treated as being equal. We have chosen to break with compatibility for the older versions of Spark in this instance and handle NaNs the same as 3.1.3+|
<a name="sql.expression.ArrayMax"></a>spark.rapids.sql.expression.ArrayMax|`array_max`|Returns the maximum value in the array|true|None|
<a name="sql.expression.ArrayMin"></a>spark.rapids.sql.expression.ArrayMin|`array_min`|Returns the minimum value in the array|true|None|
@@ -248,7 +249,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.FromUnixTime"></a>spark.rapids.sql.expression.FromUnixTime|`from_unixtime`|Get the string from a unix timestamp|true|None|
<a name="sql.expression.GetArrayItem"></a>spark.rapids.sql.expression.GetArrayItem| |Gets the field at `ordinal` in the Array|true|None|
<a name="sql.expression.GetArrayStructFields"></a>spark.rapids.sql.expression.GetArrayStructFields| |Extracts the `ordinal`-th fields of all array elements for the data with the type of array of struct|true|None|
<a name="sql.expression.GetJsonObject"></a>spark.rapids.sql.expression.GetJsonObject|`get_json_object`|Extracts a json object from path|true|None|
<a name="sql.expression.GetJsonObject"></a>spark.rapids.sql.expression.GetJsonObject|`get_json_object`|Extracts a json object from path|false|This is disabled by default because Experimental feature that could be unstable or have performance issues.|
<a name="sql.expression.GetMapValue"></a>spark.rapids.sql.expression.GetMapValue| |Gets Value from a Map based on a key|true|None|
<a name="sql.expression.GetStructField"></a>spark.rapids.sql.expression.GetStructField| |Gets the named field of the struct|true|None|
<a name="sql.expression.GetTimestamp"></a>spark.rapids.sql.expression.GetTimestamp| |Gets timestamps from strings using given pattern.|true|None|
@@ -269,7 +270,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.IsNotNull"></a>spark.rapids.sql.expression.IsNotNull|`isnotnull`|Checks if a value is not null|true|None|
<a name="sql.expression.IsNull"></a>spark.rapids.sql.expression.IsNull|`isnull`|Checks if a value is null|true|None|
<a name="sql.expression.JsonToStructs"></a>spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|false|This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case|
<a name="sql.expression.JsonTuple"></a>spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because JsonTuple on the GPU does not support all of the normalization that the CPU supports.|
<a name="sql.expression.JsonTuple"></a>spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because Experimental feature that could be unstable or have performance issues.|
<a name="sql.expression.KnownFloatingPointNormalized"></a>spark.rapids.sql.expression.KnownFloatingPointNormalized| |Tag to prevent redundant normalization|true|None|
<a name="sql.expression.KnownNotNull"></a>spark.rapids.sql.expression.KnownNotNull| |Tag an expression as known to not be null|true|None|
<a name="sql.expression.Lag"></a>spark.rapids.sql.expression.Lag|`lag`|Window function that returns N entries behind this one|true|None|
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@
# - ROCKY_VER: Rocky Linux OS version

ARG CUDA_VER=11.8.0
ARG UCX_VER=1.15.0
ARG UCX_VER=1.16.0
ARG UCX_CUDA_VER=11
ARG UCX_ARCH=x86_64
ARG ROCKY_VER=8
@@ -38,6 +38,5 @@ RUN ls /usr/lib
RUN mkdir /tmp/ucx_install && cd /tmp/ucx_install && \
wget https://github.com/openucx/ucx/releases/download/v$UCX_VER/ucx-$UCX_VER-centos8-mofed5-cuda$UCX_CUDA_VER-$UCX_ARCH.tar.bz2 && \
tar -xvf *.bz2 && \
rpm -i ucx-$UCX_VER*.rpm && \
rpm -i ucx-cuda-$UCX_VER*.rpm --nodeps && \
rpm -i `ls ucx-[0-9]*.rpm ucx-cuda-[0-9]*.rpm` --nodeps && \
rm -rf /tmp/ucx_install

