From 5068921bd54ae60aed16f8df129278796c840fe8 Mon Sep 17 00:00:00 2001 From: nvauto <70000568+nvauto@users.noreply.github.com> Date: Mon, 25 Nov 2024 06:15:06 +0000 Subject: [PATCH 01/37] Init version 25.02.0-SNAPSHOT Keep the rapids JNI and private dependency version at 24.12.0-SNAPSHOT until the nightly CI for the branch-25.02 branch is complete. Track the dependency update process at: https://github.com/NVIDIA/spark-rapids/issues/11755 Signed-off-by: nvauto <70000568+nvauto@users.noreply.github.com> --- CONTRIBUTING.md | 8 ++++---- README.md | 2 +- aggregator/pom.xml | 4 ++-- api_validation/pom.xml | 4 ++-- datagen/README.md | 6 +++--- datagen/ScaleTest.md | 2 +- datagen/pom.xml | 4 ++-- delta-lake/delta-20x/pom.xml | 4 ++-- delta-lake/delta-21x/pom.xml | 4 ++-- delta-lake/delta-22x/pom.xml | 4 ++-- delta-lake/delta-23x/pom.xml | 4 ++-- delta-lake/delta-24x/pom.xml | 4 ++-- delta-lake/delta-spark330db/pom.xml | 4 ++-- delta-lake/delta-spark332db/pom.xml | 4 ++-- delta-lake/delta-spark341db/pom.xml | 4 ++-- delta-lake/delta-spark350db143/pom.xml | 4 ++-- delta-lake/delta-stub/pom.xml | 4 ++-- dist/pom.xml | 4 ++-- docs/configs.md | 2 +- docs/dev/shims.md | 12 ++++++------ docs/dev/testing.md | 4 ++-- integration_tests/README.md | 6 +++--- integration_tests/ScaleTest.md | 2 +- integration_tests/pom.xml | 4 ++-- jdk-profiles/pom.xml | 4 ++-- jenkins/databricks/create.py | 2 +- jenkins/databricks/init_cudf_udf.sh | 1 + jenkins/version-def.sh | 4 ++-- pom.xml | 3 ++- scala2.13/aggregator/pom.xml | 4 ++-- scala2.13/api_validation/pom.xml | 4 ++-- scala2.13/datagen/pom.xml | 4 ++-- scala2.13/delta-lake/delta-20x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-21x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-22x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-23x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-24x/pom.xml | 4 ++-- scala2.13/delta-lake/delta-spark330db/pom.xml | 4 ++-- scala2.13/delta-lake/delta-spark332db/pom.xml | 4 ++-- scala2.13/delta-lake/delta-spark341db/pom.xml | 4 ++-- scala2.13/delta-lake/delta-spark350db143/pom.xml | 4 ++-- scala2.13/delta-lake/delta-stub/pom.xml | 4 ++-- scala2.13/dist/pom.xml | 4 ++-- scala2.13/integration_tests/pom.xml | 4 ++-- scala2.13/jdk-profiles/pom.xml | 4 ++-- scala2.13/pom.xml | 3 ++- scala2.13/shim-deps/cloudera/pom.xml | 4 ++-- scala2.13/shim-deps/databricks/pom.xml | 4 ++-- scala2.13/shim-deps/pom.xml | 4 ++-- scala2.13/shuffle-plugin/pom.xml | 4 ++-- scala2.13/sql-plugin-api/pom.xml | 4 ++-- scala2.13/sql-plugin/pom.xml | 4 ++-- scala2.13/tests/pom.xml | 4 ++-- scala2.13/tools/pom.xml | 4 ++-- scala2.13/udf-compiler/pom.xml | 4 ++-- shim-deps/cloudera/pom.xml | 4 ++-- shim-deps/databricks/pom.xml | 4 ++-- shim-deps/pom.xml | 4 ++-- shuffle-plugin/pom.xml | 4 ++-- sql-plugin-api/pom.xml | 4 ++-- .../scala/com/nvidia/spark/rapids/ShimLoader.scala | 8 ++++---- sql-plugin/pom.xml | 4 ++-- .../main/scala/com/nvidia/spark/rapids/Plugin.scala | 6 +++--- .../scala/com/nvidia/spark/rapids/RapidsConf.scala | 2 +- tests/pom.xml | 4 ++-- tools/pom.xml | 4 ++-- udf-compiler/pom.xml | 4 ++-- 67 files changed, 138 insertions(+), 135 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 83b30747abd..e4077ee5994 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -127,15 +127,15 @@ mvn -pl dist -PnoSnapshots package -DskipTests Verify that shim-specific classes are hidden from a conventional classloader. 
```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl Error: class not found: com.nvidia.spark.rapids.shims.SparkShimImpl ``` However, its bytecode can be loaded if prefixed with `spark3XY` not contained in the package name ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 -Warning: File dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +Warning: File dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` @@ -178,7 +178,7 @@ mvn package -pl dist -am -Dbuildver=340 -DallowConventionalDistJar=true Verify `com.nvidia.spark.rapids.shims.SparkShimImpl` is conventionally loadable: ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +$ javap -cp dist/target/rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` diff --git a/README.md b/README.md index 94b73565190..61914e49df0 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency. com.nvidia rapids-4-spark_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index c7a6c220247..a47745776bc 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT aggregator diff --git a/api_validation/pom.xml b/api_validation/pom.xml index cddcf0c1ce1..f3339375806 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT api_validation diff --git a/datagen/README.md b/datagen/README.md index 022cc2f1eba..1c49c8db58e 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. 
After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 24.12.0 release would be -`target/datagen_2.12-24.12.0-spark330.jar` +for example a Spark 3.3.0 jar for the 25.02.0 release would be +`target/datagen_2.12-25.02.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-24.12.0-spark330.jar +spark-shell --jars target/datagen_2.12-25.02.0-spark330.jar ``` After that you should be good to go. diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index a728ad9a13e..8e692173f5f 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-24.12.0-SNAPSHOT-spark332.jar \ +./target/datagen_2.12-25.02.0-SNAPSHOT-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index 9bdf897cfd7..fc2d8bc677c 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.12 Data Generator Tools for generating large amounts of data - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT datagen diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml index 1d41911c767..ba5443a7be2 100644 --- a/delta-lake/delta-20x/pom.xml +++ b/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml index 7514088ca3a..602686d79ab 100644 --- a/delta-lake/delta-21x/pom.xml +++ b/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml index 2ed0ea3b159..7867c573607 100644 --- a/delta-lake/delta-22x/pom.xml +++ b/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml index 1daebdd0efb..f537de0be36 100644 --- a/delta-lake/delta-23x/pom.xml +++ b/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 
25.02.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml index 36ec92b70c0..443681b6cb3 100644 --- a/delta-lake/delta-24x/pom.xml +++ b/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-24x diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index 95f54c6807c..4812c9d0097 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.12 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml index 4d792ee1ca5..306553caa43 100644 --- a/delta-lake/delta-spark332db/pom.xml +++ b/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.12 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml index 4b229e2e5b5..c7b4a4e2738 100644 --- a/delta-lake/delta-spark341db/pom.xml +++ b/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/delta-lake/delta-spark350db143/pom.xml b/delta-lake/delta-spark350db143/pom.xml index 1bca394b67c..1e166244e1e 100644 --- a/delta-lake/delta-spark350db143/pom.xml +++ b/delta-lake/delta-spark350db143/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark350db143_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml index 6d0471f9f01..31b8e03b366 100644 --- a/delta-lake/delta-stub/pom.xml +++ b/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.12 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/dist/pom.xml b/dist/pom.xml index d628dd4ba3b..b34292a25cd 100644 --- a/dist/pom.xml +++ b/dist/pom.xml 
@@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.12 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/docs/configs.md b/docs/configs.md index 7f9544496c4..04aecb41f02 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports. On startup use: `--conf [conf key]=[conf value]`. For example: ``` -${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar \ +${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.concurrentGpuTasks=2 ``` diff --git a/docs/dev/shims.md b/docs/dev/shims.md index 0d62eb4cae8..24252df607e 100644 --- a/docs/dev/shims.md +++ b/docs/dev/shims.md @@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi Spark 3.0.2's URLs: ```text -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark302/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark-shared/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark302/ ``` Spark 3.2.0's URLs : ```text -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ -jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark320/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark-shared/ +jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark320/ ``` ### Late Inheritance in Public Classes diff --git a/docs/dev/testing.md b/docs/dev/testing.md index 9f1c33091f1..fe6c0b94c1f 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -5,5 +5,5 @@ nav_order: 2 parent: Developer Overview --- An overview of testing can be found within the repository at: -* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-24.12/tests#readme) -* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-24.12/integration_tests#readme) +* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-25.02/tests#readme) +* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-25.02/integration_tests#readme) diff --git a/integration_tests/README.md b/integration_tests/README.md index f5237de21a0..031b318bddf 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -263,7 +263,7 @@ individually, so you don't risk running unit tests along with the integration te http://www.scalatest.org/user_guide/using_the_scalatest_shell ```shell -spark-shell --jars rapids-4-spark-tests_2.12-24.12.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-24.12.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar +spark-shell --jars rapids-4-spark-tests_2.12-25.02.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-25.02.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar ``` First you import the `scalatest_shell` and tell the tests where they can find the test files you @@ -286,7 +286,7 @@ If you just want to verify the SQL replacement is working you will 
need to add t assumes CUDA 11.0 is being used and the Spark distribution is built with Scala 2.12. ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar" ./runtests.py +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar" ./runtests.py ``` You don't have to enable the plugin for this to work, the test framework will do that for you. @@ -443,7 +443,7 @@ To run cudf_udf tests, need following configuration changes: As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0: ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-24.12.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-25.02.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf ``` ### Enabling fuzz tests diff --git a/integration_tests/ScaleTest.md b/integration_tests/ScaleTest.md index d9f47fab5cb..8b91331abc9 100644 --- a/integration_tests/ScaleTest.md +++ b/integration_tests/ScaleTest.md @@ -97,7 +97,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ --class com.nvidia.spark.rapids.tests.scaletest.ScaleTest \ -./target/rapids-4-spark-integration-tests_2.12-24.12.0-SNAPSHOT-spark332.jar \ +./target/rapids-4-spark-integration-tests_2.12-25.02.0-SNAPSHOT-spark332.jar \ 10 \ 100 \ parquet \ diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index aaff3455298..6054c30c795 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT integration_tests diff --git a/jdk-profiles/pom.xml b/jdk-profiles/pom.xml index caaa47245a8..b45da24bd58 100644 --- a/jdk-profiles/pom.xml +++ b/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia rapids-4-spark-jdk-profiles_2.12 pom Shim JDK Profiles - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT jdk8 diff --git a/jenkins/databricks/create.py b/jenkins/databricks/create.py index 44c4c856466..7815f170dfb 100644 --- a/jenkins/databricks/create.py +++ b/jenkins/databricks/create.py @@ -27,7 +27,7 @@ def main(): workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com' token = '' sshkey = '' - cluster_name = 'CI-GPU-databricks-24.12.0-SNAPSHOT' + cluster_name = 'CI-GPU-databricks-25.02.0-SNAPSHOT' idletime = 240 runtime = '13.3.x-gpu-ml-scala2.12' num_workers = 1 diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index 16b90b95c0e..94ca7473143 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ 
b/jenkins/databricks/init_cudf_udf.sh @@ -20,6 +20,7 @@ set -ex +# TODO: https://github.com/NVIDIA/spark-rapids/issues/11755 CUDF_VER=${CUDF_VER:-24.12} CUDA_VER=${CUDA_VER:-11.8} diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh index 8600a2f8689..6c9a9fac4cb 100755 --- a/jenkins/version-def.sh +++ b/jenkins/version-def.sh @@ -29,8 +29,8 @@ IFS=$PRE_IFS CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"} CLASSIFIER=${CLASSIFIER:-"$CUDA_CLASSIFIER"} # default as CUDA_CLASSIFIER for compatibility -PROJECT_VER=${PROJECT_VER:-"24.12.0-SNAPSHOT"} -PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.12.0-SNAPSHOT"} +PROJECT_VER=${PROJECT_VER:-"25.02.0-SNAPSHOT"} +PROJECT_TEST_VER=${PROJECT_TEST_VER:-"25.02.0-SNAPSHOT"} SPARK_VER=${SPARK_VER:-"3.2.0"} SPARK_VER_213=${SPARK_VER_213:-"3.3.0"} # Make a best attempt to set the default value for the shuffle shim. diff --git a/pom.xml b/pom.xml index 12828404031..79a6a765470 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.12 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT pom https://nvidia.github.io/spark-rapids/ @@ -829,6 +829,7 @@ spark${buildver} cuda11 ${cuda.version} + 24.12.0-SNAPSHOT 24.12.0-SNAPSHOT 2.12 diff --git a/scala2.13/aggregator/pom.xml b/scala2.13/aggregator/pom.xml index 74956108068..d9db5bcf14e 100644 --- a/scala2.13/aggregator/pom.xml +++ b/scala2.13/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.13 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT aggregator diff --git a/scala2.13/api_validation/pom.xml b/scala2.13/api_validation/pom.xml index f236345c301..2a2e08c6071 100644 --- a/scala2.13/api_validation/pom.xml +++ b/scala2.13/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT api_validation diff --git a/scala2.13/datagen/pom.xml b/scala2.13/datagen/pom.xml index d53ebc014c7..e8a07a79841 100644 --- a/scala2.13/datagen/pom.xml +++ b/scala2.13/datagen/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.13 Data Generator Tools for generating large amounts of data - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT datagen diff --git a/scala2.13/delta-lake/delta-20x/pom.xml b/scala2.13/delta-lake/delta-20x/pom.xml index 20c77038f40..57551a3d164 100644 --- a/scala2.13/delta-lake/delta-20x/pom.xml +++ b/scala2.13/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/scala2.13/delta-lake/delta-21x/pom.xml b/scala2.13/delta-lake/delta-21x/pom.xml index 75a41cfa8e0..6fbcf6bb8d1 100644 --- a/scala2.13/delta-lake/delta-21x/pom.xml +++ b/scala2.13/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml 
rapids-4-spark-delta-21x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/scala2.13/delta-lake/delta-22x/pom.xml b/scala2.13/delta-lake/delta-22x/pom.xml index c6111eb51a0..ff919c7b48c 100644 --- a/scala2.13/delta-lake/delta-22x/pom.xml +++ b/scala2.13/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/scala2.13/delta-lake/delta-23x/pom.xml b/scala2.13/delta-lake/delta-23x/pom.xml index 84d1d7275c2..fe927c7a092 100644 --- a/scala2.13/delta-lake/delta-23x/pom.xml +++ b/scala2.13/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/scala2.13/delta-lake/delta-24x/pom.xml b/scala2.13/delta-lake/delta-24x/pom.xml index 0ffe6c84e10..781f7975523 100644 --- a/scala2.13/delta-lake/delta-24x/pom.xml +++ b/scala2.13/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-24x diff --git a/scala2.13/delta-lake/delta-spark330db/pom.xml b/scala2.13/delta-lake/delta-spark330db/pom.xml index 3c30b1b0dc8..d6f2ee68e10 100644 --- a/scala2.13/delta-lake/delta-spark330db/pom.xml +++ b/scala2.13/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.13 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/scala2.13/delta-lake/delta-spark332db/pom.xml b/scala2.13/delta-lake/delta-spark332db/pom.xml index a3501c1003c..de53ab84f32 100644 --- a/scala2.13/delta-lake/delta-spark332db/pom.xml +++ b/scala2.13/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.13 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/scala2.13/delta-lake/delta-spark341db/pom.xml b/scala2.13/delta-lake/delta-spark341db/pom.xml index c740362b11f..bd6a72ea04b 100644 --- a/scala2.13/delta-lake/delta-spark341db/pom.xml +++ b/scala2.13/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT 
../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.13 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/scala2.13/delta-lake/delta-spark350db143/pom.xml b/scala2.13/delta-lake/delta-spark350db143/pom.xml index d6046b64578..c19c2e0ad21 100644 --- a/scala2.13/delta-lake/delta-spark350db143/pom.xml +++ b/scala2.13/delta-lake/delta-spark350db143/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../shim-deps/pom.xml rapids-4-spark-delta-spark350db143_2.13 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT false diff --git a/scala2.13/delta-lake/delta-stub/pom.xml b/scala2.13/delta-lake/delta-stub/pom.xml index 2f90b85acd7..2a334190cea 100644 --- a/scala2.13/delta-lake/delta-stub/pom.xml +++ b/scala2.13/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.13 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml index 15df1ec69f8..0c8f12a9214 100644 --- a/scala2.13/dist/pom.xml +++ b/scala2.13/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.13 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 88ab2531235..67afb46c779 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT integration_tests diff --git a/scala2.13/jdk-profiles/pom.xml b/scala2.13/jdk-profiles/pom.xml index 793bf0fb327..6ec2f369b96 100644 --- a/scala2.13/jdk-profiles/pom.xml +++ b/scala2.13/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia rapids-4-spark-jdk-profiles_2.13 pom Shim JDK Profiles - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT jdk8 diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index d52c8658423..d1368d81d97 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.13 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT pom https://nvidia.github.io/spark-rapids/ @@ -829,6 +829,7 @@ spark${buildver} cuda11 ${cuda.version} + 24.12.0-SNAPSHOT 24.12.0-SNAPSHOT 2.13 diff --git a/scala2.13/shim-deps/cloudera/pom.xml b/scala2.13/shim-deps/cloudera/pom.xml index 95c49a2b1ca..be06f76c136 100644 --- a/scala2.13/shim-deps/cloudera/pom.xml +++ b/scala2.13/shim-deps/cloudera/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-cdh-bom 
pom CDH Shim Dependencies - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/cloudera diff --git a/scala2.13/shim-deps/databricks/pom.xml b/scala2.13/shim-deps/databricks/pom.xml index 9d6ff787ef1..4feb4045327 100644 --- a/scala2.13/shim-deps/databricks/pom.xml +++ b/scala2.13/shim-deps/databricks/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-db-bom pom Databricks Shim Dependencies - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/databricks diff --git a/scala2.13/shim-deps/pom.xml b/scala2.13/shim-deps/pom.xml index 66cfa22afea..6c7a4b991a7 100644 --- a/scala2.13/shim-deps/pom.xml +++ b/scala2.13/shim-deps/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-shim-deps-parent_2.13 pom Shim Dependencies Profiles - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT release321cdh diff --git a/scala2.13/shuffle-plugin/pom.xml b/scala2.13/shuffle-plugin/pom.xml index b9e76b2f068..6f915a66212 100644 --- a/scala2.13/shuffle-plugin/pom.xml +++ b/scala2.13/shuffle-plugin/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-shuffle_2.13 RAPIDS Accelerator for Apache Spark Shuffle Plugin Accelerated shuffle plugin for the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT shuffle-plugin diff --git a/scala2.13/sql-plugin-api/pom.xml b/scala2.13/sql-plugin-api/pom.xml index 3c48d7c13f2..91802901fc0 100644 --- a/scala2.13/sql-plugin-api/pom.xml +++ b/scala2.13/sql-plugin-api/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql-plugin-api_2.13 Module for Non-Shimmable API - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT sql-plugin-api false diff --git a/scala2.13/sql-plugin/pom.xml b/scala2.13/sql-plugin/pom.xml index b96e1517690..e0ceea49776 100644 --- a/scala2.13/sql-plugin/pom.xml +++ b/scala2.13/sql-plugin/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql_2.13 RAPIDS Accelerator for Apache Spark SQL Plugin The RAPIDS SQL plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT sql-plugin diff --git a/scala2.13/tests/pom.xml b/scala2.13/tests/pom.xml index 377dc4671fb..6aa80019d27 100644 --- a/scala2.13/tests/pom.xml +++ b/scala2.13/tests/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-tests_2.13 RAPIDS Accelerator for Apache Spark Tests RAPIDS plugin for Apache Spark integration tests - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT tests diff --git a/scala2.13/tools/pom.xml b/scala2.13/tools/pom.xml index 0c3179e09ff..866987242ae 100644 --- a/scala2.13/tools/pom.xml +++ b/scala2.13/tools/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-tools-support pom RAPIDS Accelerator for Apache Spark Tools Support Supporting code for RAPIDS Accelerator tools - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/scala2.13/udf-compiler/pom.xml b/scala2.13/udf-compiler/pom.xml index 10ad46a48aa..09cce00ef27 100644 --- a/scala2.13/udf-compiler/pom.xml +++ b/scala2.13/udf-compiler/pom.xml @@ -21,13 +21,13 @@ com.nvidia 
rapids-4-spark-shim-deps-parent_2.13 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-udf_2.13 RAPIDS Accelerator for Apache Spark Scala UDF Plugin The RAPIDS Scala UDF plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT udf-compiler diff --git a/shim-deps/cloudera/pom.xml b/shim-deps/cloudera/pom.xml index a9b71366927..0505fed6bac 100644 --- a/shim-deps/cloudera/pom.xml +++ b/shim-deps/cloudera/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-cdh-bom pom CDH Shim Dependencies - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/cloudera diff --git a/shim-deps/databricks/pom.xml b/shim-deps/databricks/pom.xml index edfa3d6f896..82618ba65cd 100644 --- a/shim-deps/databricks/pom.xml +++ b/shim-deps/databricks/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-db-bom pom Databricks Shim Dependencies - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/databricks diff --git a/shim-deps/pom.xml b/shim-deps/pom.xml index d90dfc34190..e5a047f5169 100644 --- a/shim-deps/pom.xml +++ b/shim-deps/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-shim-deps-parent_2.12 pom Shim Dependencies Profiles - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT release321cdh diff --git a/shuffle-plugin/pom.xml b/shuffle-plugin/pom.xml index 69d8f1b765b..fd92b6b0957 100644 --- a/shuffle-plugin/pom.xml +++ b/shuffle-plugin/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-shuffle_2.12 RAPIDS Accelerator for Apache Spark Shuffle Plugin Accelerated shuffle plugin for the RAPIDS plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT shuffle-plugin diff --git a/sql-plugin-api/pom.xml b/sql-plugin-api/pom.xml index 090a809fc05..b1080ef7d39 100644 --- a/sql-plugin-api/pom.xml +++ b/sql-plugin-api/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql-plugin-api_2.12 Module for Non-Shimmable API - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT sql-plugin-api false diff --git a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala index 23a9ece7468..533fee141c5 100644 --- a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala +++ b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala @@ -48,11 +48,11 @@ import org.apache.spark.util.MutableURLClassLoader Each shim can see a consistent parallel world without conflicts by referencing only one conflicting directory. 
E.g., Spark 3.2.0 Shim will use - jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ - jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark320/ + jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark-shared/ + jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark320/ Spark 3.3.1 will use - jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ - jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark331/ + jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark-shared/ + jar:file:/home/spark/rapids-4-spark_2.12-25.02.0.jar!/spark331/ Using these Jar URL's allows referencing different bytecode produced from identical sources by incompatible Scala / Spark dependencies. */ diff --git a/sql-plugin/pom.xml b/sql-plugin/pom.xml index c9cfb8ce99f..2b0a62a5b90 100644 --- a/sql-plugin/pom.xml +++ b/sql-plugin/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-sql_2.12 RAPIDS Accelerator for Apache Spark SQL Plugin The RAPIDS SQL plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT sql-plugin diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index e20b21da520..331835a6634 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -133,11 +133,11 @@ object RapidsPluginUtils extends Logging { val possibleRapidsJarURLs = classloader.getResources(propName).asScala.toSet.toSeq.filter { url => { val urlPath = url.toString - // Filter out submodule jars, e.g. rapids-4-spark-aggregator_2.12-24.12.0-spark341.jar, + // Filter out submodule jars, e.g. rapids-4-spark-aggregator_2.12-25.02.0-spark341.jar, // and files stored under subdirs of '!/', e.g. - // rapids-4-spark_2.12-24.12.0-cuda11.jar!/spark330/rapids4spark-version-info.properties + // rapids-4-spark_2.12-25.02.0-cuda11.jar!/spark330/rapids4spark-version-info.properties // We only want to find the main jar, e.g. - // rapids-4-spark_2.12-24.12.0-cuda11.jar!/rapids4spark-version-info.properties + // rapids-4-spark_2.12-25.02.0-cuda11.jar!/rapids4spark-version-info.properties !urlPath.contains("rapids-4-spark-") && urlPath.endsWith("!/" + propName) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index e279385be82..b77beb2e2bd 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -2441,7 +2441,7 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. |On startup use: `--conf [conf key]=[conf value]`. 
For example: | |``` - |${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.12.0-SNAPSHOT-cuda11.jar \ + |${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-25.02.0-SNAPSHOT-cuda11.jar \ |--conf spark.plugins=com.nvidia.spark.SQLPlugin \ |--conf spark.rapids.sql.concurrentGpuTasks=2 |``` diff --git a/tests/pom.xml b/tests/pom.xml index a8fef6b7930..bc67c0c3de0 100644 --- a/tests/pom.xml +++ b/tests/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-tests_2.12 RAPIDS Accelerator for Apache Spark Tests RAPIDS plugin for Apache Spark integration tests - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT tests diff --git a/tools/pom.xml b/tools/pom.xml index df919f112ef..23bae1bcd8d 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-tools-support pom RAPIDS Accelerator for Apache Spark Tools Support Supporting code for RAPIDS Accelerator tools - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT com.nvidia diff --git a/udf-compiler/pom.xml b/udf-compiler/pom.xml index afe827baf78..a32c1d3813f 100644 --- a/udf-compiler/pom.xml +++ b/udf-compiler/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-udf_2.12 RAPIDS Accelerator for Apache Spark Scala UDF Plugin The RAPIDS Scala UDF plugin for Apache Spark - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT udf-compiler From 65394412f54f003c5be7b1a572a8e38164a5f025 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 25 Nov 2024 12:57:40 -0600 Subject: [PATCH 02/37] Enable JSON Scan and from_json by default (#11753) Signed-off-by: Robert (Bobby) Evans Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- .../advanced_configs.md | 6 +- docs/compatibility.md | 161 ++++++++---------- docs/supported_ops.md | 4 +- .../nvidia/spark/rapids/GpuOverrides.scala | 8 +- .../com/nvidia/spark/rapids/RapidsConf.scala | 4 +- .../320/supportedDataSource.csv | 2 +- tools/generated_files/320/supportedExprs.csv | 4 +- .../321/supportedDataSource.csv | 2 +- tools/generated_files/321/supportedExprs.csv | 4 +- .../321cdh/supportedDataSource.csv | 2 +- .../generated_files/321cdh/supportedExprs.csv | 4 +- .../322/supportedDataSource.csv | 2 +- tools/generated_files/322/supportedExprs.csv | 4 +- .../323/supportedDataSource.csv | 2 +- tools/generated_files/323/supportedExprs.csv | 4 +- .../324/supportedDataSource.csv | 2 +- tools/generated_files/324/supportedExprs.csv | 4 +- .../330/supportedDataSource.csv | 2 +- tools/generated_files/330/supportedExprs.csv | 4 +- .../330cdh/supportedDataSource.csv | 2 +- .../generated_files/330cdh/supportedExprs.csv | 4 +- .../331/supportedDataSource.csv | 2 +- tools/generated_files/331/supportedExprs.csv | 4 +- .../332/supportedDataSource.csv | 2 +- tools/generated_files/332/supportedExprs.csv | 4 +- .../332cdh/supportedDataSource.csv | 2 +- .../generated_files/332cdh/supportedExprs.csv | 4 +- .../333/supportedDataSource.csv | 2 +- tools/generated_files/333/supportedExprs.csv | 4 +- .../334/supportedDataSource.csv | 2 +- tools/generated_files/334/supportedExprs.csv | 4 +- .../340/supportedDataSource.csv | 2 +- tools/generated_files/340/supportedExprs.csv | 4 +- .../341/supportedDataSource.csv | 2 +- tools/generated_files/341/supportedExprs.csv | 4 +- .../342/supportedDataSource.csv | 2 +- 
tools/generated_files/342/supportedExprs.csv | 4 +- .../343/supportedDataSource.csv | 2 +- tools/generated_files/343/supportedExprs.csv | 4 +- .../344/supportedDataSource.csv | 2 +- tools/generated_files/344/supportedExprs.csv | 4 +- .../350/supportedDataSource.csv | 2 +- tools/generated_files/350/supportedExprs.csv | 4 +- .../351/supportedDataSource.csv | 2 +- tools/generated_files/351/supportedExprs.csv | 4 +- .../352/supportedDataSource.csv | 2 +- tools/generated_files/352/supportedExprs.csv | 4 +- .../353/supportedDataSource.csv | 2 +- tools/generated_files/353/supportedExprs.csv | 4 +- .../400/supportedDataSource.csv | 2 +- tools/generated_files/400/supportedExprs.csv | 4 +- tools/generated_files/supportedDataSource.csv | 2 +- tools/generated_files/supportedExprs.csv | 4 +- 53 files changed, 151 insertions(+), 176 deletions(-) diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md index f3157b46099..07346a5b850 100644 --- a/docs/additional-functionality/advanced_configs.md +++ b/docs/additional-functionality/advanced_configs.md @@ -95,8 +95,8 @@ Name | Description | Default Value | Applicable at spark.rapids.sql.format.hive.text.write.enabled|When set to false disables Hive text table write acceleration|false|Runtime spark.rapids.sql.format.iceberg.enabled|When set to false disables all Iceberg acceleration|true|Runtime spark.rapids.sql.format.iceberg.read.enabled|When set to false disables Iceberg input acceleration|true|Runtime -spark.rapids.sql.format.json.enabled|When set to true enables all json input and output acceleration. (only input is currently supported anyways)|false|Runtime -spark.rapids.sql.format.json.read.enabled|When set to true enables json input acceleration|false|Runtime +spark.rapids.sql.format.json.enabled|When set to true enables all json input and output acceleration. (only input is currently supported anyways)|true|Runtime +spark.rapids.sql.format.json.read.enabled|When set to true enables json input acceleration|true|Runtime spark.rapids.sql.format.orc.enabled|When set to false disables all orc input and output acceleration|true|Runtime spark.rapids.sql.format.orc.floatTypesToString.enable|When reading an ORC file, the source data schemas(schemas of ORC file) may differ from the target schemas (schemas of the reader), we need to handle the castings from source type to target type. Since float/double numbers in GPU have different precision with CPU, when casting float/double to string, the result of GPU is different from result of CPU spark. Its default value is `true` (this means the strings result will differ from result of CPU). If it's set `false` explicitly and there exists casting from float/double to string in the job, then such behavior will cause an exception, and the job will fail.|true|Runtime spark.rapids.sql.format.orc.multiThreadedRead.maxNumFilesParallel|A limit on the maximum number of files per task processed in parallel on the CPU side before the file is sent to the GPU. This affects the amount of host memory used when reading the files in parallel. 
Used with MULTITHREADED reader, see spark.rapids.sql.format.orc.reader.type.|2147483647|Runtime @@ -278,7 +278,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.IsNaN|`isnan`|Checks if a value is NaN|true|None| spark.rapids.sql.expression.IsNotNull|`isnotnull`|Checks if a value is not null|true|None| spark.rapids.sql.expression.IsNull|`isnull`|Checks if a value is null|true|None| -spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|false|This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case| +spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|true|None| spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because Experimental feature that could be unstable or have performance issues.| spark.rapids.sql.expression.KnownFloatingPointNormalized| |Tag to prevent redundant normalization|true|None| spark.rapids.sql.expression.KnownNotNull| |Tag an expression as known to not be null|true|None| diff --git a/docs/compatibility.md b/docs/compatibility.md index 1382b1a9a1f..0c745069032 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -316,125 +316,102 @@ case. ## JSON -The JSON format read is an experimental feature which is expected to have some issues, so we disable -it by default. If you would like to test it, you need to enable `spark.rapids.sql.format.json.enabled` and -`spark.rapids.sql.format.json.read.enabled`. +JSON, despite being a standard format, has some ambiguity in it. Spark also offers the ability to allow +some invalid JSON to be parsed. We have tried to provide JSON parsing that is compatible with +what Apache Spark does support. Note that Spark itself has changed through different releases, and we will +try to call out which releases we offer different results for. JSON parsing is enabled by default +except for date and timestamp types where we still have work to complete. If you wish to disable +JSON Scan you can set `spark.rapids.sql.format.json.enabled` or +`spark.rapids.sql.format.json.read.enabled` to false. To disable `from_json` you can set +`spark.rapids.sql.expression.JsonToStructs` to false. -### Invalid JSON +### Limits -In Apache Spark on the CPU if a line in the JSON file is invalid the entire row is considered -invalid and will result in nulls being returned for all columns. It is considered invalid if it -violates the JSON specification, but with a few extensions. +In versions of Spark before 3.5.0 there is no maximum to how deeply nested JSON can be. After +3.5.0 this was updated to be 1,000 by default. The current GPU implementation of JSON Scan and +`from_json` limits this to 254 no matter what version of Spark is used. If the nesting level is +over this the JSON is considered invalid and all values will be returned as nulls. +`get_json_object` and `json_tuple` have a maximum nesting depth of 64. An exception is thrown if +the nesting depth goes over the maximum. 
- * Single quotes are allowed to quote strings and keys - * Unquoted values like NaN and Infinity can be parsed as floating point values - * Control characters do not need to be replaced with the corresponding escape sequences in a - quoted string. - * Garbage at the end of a row, if there is valid JSON at the beginning of the row, is ignored. +Spark 3.5.0 and above have limits on maximum string length 20,000,000 and maximum number length of +1,000. We do not have any of these limits on the GPU. -The GPU implementation does the same kinds of validations, but many of them are done on a per-column -basis, which, for example, means if a number is formatted incorrectly, it is likely only that value -will be considered invalid and return a null instead of nulls for the entire row. +We, like Spark, cannot support an JSON string that is larger than 2 GiB is size. -There are options that can be used to enable and disable many of these features which are mostly -listed below. +### JSON Validation -### JSON options +Spark supports the option `allowNonNumericNumbers`. Versions of Spark prior to 3.3.0 where inconsistent between +quoted and non-quoted values ([SPARK-38060](https://issues.apache.org/jira/browse/SPARK-38060)). The +GPU implementation is consistent with 3.3.0 and above. -Spark supports passing options to the JSON parser when reading a dataset. In most cases if the RAPIDS Accelerator -sees one of these options that it does not support it will fall back to the CPU. In some cases we do not. The -following options are documented below. +### JSON Floating Point Types -- `allowNumericLeadingZeros` - Allows leading zeros in numbers (e.g. 00012). By default this is set to false. - When it is false Spark considers the JSON invalid if it encounters this type of number. The RAPIDS - Accelerator supports validating columns that are returned to the user with this option on or off. - -- `allowUnquotedControlChars` - Allows JSON Strings to contain unquoted control characters (ASCII characters with - value less than 32, including tab and line feed characters) or not. By default this is set to false. If the schema - is provided while reading JSON file, then this flag has no impact on the RAPIDS Accelerator as it always allows - unquoted control characters but Spark sees these are invalid are returns nulls. However, if the schema is not provided - and this option is false, then RAPIDS Accelerator's behavior is same as Spark where an exception is thrown - as discussed in `JSON Schema discovery` section. - -- `allowNonNumericNumbers` - Allows `NaN` and `Infinity` values to be parsed (note that these are not valid numeric - values in the [JSON specification](https://json.org)). Spark versions prior to 3.3.0 have inconsistent behavior and will - parse some variants of `NaN` and `Infinity` even when this option is disabled - ([SPARK-38060](https://issues.apache.org/jira/browse/SPARK-38060)). The RAPIDS Accelerator behavior is consistent with - Spark version 3.3.0 and later. - -### Nesting -In versions of Spark before 3.5.0 there is no maximum to how deeply nested JSON can be. After -3.5.0 this was updated to be 1000 by default. The current GPU implementation limits this to 254 -no matter what version of Spark is used. If the nesting level is over this the JSON is considered -invalid and all values will be returned as nulls. - -Mixed types can have some problems. If an item being read could have some lines that are arrays -and others that are structs/dictionaries it is possible an error will be thrown. 
- -Dates and Timestamps have some issues and may return values for technically invalid inputs. - -Floating point numbers have issues generally like with the rest of Spark, and we can parse them into -a valid floating point number, but it might not match 100% with the way Spark does it. - -Strings are supported, but the data returned might not be normalized in the same way as the CPU -implementation. Generally this comes down to the GPU not modifying the input, whereas Spark will -do things like remove extra white space and parse numbers before turning them back into a string. +Parsing floating-point values has the same limitations as [casting from string to float](#string-to-float). -### JSON Floating Point +### JSON Integral Types -Parsing floating-point values has the same limitations as [casting from string to float](#string-to-float). +Versions of Spark prior to 3.3.0 would parse quoted integer values, like "1". But 3.3.0 and above consider +these to be invalid and will return `null` when parsed as an Integral types. The GPU implementation +follows 3.3.0 and above. -Prior to Spark 3.3.0, reading JSON strings such as `"+Infinity"` when specifying that the data type is `FloatType` -or `DoubleType` caused these values to be parsed even when `allowNonNumericNumbers` is set to false. Also, Spark -versions prior to 3.3.0 only supported the `"Infinity"` and `"-Infinity"` representations of infinity and did not -support `"+INF"`, `"-INF"`, or `"+Infinity"`, which Spark considers valid when unquoted. The GPU JSON reader is -consistent with the behavior in Spark 3.3.0 and later. +### JSON Decimal Types -Another limitation of the GPU JSON reader is that it will parse strings containing non-string boolean or numeric values where -Spark will treat them as invalid inputs and will just return `null`. +Spark supports parsing decimal types either formatted as floating point number or integral numbers, even if it is +in a quoted string. If it is in a quoted string the local of the JVM is used to determine the number format. +If the local is not for the `US`, which is the default we will fall back to the CPU because we do not currently +parse those numbers correctly. The `US` format removes all commas ',' from the quoted string. +As a part of this, though, non-arabic numbers are also supported. We do not support parsing these numbers +see (issue 10532)[https://github.com/NVIDIA/spark-rapids/issues/10532]. -### JSON Dates/Timestamps +### JSON Date/Timestamp Types Dates and timestamps are not supported by default in JSON parser, since the GPU implementation is not 100% compatible with Apache Spark. If needed, they can be turned on through the config `spark.rapids.sql.json.read.datetime.enabled`. -Once enabled, the JSON parser still does not support the `TimestampNTZ` type and will fall back to CPU -if `spark.sql.timestampType` is set to `TIMESTAMP_NTZ` or if an explicit schema is provided that -contains the `TimestampNTZ` type. +This config works for both JSON scan and `from_json`. Once enabled, the JSON parser still does +not support the `TimestampNTZ` type and will fall back to CPU if `spark.sql.timestampType` is set +to `TIMESTAMP_NTZ` or if an explicit schema is provided that contains the `TimestampNTZ` type. There is currently no support for reading numeric values as timestamps and null values are returned instead -([#4940](https://github.com/NVIDIA/spark-rapids/issues/4940)). A workaround would be to read as longs and then cast -to timestamp. 
+([#4940](https://github.com/NVIDIA/spark-rapids/issues/4940)). A workaround would be to read as longs and then cast to timestamp.
 
-### JSON Schema discovery
+### JSON Arrays and Structs with Overflowing Numbers
 
-Spark SQL can automatically infer the schema of a JSON dataset if schema is not provided explicitly. The CPU
-handles schema discovery and there is no GPU acceleration of this. By default Spark will read/parse the entire
-dataset to determine the schema. This means that some options/errors which are ignored by the GPU may still
-result in an exception if used with schema discovery.
+Spark is inconsistent between versions in how it handles numbers that overflow when they are nested in either an array
+or a non-top-level struct. In some versions only the value that overflowed is marked as null. In other versions the
+wrapping array or struct is marked as null. We currently only mark the individual value as null. This matches
+versions 3.4.2 and above of Spark for structs. Most versions of Spark invalidate the entire array if there
+is a single value within it that overflows.
 
-### `from_json` function
+### Duplicate Struct Names
 
-`JsonToStructs` of `from_json` is based on the same code as reading a JSON lines file. There are
-a few differences with it.
+The JSON specification technically allows for duplicate keys in a struct, but does not explain what to
+do with them. In the case of Spark, which value wins is inconsistent between operators, and for `get_json_object`
+it depends on the query being performed. We do not always match what Spark does. We do match it in many cases,
+but we consider this enough of a corner case that we have not tried to make it work in all cases.
 
-The `from_json` function is disabled by default because it is experimental and has some known
-incompatibilities with Spark, and can be enabled by setting
-`spark.rapids.sql.expression.JsonToStructs=true`. You don't need to set
-`spark.rapids.sql.format.json.enabled` and`spark.rapids.sql.format.json.read.enabled` to true.
-In addition, if the input schema contains date and/or timestamp types, an additional config
-`spark.rapids.sql.json.read.datetime.enabled` also needs to be set to `true` in order
-to enable this function on the GPU.
+We also do not support schemas where there are duplicate column names. We just fall back to the CPU for those cases.
 
-There is no schema discovery as a schema is required as input to `from_json`
+### JSON Normalization (String Types)
 
-In addition to `structs`, a top level `map` type is supported, but only if the key and value are
-strings.
+In versions of Spark prior to 4.0.0 input JSON strings were parsed to JSON tokens and then converted back to
+strings. This effectively normalizes the output string. So things like single quotes are transformed into double
+quotes, floating point numbers are parsed and converted back to strings, possibly changing the format, and
+escaped characters are converted back to their simplest form. We try to support this on the GPU as well. Single quotes
+will be converted to double quotes. Only `get_json_object` and `json_tuple` attempt to normalize floating point
+numbers. There is no implementation on the GPU right now that tries to normalize escape characters.
+
+### `from_json` Function
+
+`JsonToStructs` or `from_json` is based on the same code as reading a JSON lines file. There are
+a few differences with it, illustrated in the sketch below.
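+
+A minimal sketch of calling `from_json` with an explicit schema (assuming the RAPIDS Accelerator is active;
+the column names and sample data are made up, and `spark.rapids.sql.json.read.datetime.enabled` would
+additionally be needed for date/timestamp fields):
+
+```scala
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.from_json
+import org.apache.spark.sql.types.{IntegerType, MapType, StringType, StructField, StructType}
+
+val spark = SparkSession.builder().appName("from-json-sketch").getOrCreate()
+import spark.implicits._
+
+val df = Seq("""{"a": 1, "b": "x"}""").toDF("jsonStr")
+
+// A struct schema, just like JSON scan.
+val structSchema = StructType(Seq(StructField("a", IntegerType), StructField("b", StringType)))
+df.select(from_json($"jsonStr", structSchema).alias("parsed")).show()
+
+// A top-level MAP schema is also supported, but only with STRING keys and values.
+val mapSchema = MapType(StringType, StringType)
+df.select(from_json($"jsonStr", mapSchema).alias("asMap")).show()
+```
+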
-### `to_json` function +The main difference is that `from_json` supports parsing Maps and Arrays directly from a JSON column, whereas +JSON Scan only supports parsing top level structs. The GPU implementation of `from_json` has support for parsing +a `MAP` as a top level schema, but does not currently support arrays at the top level. -The `to_json` function is disabled by default because it is experimental and has some known incompatibilities -with Spark, and can be enabled by setting `spark.rapids.sql.expression.StructsToJson=true`. +### `to_json` Function Known issues are: @@ -442,7 +419,7 @@ Known issues are: produce `-4.1243574E26` but the GPU may produce `-4.124357351E26`. - Not all JSON options are respected -### get_json_object +### `get_json_object` Function Known issue: - [Floating-point number normalization error](https://github.com/NVIDIA/spark-rapids-jni/issues/1922). `get_json_object` floating-point number normalization on the GPU could sometimes return incorrect results if the string contains high-precision values, see the String to Float and Float to String section for more details. diff --git a/docs/supported_ops.md b/docs/supported_ops.md index 2fa11f8aa6e..acf7133af40 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -9279,7 +9279,7 @@ are limited. JsonToStructs `from_json` Returns a struct value with the given `jsonStr` and `schema` -This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case +None project jsonStr @@ -9320,7 +9320,7 @@ are limited. NS -PS
MAP only supports keys and values that are of STRING type;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types NULL, BINARY, CALENDAR, MAP, UDT, DAYTIME, YEARMONTH
+PS
MAP only supports keys and values that are of STRING type and is only supported at the top level;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types NULL, BINARY, CALENDAR, MAP, UDT, DAYTIME, YEARMONTH
PS
UTC is only supported TZ for child TIMESTAMP;
unsupported child types NULL, BINARY, CALENDAR, MAP, UDT, DAYTIME, YEARMONTH
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index bdeebaabbfc..45905f0b9e0 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -3780,7 +3780,8 @@ object GpuOverrides extends Logging { ExprChecks.projectOnly( TypeSig.STRUCT.nested(jsonStructReadTypes) + TypeSig.MAP.nested(TypeSig.STRING).withPsNote(TypeEnum.MAP, - "MAP only supports keys and values that are of STRING type"), + "MAP only supports keys and values that are of STRING type " + + "and is only supported at the top level"), (TypeSig.STRUCT + TypeSig.MAP + TypeSig.ARRAY).nested(TypeSig.all), Seq(ParamCheck("jsonStr", TypeSig.STRING, TypeSig.STRING))), (a, conf, p, r) => new UnaryExprMeta[JsonToStructs](a, conf, p, r) { @@ -3821,10 +3822,7 @@ object GpuOverrides extends Logging { override def convertToGpu(child: Expression): GpuExpression = // GPU implementation currently does not support duplicated json key names in input GpuJsonToStructs(a.schema, a.options, child, a.timeZoneId) - }).disabledByDefault("it is currently in beta and undergoes continuous enhancements."+ - " Please consult the "+ - "[compatibility documentation](../compatibility.md#json-supporting-types)"+ - " to determine whether you can enable this configuration for your use case"), + }), expr[StructsToJson]( "Converts structs to JSON text format", ExprChecks.projectOnly( diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index e279385be82..e22b8f53497 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -1348,12 +1348,12 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern") .doc("When set to true enables all json input and output acceleration. 
" + "(only input is currently supported anyways)") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val ENABLE_JSON_READ = conf("spark.rapids.sql.format.json.read.enabled") .doc("When set to true enables json input acceleration") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val ENABLE_READ_JSON_FLOATS = conf("spark.rapids.sql.json.read.float.enabled") .doc("JSON reading is not 100% compatible when reading floats.") diff --git a/tools/generated_files/320/supportedDataSource.csv b/tools/generated_files/320/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/320/supportedDataSource.csv +++ b/tools/generated_files/320/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/320/supportedExprs.csv b/tools/generated_files/320/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/320/supportedExprs.csv +++ b/tools/generated_files/320/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/321/supportedDataSource.csv b/tools/generated_files/321/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/321/supportedDataSource.csv +++ b/tools/generated_files/321/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/321/supportedExprs.csv b/tools/generated_files/321/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/321/supportedExprs.csv +++ b/tools/generated_files/321/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/321cdh/supportedDataSource.csv b/tools/generated_files/321cdh/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/321cdh/supportedDataSource.csv +++ b/tools/generated_files/321cdh/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/321cdh/supportedExprs.csv b/tools/generated_files/321cdh/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/321cdh/supportedExprs.csv +++ b/tools/generated_files/321cdh/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/322/supportedDataSource.csv b/tools/generated_files/322/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/322/supportedDataSource.csv +++ b/tools/generated_files/322/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/322/supportedExprs.csv b/tools/generated_files/322/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/322/supportedExprs.csv +++ b/tools/generated_files/322/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/323/supportedDataSource.csv b/tools/generated_files/323/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/323/supportedDataSource.csv +++ b/tools/generated_files/323/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/323/supportedExprs.csv b/tools/generated_files/323/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/323/supportedExprs.csv +++ b/tools/generated_files/323/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/324/supportedDataSource.csv b/tools/generated_files/324/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/324/supportedDataSource.csv +++ b/tools/generated_files/324/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/324/supportedExprs.csv b/tools/generated_files/324/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/324/supportedExprs.csv +++ b/tools/generated_files/324/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/330/supportedDataSource.csv b/tools/generated_files/330/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/330/supportedDataSource.csv +++ b/tools/generated_files/330/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/330/supportedExprs.csv b/tools/generated_files/330/supportedExprs.csv index fcea9c8cb40..0073281cb32 100644 --- a/tools/generated_files/330/supportedExprs.csv +++ b/tools/generated_files/330/supportedExprs.csv @@ -297,8 +297,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/330cdh/supportedDataSource.csv b/tools/generated_files/330cdh/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/330cdh/supportedDataSource.csv +++ b/tools/generated_files/330cdh/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/330cdh/supportedExprs.csv b/tools/generated_files/330cdh/supportedExprs.csv index fcea9c8cb40..0073281cb32 100644 --- a/tools/generated_files/330cdh/supportedExprs.csv +++ b/tools/generated_files/330cdh/supportedExprs.csv @@ -297,8 +297,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/331/supportedDataSource.csv b/tools/generated_files/331/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/331/supportedDataSource.csv +++ b/tools/generated_files/331/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/331/supportedExprs.csv b/tools/generated_files/331/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/331/supportedExprs.csv +++ b/tools/generated_files/331/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/332/supportedDataSource.csv b/tools/generated_files/332/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/332/supportedDataSource.csv +++ b/tools/generated_files/332/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/332/supportedExprs.csv b/tools/generated_files/332/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/332/supportedExprs.csv +++ b/tools/generated_files/332/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/332cdh/supportedDataSource.csv b/tools/generated_files/332cdh/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/332cdh/supportedDataSource.csv +++ b/tools/generated_files/332cdh/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/332cdh/supportedExprs.csv b/tools/generated_files/332cdh/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/332cdh/supportedExprs.csv +++ b/tools/generated_files/332cdh/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/333/supportedDataSource.csv b/tools/generated_files/333/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/333/supportedDataSource.csv +++ b/tools/generated_files/333/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/333/supportedExprs.csv b/tools/generated_files/333/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/333/supportedExprs.csv +++ b/tools/generated_files/333/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/334/supportedDataSource.csv b/tools/generated_files/334/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/334/supportedDataSource.csv +++ b/tools/generated_files/334/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/334/supportedExprs.csv b/tools/generated_files/334/supportedExprs.csv index 4eccb898337..f62af4c9513 100644 --- a/tools/generated_files/334/supportedExprs.csv +++ b/tools/generated_files/334/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/340/supportedDataSource.csv b/tools/generated_files/340/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/340/supportedDataSource.csv +++ b/tools/generated_files/340/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/340/supportedExprs.csv b/tools/generated_files/340/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/340/supportedExprs.csv +++ b/tools/generated_files/340/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/341/supportedDataSource.csv b/tools/generated_files/341/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/341/supportedDataSource.csv +++ b/tools/generated_files/341/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/341/supportedExprs.csv b/tools/generated_files/341/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/341/supportedExprs.csv +++ b/tools/generated_files/341/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/342/supportedDataSource.csv b/tools/generated_files/342/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/342/supportedDataSource.csv +++ b/tools/generated_files/342/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/342/supportedExprs.csv b/tools/generated_files/342/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/342/supportedExprs.csv +++ b/tools/generated_files/342/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/343/supportedDataSource.csv b/tools/generated_files/343/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/343/supportedDataSource.csv +++ b/tools/generated_files/343/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/343/supportedExprs.csv b/tools/generated_files/343/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/343/supportedExprs.csv +++ b/tools/generated_files/343/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/344/supportedDataSource.csv b/tools/generated_files/344/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/344/supportedDataSource.csv +++ b/tools/generated_files/344/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/344/supportedExprs.csv b/tools/generated_files/344/supportedExprs.csv index 80bc405b058..01a48b40249 100644 --- a/tools/generated_files/344/supportedExprs.csv +++ b/tools/generated_files/344/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/350/supportedDataSource.csv b/tools/generated_files/350/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/350/supportedDataSource.csv +++ b/tools/generated_files/350/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/350/supportedExprs.csv b/tools/generated_files/350/supportedExprs.csv index f45289388fc..4cbfc7c1c27 100644 --- a/tools/generated_files/350/supportedExprs.csv +++ b/tools/generated_files/350/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/351/supportedDataSource.csv b/tools/generated_files/351/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/351/supportedDataSource.csv +++ b/tools/generated_files/351/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/351/supportedExprs.csv b/tools/generated_files/351/supportedExprs.csv index f45289388fc..4cbfc7c1c27 100644 --- a/tools/generated_files/351/supportedExprs.csv +++ b/tools/generated_files/351/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/352/supportedDataSource.csv b/tools/generated_files/352/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/352/supportedDataSource.csv +++ b/tools/generated_files/352/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/352/supportedExprs.csv b/tools/generated_files/352/supportedExprs.csv index f45289388fc..4cbfc7c1c27 100644 --- a/tools/generated_files/352/supportedExprs.csv +++ b/tools/generated_files/352/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/353/supportedDataSource.csv b/tools/generated_files/353/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/353/supportedDataSource.csv +++ b/tools/generated_files/353/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/353/supportedExprs.csv b/tools/generated_files/353/supportedExprs.csv index f45289388fc..4cbfc7c1c27 100644 --- a/tools/generated_files/353/supportedExprs.csv +++ b/tools/generated_files/353/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/400/supportedDataSource.csv b/tools/generated_files/400/supportedDataSource.csv index 77f30cbe1de..82df521b39b 100644 --- a/tools/generated_files/400/supportedDataSource.csv +++ b/tools/generated_files/400/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,S,S diff --git a/tools/generated_files/400/supportedExprs.csv b/tools/generated_files/400/supportedExprs.csv index 890f959eab5..4cfa1020889 100644 --- a/tools/generated_files/400/supportedExprs.csv +++ b/tools/generated_files/400/supportedExprs.csv @@ -299,8 +299,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,S,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA diff --git a/tools/generated_files/supportedDataSource.csv b/tools/generated_files/supportedDataSource.csv index 2573406ec3b..2eae4ed00ce 100644 --- a/tools/generated_files/supportedDataSource.csv +++ b/tools/generated_files/supportedDataSource.csv @@ -6,7 +6,7 @@ Delta,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA HiveText,read,S,S,S,S,S,S,S,S,PS,S,S,NS,NS,NS,NS,NS,NS,NS,NS,NS HiveText,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Iceberg,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA -JSON,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO,CO +JSON,read,S,S,S,S,S,S,S,PS,PS,S,S,NA,NS,NA,PS,NS,PS,NS,NA,NA ORC,read,S,S,S,S,S,S,S,S,PS,S,S,NA,NS,NA,PS,PS,PS,NS,NA,NA ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,S,NA,S,NA,PS,PS,PS,NS,NA,NA diff --git a/tools/generated_files/supportedExprs.csv b/tools/generated_files/supportedExprs.csv index 808d8fb4df3..e4a4db760b0 100644 --- a/tools/generated_files/supportedExprs.csv +++ b/tools/generated_files/supportedExprs.csv @@ -288,8 +288,8 @@ IsNotNull,S,`isnotnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,P IsNotNull,S,`isnotnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA IsNull,S,`isnull`,None,project,input,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS IsNull,S,`isnull`,None,project,result,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA -JsonToStructs,NS,`from_json`,This is disabled by default because it is currently in beta and undergoes continuous enhancements. 
Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,jsonStr,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA +JsonToStructs,S,`from_json`,None,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NS,PS,PS,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,json,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,field,NA,NA,NA,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA JsonTuple,NS,`json_tuple`,This is disabled by default because Experimental feature that could be unstable or have performance issues.,project,result,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA From 938db2123f1bd7397d6270de0e2910cd94823098 Mon Sep 17 00:00:00 2001 From: MithunR Date: Mon, 25 Nov 2024 12:07:11 -0800 Subject: [PATCH 03/37] Fix aqe_test failures on [databricks] 14.3. (#11750) * Fix aqe_test failures on [databricks] 14.3. Fixes #11643. This commit fixes the AQE/DPP tests that were reported in #11643 to be failing on Databricks 14.3. This is the result of a deficient shim for GpuSubqueryBroadcastMeta being active for Databricks 14.3. The deficient shim errantly extended the non-Databricks base shim. This commit moves the commonality in Databricks shims to a common base class that is then customized for the changes in Databricks 14.3. Signed-off-by: MithunR --- integration_tests/src/main/python/aqe_test.py | 6 +- .../execution/GpuSubqueryBroadcastExec.scala | 6 +- .../execution/GpuSubqueryBroadcastMeta.scala | 102 ++------------- .../GpuSubqueryBroadcastMeta330DBBase.scala | 121 ++++++++++++++++++ .../execution/GpuSubqueryBroadcastMeta.scala | 3 +- .../execution/GpuSubqueryBroadcastMeta.scala | 35 +++++ 6 files changed, 171 insertions(+), 102 deletions(-) create mode 100644 sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta330DBBase.scala create mode 100644 sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala diff --git a/integration_tests/src/main/python/aqe_test.py b/integration_tests/src/main/python/aqe_test.py index f9dddfae038..5b3b04efdfb 100755 --- a/integration_tests/src/main/python/aqe_test.py +++ b/integration_tests/src/main/python/aqe_test.py @@ -338,10 +338,10 @@ def do_it(spark): # this should be fixed by https://github.com/NVIDIA/spark-rapids/issues/11120 aqe_join_with_dpp_fallback=["FilterExec"] if (is_databricks_runtime() or is_before_spark_330()) else [] +if is_databricks_version_or_later(14, 3): + aqe_join_with_dpp_fallback.append("CollectLimitExec") # Verify that DPP and AQE can coexist in even some odd cases involving multiple tables -@pytest.mark.skipif(condition=is_databricks_version_or_later(14, 3), - reason="https://github.com/NVIDIA/spark-rapids/issues/11643") @ignore_order(local=True) @allow_non_gpu(*aqe_join_with_dpp_fallback) def test_aqe_join_with_dpp(spark_tmp_path): @@ -395,8 +395,6 @@ def run_test(spark): assert_gpu_and_cpu_are_equal_collect(run_test, conf=_adaptive_conf) # Verify that DPP and AQE can coexist in even some odd cases involving 2 tables with multiple columns 
-@pytest.mark.skipif(condition=is_databricks_version_or_later(14, 3), - reason="https://github.com/NVIDIA/spark-rapids/issues/11643") @ignore_order(local=True) @allow_non_gpu(*aqe_join_with_dpp_fallback) def test_aqe_join_with_dpp_multi_columns(spark_tmp_path): diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastExec.scala index 72ed0e79504..e529e268f3f 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastExec.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastExec.scala @@ -126,8 +126,10 @@ abstract class GpuSubqueryBroadcastMetaBase( } else { willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") } - case _ => - throw new AssertionError("should not reach here") + + case unexpected => + throw new AssertionError("Unexpected child exec in AdaptiveSparkPlan: " + + s"${unexpected.getClass.getName}") } case _ => diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta.scala b/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta.scala index 76255b3e5a6..ae32800e77a 100644 --- a/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta.scala +++ b/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta.scala @@ -21,105 +21,19 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.execution -import com.nvidia.spark.rapids.{BaseExprMeta, DataFromReplacementRule, GpuExec, RapidsConf, RapidsMeta, SparkPlanMeta} +import com.nvidia.spark.rapids.{DataFromReplacementRule, GpuExec, RapidsConf, RapidsMeta} -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.physical.IdentityBroadcastMode -import org.apache.spark.sql.execution.{SparkPlan, SubqueryBroadcastExec} -import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec} -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec} -import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode +import org.apache.spark.sql.execution.SubqueryBroadcastExec -class GpuSubqueryBroadcastMeta( - s: SubqueryBroadcastExec, - conf: RapidsConf, - p: Option[RapidsMeta[_, _, _]], - r: DataFromReplacementRule) extends - SparkPlanMeta[SubqueryBroadcastExec](s, conf, p, r) { - private var broadcastBuilder: () => SparkPlan = _ - - override val childExprs: Seq[BaseExprMeta[_]] = Nil - - override val childPlans: Seq[SparkPlanMeta[SparkPlan]] = Nil - - override def tagPlanForGpu(): Unit = s.child match { - // DPP: For AQE off, in this case, we handle DPP by converting the underlying - // BroadcastExchangeExec to GpuBroadcastExchangeExec. - // This is slightly different from the Apache Spark case, because Spark - // sends the underlying plan into the plugin in advance via the PlanSubqueries rule. - // Here, we have the full non-GPU subquery plan, so we convert the whole - // thing. 
- case ex @ BroadcastExchangeExec(_, child) => - val exMeta = new GpuBroadcastMeta(ex.copy(child = child), conf, p, r) - exMeta.tagForGpu() - if (exMeta.canThisBeReplaced) { - broadcastBuilder = () => exMeta.convertToGpu() - } else { - willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") - } - // DPP: For AQE on, we have an almost completely different scenario then before, - // Databricks uses a BroadcastQueryStageExec and either: - // 1) provide an underlying BroadcastExchangeExec that we will have to convert - // somehow - // 2) might already do the reuse work for us. The ReusedExchange is now a - // part of the SubqueryBroadcast, so we send it back here as underlying the - // GpuSubqueryBroadcastExchangeExec - case bqse: BroadcastQueryStageExec => - bqse.plan match { - case ex: BroadcastExchangeExec => - val exMeta = new GpuBroadcastMeta(ex, conf, p, r) - exMeta.tagForGpu() - if (exMeta.canThisBeReplaced) { - broadcastBuilder = () => exMeta.convertToGpu() - } else { - willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") - } - case reuse: ReusedExchangeExec => - reuse.child match { - case _: GpuBroadcastExchangeExec => - // A BroadcastExchange has already been replaced, so it can run on the GPU - broadcastBuilder = () => reuse - case _ => - willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") - } - } - case _ => - willNotWorkOnGpu("the subquery to broadcast can not entirely run in the GPU.") - } - /** - * Simply returns the original plan. Because its only child, BroadcastExchange, doesn't - * need to change if SubqueryBroadcastExec falls back to the CPU. - */ - override def convertToCpu(): SparkPlan = s +class GpuSubqueryBroadcastMeta(s: SubqueryBroadcastExec, + conf: RapidsConf, + p: Option[RapidsMeta[_, _, _]], + r: DataFromReplacementRule) + extends GpuSubqueryBroadcastMeta330DBBase(s, conf, p, r) { override def convertToGpu(): GpuExec = { GpuSubqueryBroadcastExec(s.name, Seq(s.index), s.buildKeys, broadcastBuilder())( getBroadcastModeKeyExprs) } - /** Extract the broadcast mode key expressions if there are any. */ - private def getBroadcastModeKeyExprs: Option[Seq[Expression]] = { - val broadcastMode = s.child match { - case b: BroadcastExchangeExec => - b.mode - case bqse: BroadcastQueryStageExec => - bqse.plan match { - case b: BroadcastExchangeExec => - b.mode - case reuse: ReusedExchangeExec => - reuse.child match { - case g: GpuBroadcastExchangeExec => - g.mode - } - case _ => - throw new AssertionError("should not reach here") - } - } - - broadcastMode match { - case HashedRelationBroadcastMode(keys, _) => Some(keys) - case IdentityBroadcastMode => None - case m => throw new UnsupportedOperationException(s"Unknown broadcast mode $m") - } - } -} +} \ No newline at end of file diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta330DBBase.scala b/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta330DBBase.scala new file mode 100644 index 00000000000..a6248127bad --- /dev/null +++ b/sql-plugin/src/main/spark330db/scala/org/apache/spark/rapids/execution/GpuSubqueryBroadcastMeta330DBBase.scala @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "330db"} +{"spark": "332db"} +{"spark": "341db"} +{"spark": "350db143"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.execution + +import com.nvidia.spark.rapids.{BaseExprMeta, DataFromReplacementRule, RapidsConf, RapidsMeta, SparkPlanMeta} + +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.plans.physical.IdentityBroadcastMode +import org.apache.spark.sql.execution.{SparkPlan, SubqueryBroadcastExec} +import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec} +import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode + +abstract class GpuSubqueryBroadcastMeta330DBBase(s: SubqueryBroadcastExec, + conf: RapidsConf, + p: Option[RapidsMeta[_, _, _]], + r: DataFromReplacementRule) extends + SparkPlanMeta[SubqueryBroadcastExec](s, conf, p, r) { + protected var broadcastBuilder: () => SparkPlan = _ + + override val childExprs: Seq[BaseExprMeta[_]] = Nil + + override val childPlans: Seq[SparkPlanMeta[SparkPlan]] = Nil + + override def tagPlanForGpu(): Unit = s.child match { + // DPP: For AQE off, in this case, we handle DPP by converting the underlying + // BroadcastExchangeExec to GpuBroadcastExchangeExec. + // This is slightly different from the Apache Spark case, because Spark + // sends the underlying plan into the plugin in advance via the PlanSubqueries rule. + // Here, we have the full non-GPU subquery plan, so we convert the whole + // thing. + case ex @ BroadcastExchangeExec(_, child) => + val exMeta = new GpuBroadcastMeta(ex.copy(child = child), conf, p, r) + exMeta.tagForGpu() + if (exMeta.canThisBeReplaced) { + broadcastBuilder = () => exMeta.convertToGpu() + } else { + willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") + } + // DPP: For AQE on, we have an almost completely different scenario then before, + // Databricks uses a BroadcastQueryStageExec and either: + // 1) provide an underlying BroadcastExchangeExec that we will have to convert + // somehow + // 2) might already do the reuse work for us. 
The ReusedExchange is now a + // part of the SubqueryBroadcast, so we send it back here as underlying the + // GpuSubqueryBroadcastExchangeExec + case bqse: BroadcastQueryStageExec => + bqse.plan match { + case ex: BroadcastExchangeExec => + val exMeta = new GpuBroadcastMeta(ex, conf, p, r) + exMeta.tagForGpu() + if (exMeta.canThisBeReplaced) { + broadcastBuilder = () => exMeta.convertToGpu() + } else { + willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") + } + case reuse: ReusedExchangeExec => + reuse.child match { + case _: GpuBroadcastExchangeExec => + // A BroadcastExchange has already been replaced, so it can run on the GPU + broadcastBuilder = () => reuse + case _ => + willNotWorkOnGpu("underlying BroadcastExchange can not run in the GPU.") + } + } + case _ => + willNotWorkOnGpu("the subquery to broadcast can not entirely run in the GPU.") + } + /** + * Simply returns the original plan. Because its only child, BroadcastExchange, doesn't + * need to change if SubqueryBroadcastExec falls back to the CPU. + */ + override def convertToCpu(): SparkPlan = s + + /** Extract the broadcast mode key expressions if there are any. */ + protected def getBroadcastModeKeyExprs: Option[Seq[Expression]] = { + val broadcastMode = s.child match { + case b: BroadcastExchangeExec => + b.mode + case bqse: BroadcastQueryStageExec => + bqse.plan match { + case b: BroadcastExchangeExec => + b.mode + case reuse: ReusedExchangeExec => + reuse.child match { + case g: GpuBroadcastExchangeExec => + g.mode + } + case _ => + throw new AssertionError("should not reach here") + } + } + + broadcastMode match { + case HashedRelationBroadcastMode(keys, _) => Some(keys) + case IdentityBroadcastMode => None + case m => throw new UnsupportedOperationException(s"Unknown broadcast mode $m") + } + } +} + diff --git a/sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala b/sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala index 2f362531646..10e3fa68b76 100644 --- a/sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala +++ b/sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala @@ -15,7 +15,6 @@ */ /*** spark-rapids-shim-json-lines {"spark": "350db143"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.execution @@ -28,7 +27,7 @@ class GpuSubqueryBroadcastMeta( conf: RapidsConf, p: Option[RapidsMeta[_, _, _]], r: DataFromReplacementRule) extends - GpuSubqueryBroadcastMetaBase(s, conf, p, r) { + GpuSubqueryBroadcastMeta330DBBase(s, conf, p, r) { override def convertToGpu(): GpuExec = { GpuSubqueryBroadcastExec(s.name, s.indices, s.buildKeys, broadcastBuilder())( getBroadcastModeKeyExprs) diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala new file mode 100644 index 00000000000..c16564f523e --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/GpuSubqueryBroadcastMeta.scala @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.execution + +import com.nvidia.spark.rapids.{DataFromReplacementRule, GpuExec, RapidsConf, RapidsMeta} + +import org.apache.spark.sql.execution.SubqueryBroadcastExec + +class GpuSubqueryBroadcastMeta( + s: SubqueryBroadcastExec, + conf: RapidsConf, + p: Option[RapidsMeta[_, _, _]], + r: DataFromReplacementRule) extends + GpuSubqueryBroadcastMetaBase(s, conf, p, r) { + override def convertToGpu(): GpuExec = { + GpuSubqueryBroadcastExec(s.name, s.indices, s.buildKeys, broadcastBuilder())( + getBroadcastModeKeyExprs) + } +} From 6b90b2fffb9035921fab6cd105469645c09a7b4d Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 25 Nov 2024 14:55:44 -0800 Subject: [PATCH 04/37] Add support for asynchronous writing for parquet (#11730) * Support async writing for query output Signed-off-by: Jihoon Son * doc change * use a long timeout * fix test failure due to a race * fix flaky test * address comments * fix the config name for hold gpu * Update sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala Simplify case arm Co-authored-by: Gera Shegalov * address comments * missing doc change * use trampoline --------- Signed-off-by: Jihoon Son Co-authored-by: Gera Shegalov --- .../spark/rapids/ColumnarOutputWriter.scala | 36 +++- .../spark/rapids/GpuParquetFileFormat.scala | 13 +- .../com/nvidia/spark/rapids/Plugin.scala | 3 + .../com/nvidia/spark/rapids/RapidsConf.scala | 35 ++++ .../rapids/io/async/AsyncOutputStream.scala | 186 ++++++++++++++++++ .../rapids/io/async/ThrottlingExecutor.scala | 43 ++++ .../rapids/io/async/TrafficController.scala | 142 +++++++++++++ .../io/async/AsyncOutputStreamSuite.scala | 162 +++++++++++++++ .../io/async/ThrottlingExecutorSuite.scala | 145 ++++++++++++++ .../io/async/TrafficControllerSuite.scala | 101 ++++++++++ 10 files changed, 855 insertions(+), 11 deletions(-) create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutor.scala create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/TrafficController.scala create mode 100644 sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStreamSuite.scala create mode 100644 sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutorSuite.scala create mode 100644 sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/TrafficControllerSuite.scala diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnarOutputWriter.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnarOutputWriter.scala index 69157c046b6..df62683d346 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnarOutputWriter.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnarOutputWriter.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,11 +25,13 @@ import com.nvidia.spark.Retryable import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRestoreOnRetry, withRetry, withRetryNoSplit} +import com.nvidia.spark.rapids.io.async.{AsyncOutputStream, TrafficController} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FSDataOutputStream, Path} +import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.TaskContext +import org.apache.spark.internal.Logging import org.apache.spark.sql.rapids.{ColumnarWriteTaskStatsTracker, GpuWriteTaskStatsTracker} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -70,21 +72,31 @@ abstract class ColumnarOutputWriterFactory extends Serializable { abstract class ColumnarOutputWriter(context: TaskAttemptContext, dataSchema: StructType, rangeName: String, - includeRetry: Boolean) extends HostBufferConsumer { + includeRetry: Boolean, + holdGpuBetweenBatches: Boolean = false) extends HostBufferConsumer with Logging { protected val tableWriter: TableWriter protected val conf: Configuration = context.getConfiguration - // This is implemented as a method to make it easier to subclass - // ColumnarOutputWriter in the tests, and override this behavior. - protected def getOutputStream: FSDataOutputStream = { + private val trafficController: Option[TrafficController] = TrafficController.getInstance + + private def openOutputStream(): OutputStream = { val hadoopPath = new Path(path) val fs = hadoopPath.getFileSystem(conf) fs.create(hadoopPath, false) } - protected val outputStream: FSDataOutputStream = getOutputStream + // This is implemented as a method to make it easier to subclass + // ColumnarOutputWriter in the tests, and override this behavior. + protected def getOutputStream: OutputStream = { + trafficController.map(controller => { + logWarning("Async output write enabled") + new AsyncOutputStream(() => openOutputStream(), controller) + }).getOrElse(openOutputStream()) + } + + protected val outputStream: OutputStream = getOutputStream private[this] val tempBuffer = new Array[Byte](128 * 1024) private[this] var anythingWritten = false @@ -166,7 +178,11 @@ abstract class ColumnarOutputWriter(context: TaskAttemptContext, } // we successfully buffered to host memory, release the semaphore and write // the buffered data to the FS - GpuSemaphore.releaseIfNecessary(TaskContext.get) + if (!holdGpuBetweenBatches) { + logDebug("Releasing semaphore between batches") + GpuSemaphore.releaseIfNecessary(TaskContext.get) + } + writeBufferedData() updateStatistics(writeStartTime, gpuTime, statsTrackers) spillableBatch.numRows() @@ -202,6 +218,10 @@ abstract class ColumnarOutputWriter(context: TaskAttemptContext, // buffer an empty batch on close() to work around issues in cuDF // where corrupt files can be written if nothing is encoded via the writer. anythingWritten = true + + // tableWriter.write() serializes the table into the HostMemoryBuffer, and buffers it + // by calling handleBuffer() on the ColumnarOutputWriter. It may not write to the + // output stream just yet. 
tableWriter.write(table) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala index 25105386b3d..2b5f246e56a 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala @@ -271,13 +271,19 @@ class GpuParquetFileFormat extends ColumnarFileFormat with Logging { s"Set Parquet option ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE.") } + val asyncOutputWriteEnabled = RapidsConf.ENABLE_ASYNC_OUTPUT_WRITE.get(sqlConf) + // holdGpuBetweenBatches is on by default if asyncOutputWriteEnabled is on + val holdGpuBetweenBatches = RapidsConf.ASYNC_QUERY_OUTPUT_WRITE_HOLD_GPU_IN_TASK.get(sqlConf) + .getOrElse(asyncOutputWriteEnabled) + new ColumnarOutputWriterFactory { override def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): ColumnarOutputWriter = { new GpuParquetWriter(path, dataSchema, compressionType, outputTimestampType.toString, - dateTimeRebaseMode, timestampRebaseMode, context, parquetFieldIdWriteEnabled) + dateTimeRebaseMode, timestampRebaseMode, context, parquetFieldIdWriteEnabled, + holdGpuBetweenBatches) } override def getFileExtension(context: TaskAttemptContext): String = { @@ -299,8 +305,9 @@ class GpuParquetWriter( dateRebaseMode: DateTimeRebaseMode, timestampRebaseMode: DateTimeRebaseMode, context: TaskAttemptContext, - parquetFieldIdEnabled: Boolean) - extends ColumnarOutputWriter(context, dataSchema, "Parquet", true) { + parquetFieldIdEnabled: Boolean, + holdGpuBetweenBatches: Boolean) + extends ColumnarOutputWriter(context, dataSchema, "Parquet", true, holdGpuBetweenBatches) { override def throwIfRebaseNeededInExceptionMode(batch: ColumnarBatch): Unit = { val cols = GpuColumnVector.extractBases(batch) cols.foreach { col => diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index e20b21da520..5127c7899a8 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -31,6 +31,7 @@ import com.nvidia.spark.DFUDFPlugin import com.nvidia.spark.rapids.RapidsConf.AllowMultipleJars import com.nvidia.spark.rapids.RapidsPluginUtils.buildInfoEvent import com.nvidia.spark.rapids.filecache.{FileCache, FileCacheLocalityManager, FileCacheLocalityMsg} +import com.nvidia.spark.rapids.io.async.TrafficController import com.nvidia.spark.rapids.jni.GpuTimeZoneDB import com.nvidia.spark.rapids.python.PythonWorkerSemaphore import org.apache.commons.lang3.exception.ExceptionUtils @@ -554,6 +555,7 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { extraExecutorPlugins.foreach(_.init(pluginContext, extraConf)) GpuSemaphore.initialize() FileCache.init(pluginContext) + TrafficController.initialize(conf) } catch { // Exceptions in executor plugin can cause a single thread to die but the executor process // sticks around without any useful info until it hearbeat times out. 
Print what happened @@ -656,6 +658,7 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { extraExecutorPlugins.foreach(_.shutdown()) FileCache.shutdown() GpuCoreDumpHandler.shutdown() + TrafficController.shutdown() } override def onTaskFailed(failureReason: TaskFailedReason): Unit = { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index e22b8f53497..ab7a788d205 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -2406,6 +2406,36 @@ val SHUFFLE_COMPRESSION_LZ4_CHUNK_SIZE = conf("spark.rapids.shuffle.compression. .booleanConf .createWithDefault(false) + val ENABLE_ASYNC_OUTPUT_WRITE = + conf("spark.rapids.sql.asyncWrite.queryOutput.enabled") + .doc("Option to turn on the async query output write. During the final output write, the " + + "task first copies the output to the host memory, and then writes it into the storage. " + + "When this option is enabled, the task will asynchronously write the output in the host " + + "memory to the storage. Only the Parquet format is supported currently.") + .internal() + .booleanConf + .createWithDefault(false) + + val ASYNC_QUERY_OUTPUT_WRITE_HOLD_GPU_IN_TASK = + conf("spark.rapids.sql.queryOutput.holdGpuInTask") + .doc("Option to hold GPU semaphore between batch processing during the final output write. " + + "This option could degrade query performance if it is enabled without the async query " + + "output write. It is recommended to consider enabling this option only when " + + s"${ENABLE_ASYNC_OUTPUT_WRITE.key} is set. This option is off by default when the async " + + "query output write is disabled; otherwise, it is on.") + .internal() + .booleanConf + .createOptional + + val ASYNC_WRITE_MAX_IN_FLIGHT_HOST_MEMORY_BYTES = + conf("spark.rapids.sql.asyncWrite.maxInFlightHostMemoryBytes") + .doc("Maximum number of host memory bytes per executor that can be in-flight for async " + + "query output write. Tasks may be blocked if the total host memory bytes in-flight " + + "exceeds this value.") + .internal() + .bytesConf(ByteUnit.BYTE) + .createWithDefault(2L * 1024 * 1024 * 1024) + private def printSectionHeader(category: String): Unit = println(s"\n### $category") @@ -2663,6 +2693,9 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val isFoldableNonLitAllowed: Boolean = get(FOLDABLE_NON_LIT_ALLOWED) + lazy val asyncWriteMaxInFlightHostMemoryBytes: Long = + get(ASYNC_WRITE_MAX_IN_FLIGHT_HOST_MEMORY_BYTES) + /** * Convert a string value to the injection configuration OomInjection. * @@ -3248,6 +3281,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val caseWhenFuseEnabled: Boolean = get(CASE_WHEN_FUSE) + lazy val isAsyncOutputWriteEnabled: Boolean = get(ENABLE_ASYNC_OUTPUT_WRITE) + private val optimizerDefaults = Map( // this is not accurate because CPU projections do have a cost due to appending values // to each row that is produced, but this needs to be a really small number because diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala new file mode 100644 index 00000000000..40904a96dd2 --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStream.scala @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.io.{IOException, OutputStream} +import java.util.concurrent.{Callable, TimeUnit} +import java.util.concurrent.atomic.{AtomicLong, AtomicReference} + +import com.nvidia.spark.rapids.RapidsPluginImplicits._ + +import org.apache.spark.sql.rapids.execution.TrampolineUtil + +/** + * OutputStream that performs writes asynchronously. Writes are scheduled on a background thread + * and executed in the order they were scheduled. This class is not thread-safe and should only be + * used by a single thread. + */ +class AsyncOutputStream(openFn: Callable[OutputStream], trafficController: TrafficController) + extends OutputStream { + + private var closed = false + + private val executor = new ThrottlingExecutor( + TrampolineUtil.newDaemonCachedThreadPool("AsyncOutputStream", 1, 1), + trafficController) + + // Open the underlying stream asynchronously as soon as the AsyncOutputStream is constructed, + // so that the open can be done in parallel with other operations. This could help with + // performance if the open is slow. + private val openFuture = executor.submit(openFn, 0) + // Let's give it enough time to open the stream. Something bad should have happened if it + // takes more than 5 minutes to open a stream. + private val openTimeoutMin = 5 + + private lazy val delegate: OutputStream = { + openFuture.get(openTimeoutMin, TimeUnit.MINUTES) + } + + class Metrics { + var numBytesScheduled: Long = 0 + // This is thread-safe as it is updated by the background thread and can be read by + // any threads. + val numBytesWritten: AtomicLong = new AtomicLong(0) + } + + val metrics = new Metrics + + /** + * The last error that occurred in the background thread, or None if no error occurred. + * Once it is set, all subsequent writes that are already scheduled will fail and no new + * writes will be accepted. + * + * This is thread-safe as it is set by the background thread and can be read by any threads. 
+ */ + val lastError: AtomicReference[Option[Throwable]] = + new AtomicReference[Option[Throwable]](None) + + @throws[IOException] + private def throwIfError(): Unit = { + lastError.get() match { + case Some(t: IOException) => throw t + case Some(t) => throw new IOException(t) + case None => + } + } + + @throws[IOException] + private def ensureOpen(): Unit = { + if (closed) { + throw new IOException("Stream closed") + } + } + + private def scheduleWrite(fn: () => Unit, bytesToWrite: Int): Unit = { + throwIfError() + ensureOpen() + + metrics.numBytesScheduled += bytesToWrite + executor.submit(() => { + throwIfError() + ensureOpen() + + try { + fn() + metrics.numBytesWritten.addAndGet(bytesToWrite) + } catch { + case t: Throwable => + // Update the error state + lastError.set(Some(t)) + } + }, bytesToWrite) + } + + override def write(b: Int): Unit = { + scheduleWrite(() => delegate.write(b), 1) + } + + override def write(b: Array[Byte]): Unit = { + scheduleWrite(() => delegate.write(b), b.length) + } + + /** + * Schedules a write of the given bytes to the underlying stream. The write is executed + * asynchronously on a background thread. The method returns immediately, and the write may not + * have completed when the method returns. + * + * If an error has occurred in the background thread and [[lastError]] has been set, this function + * will throw an IOException immediately. + * + * If an error has occurred in the background thread while executing a previous write after the + * current write has been scheduled, the current write will fail with the same error. + */ + @throws[IOException] + override def write(b: Array[Byte], off: Int, len: Int): Unit = { + scheduleWrite(() => delegate.write(b, off, len), len) + } + + /** + * Flushes all pending writes to the underlying stream. This method blocks until all pending + * writes have been completed. If an error has occurred in the background thread, this method + * will throw an IOException. + * + * If an error has occurred in the background thread and [[lastError]] has been set, this function + * will throw an IOException immediately. + * + * If an error has occurred in the background thread while executing a previous task after the + * current flush has been scheduled, the current flush will fail with the same error. + */ + @throws[IOException] + override def flush(): Unit = { + throwIfError() + ensureOpen() + + val f = executor.submit(() => { + throwIfError() + ensureOpen() + + delegate.flush() + }, 0) + + f.get() + } + + /** + * Closes the underlying stream and releases any resources associated with it. All pending writes + * are flushed before closing the stream. This method blocks until all pending writes have been + * completed. + * + * If an error has occurred while flushing, this function will throw an IOException. + * + * If an error has occurred while executing a previous task before this function is called, + * this function will throw the same error. All resources and the underlying stream are still + * guaranteed to be closed. + */ + @throws[IOException] + override def close(): Unit = { + if (!closed) { + Seq[AutoCloseable]( + () => { + // Wait for all pending writes to complete + // This will throw an exception if one of the writes fails + flush() + }, + () => { + // Give the executor a chance to shutdown gracefully. 
+ executor.shutdownNow(10, TimeUnit.SECONDS) + }, + delegate, + () => closed = true).safeClose() + } + } +} diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutor.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutor.scala new file mode 100644 index 00000000000..45889bf89ac --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutor.scala @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.util.concurrent.{Callable, ExecutorService, Future, TimeUnit} + +/** + * Thin wrapper around an ExecutorService that adds throttling. + */ +class ThrottlingExecutor( + val executor: ExecutorService, throttler: TrafficController) { + + def submit[T](callable: Callable[T], hostMemoryBytes: Long): Future[T] = { + val task = new Task[T](hostMemoryBytes, callable) + throttler.blockUntilRunnable(task) + executor.submit(() => { + try { + task.call() + } finally { + throttler.taskCompleted(task) + } + }) + } + + def shutdownNow(timeout: Long, timeUnit: TimeUnit): Unit = { + executor.shutdownNow() + executor.awaitTermination(timeout, timeUnit) + } +} diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/TrafficController.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/TrafficController.scala new file mode 100644 index 00000000000..0110f2d89ca --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/io/async/TrafficController.scala @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.util.concurrent.Callable +import javax.annotation.concurrent.GuardedBy + +import com.nvidia.spark.rapids.RapidsConf + +/** + * Simple wrapper around a [[Callable]] that also keeps track of the host memory bytes used by + * the task. + * + * Note: we may want to add more metadata to the task in the future, such as the device memory, + * as we implement more throttling strategies. + */ +class Task[T](val hostMemoryBytes: Long, callable: Callable[T]) extends Callable[T] { + override def call(): T = callable.call() +} + +/** + * Throttle interface to be implemented by different throttling strategies. + * + * Currently, only HostMemoryThrottle is implemented, which limits the maximum in-flight host + * memory bytes. 
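The two classes above are designed to compose: a single writer thread calls `write`/`flush`/`close` on `AsyncOutputStream`, while `ThrottlingExecutor` and a `TrafficController` bound how many bytes are queued behind it. A minimal usage sketch, illustration only and not part of the patch: it assumes the classes added above, a throwaway local file path, and that the object lives in the same `io.async` package (as the test suites do) so the protected `TrafficController` constructor is accessible.

```scala
package com.nvidia.spark.rapids.io.async

import java.io.{BufferedOutputStream, FileOutputStream}

object AsyncWriteSketch {
  def main(args: Array[String]): Unit = {
    // Allow roughly 64 MB of queued write data before the writer thread blocks.
    val controller = new TrafficController(new HostMemoryThrottle(64L * 1024 * 1024))

    // The open callable runs on the background thread, so construction returns quickly
    // even if opening the underlying stream is slow.
    val out = new AsyncOutputStream(
      () => new BufferedOutputStream(new FileOutputStream("/tmp/async-write-sketch.bin")),
      controller)

    val chunk = new Array[Byte](1 << 20)
    try {
      // Each call only schedules the write; TrafficController blocks the caller once the
      // in-flight bytes would exceed the throttle limit.
      (0 until 16).foreach(_ => out.write(chunk))
    } finally {
      // close() flushes pending writes and rethrows any background failure as an IOException.
      out.close()
    }
  }
}
```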
In the future, we can add more throttling strategies, such as limiting the + * device memory usage, the number of tasks, etc. + */ +trait Throttle { + + /** + * Returns true if the task can be accepted, false otherwise. + * TrafficController will block the task from being scheduled until this method returns true. + */ + def canAccept[T](task: Task[T]): Boolean + + /** + * Callback to be called when a task is scheduled. + */ + def taskScheduled[T](task: Task[T]): Unit + + /** + * Callback to be called when a task is completed, either successfully or with an exception. + */ + def taskCompleted[T](task: Task[T]): Unit +} + +/** + * Throttle implementation that limits the total host memory used by the in-flight tasks. + */ +class HostMemoryThrottle(val maxInFlightHostMemoryBytes: Long) extends Throttle { + private var totalHostMemoryBytes: Long = 0 + + override def canAccept[T](task: Task[T]): Boolean = { + totalHostMemoryBytes + task.hostMemoryBytes <= maxInFlightHostMemoryBytes + } + + override def taskScheduled[T](task: Task[T]): Unit = { + totalHostMemoryBytes += task.hostMemoryBytes + } + + override def taskCompleted[T](task: Task[T]): Unit = { + totalHostMemoryBytes -= task.hostMemoryBytes + } + + def getTotalHostMemoryBytes: Long = totalHostMemoryBytes +} + +/** + * TrafficController is responsible for blocking tasks from being scheduled when the throttle + * is exceeded. It also keeps track of the number of tasks that are currently scheduled. + * + * This class is thread-safe as it is used by multiple tasks. + */ +class TrafficController protected[rapids] (throttle: Throttle) { + + @GuardedBy("this") + private var numTasks: Int = 0 + + /** + * Blocks the task from being scheduled until the throttle allows it. If there is no task + * currently scheduled, the task is scheduled immediately even if the throttle is exceeded. + */ + def blockUntilRunnable[T](task: Task[T]): Unit = synchronized { + if (numTasks > 0) { + while (!throttle.canAccept(task)) { + wait(100) + } + } + numTasks += 1 + throttle.taskScheduled(task) + } + + def taskCompleted[T](task: Task[T]): Unit = synchronized { + numTasks -= 1 + throttle.taskCompleted(task) + notify() + } + + def numScheduledTasks: Int = synchronized { + numTasks + } +} + +object TrafficController { + + private var instance: TrafficController = _ + + /** + * Initializes the TrafficController singleton instance. + * This is called once per executor. + */ + def initialize(conf: RapidsConf): Unit = synchronized { + if (conf.isAsyncOutputWriteEnabled && instance == null) { + instance = new TrafficController( + new HostMemoryThrottle(conf.asyncWriteMaxInFlightHostMemoryBytes)) + } + } + + def getInstance: Option[TrafficController] = synchronized { + Option(instance) + } + + def shutdown(): Unit = synchronized { + if (instance != null) { + instance = null + } + } +} diff --git a/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStreamSuite.scala b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStreamSuite.scala new file mode 100644 index 00000000000..a4fa35349ce --- /dev/null +++ b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/AsyncOutputStreamSuite.scala @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
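A small illustration of the admission rule in `TrafficController.blockUntilRunnable`, assuming the classes from this patch and the same package placement as above; the commented values follow directly from the code.

```scala
package com.nvidia.spark.rapids.io.async

object ThrottleSketch {
  def main(args: Array[String]): Unit = {
    val throttle = new HostMemoryThrottle(100)
    val controller = new TrafficController(throttle)

    // A task heavier than the limit is still admitted while nothing else is in flight,
    // so a single oversized write cannot deadlock the caller.
    val big = new Task[Unit](150, () => ())
    controller.blockUntilRunnable(big)         // returns immediately
    println(throttle.getTotalHostMemoryBytes)  // 150

    // A second task would block inside blockUntilRunnable here until `big` completes
    // on some other thread and taskCompleted releases its bytes.
    controller.taskCompleted(big)
    println(controller.numScheduledTasks)      // 0
  }
}
```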
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.io.{BufferedOutputStream, File, FileOutputStream, IOException, OutputStream} +import java.util.concurrent.Callable + +import com.nvidia.spark.rapids.Arm.withResource +import org.scalatest.BeforeAndAfterEach +import org.scalatest.funsuite.AnyFunSuite + +class AsyncOutputStreamSuite extends AnyFunSuite with BeforeAndAfterEach { + + private val bufLen = 128 * 1024 + private val buf: Array[Byte] = new Array[Byte](bufLen) + private val maxBufCount = 10 + private val trafficController = new TrafficController( + new HostMemoryThrottle(bufLen * maxBufCount)) + + def openStream(): AsyncOutputStream = { + new AsyncOutputStream(() => { + val file = File.createTempFile("async-write-test", "tmp") + new BufferedOutputStream(new FileOutputStream(file)) + }, trafficController) + } + + test("open, write, and close") { + val numBufs = 1000 + val stream = openStream() + withResource(stream) { os => + for (_ <- 0 until numBufs) { + os.write(buf) + } + } + assertResult(bufLen * numBufs)(stream.metrics.numBytesScheduled) + assertResult(bufLen * numBufs)(stream.metrics.numBytesWritten.get()) + } + + test("write after closed") { + val os = openStream() + os.close() + assertThrows[IOException] { + os.write(buf) + } + } + + test("flush after closed") { + val os = openStream() + os.close() + assertThrows[IOException] { + os.flush() + } + } + + class ThrowingOutputStream extends OutputStream { + + var failureCount = 0 + + override def write(i: Int): Unit = { + failureCount += 1 + throw new IOException(s"Failed ${failureCount} times") + } + + override def write(b: Array[Byte], off: Int, len: Int): Unit = { + failureCount += 1 + throw new IOException(s"Failed ${failureCount} times") + } + } + + def assertThrowsWithMsg[T](fn: Callable[T], clue: String, + expectedMsgPrefix: String): Unit = { + withClue(clue) { + try { + fn.call() + } catch { + case t: Throwable => + assertIOExceptionMsg(t, expectedMsgPrefix) + } + } + } + + def assertIOExceptionMsg(t: Throwable, expectedMsgPrefix: String): Unit = { + if (t.getClass.isAssignableFrom(classOf[IOException])) { + if (!t.getMessage.contains(expectedMsgPrefix)) { + fail(s"Unexpected exception message: ${t.getMessage}") + } + } else { + if (t.getCause != null) { + assertIOExceptionMsg(t.getCause, expectedMsgPrefix) + } else { + fail(s"Unexpected exception: $t") + } + } + } + + test("write after error") { + val os = new AsyncOutputStream(() => new ThrowingOutputStream, trafficController) + + // The first call to `write` should succeed + os.write(buf) + + // Wait for the first write to fail + while (os.lastError.get().isEmpty) { + Thread.sleep(100) + } + + // The second `write` call should fail with the exception thrown by the first write failure + assertThrowsWithMsg(() => os.write(buf), + "The second write should fail with the exception thrown by the first write failure", + "Failed 1 times") + + // `close` throws the same exception + assertThrowsWithMsg(() => os.close(), + "The second write should fail with the exception thrown by the first write failure", + "Failed 1 times") + + 
assertResult(bufLen)(os.metrics.numBytesScheduled) + assertResult(0)(os.metrics.numBytesWritten.get()) + assert(os.lastError.get().get.isInstanceOf[IOException]) + } + + test("flush after error") { + val os = new AsyncOutputStream(() => new ThrowingOutputStream, trafficController) + + // The first write should succeed + os.write(buf) + + // The flush should fail with the exception thrown by the write failure + assertThrowsWithMsg(() => os.flush(), + "The flush should fail with the exception thrown by the write failure", + "Failed 1 times") + + // `close` throws the same exception + assertThrowsWithMsg(() => os.close(), + "The flush should fail with the exception thrown by the write failure", + "Failed 1 times") + } + + test("close after error") { + val os = new AsyncOutputStream(() => new ThrowingOutputStream, trafficController) + + os.write(buf) + + assertThrowsWithMsg(() => os.close(), + "Close should fail with the exception thrown by the write failure", + "Failed 1 times") + } +} diff --git a/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutorSuite.scala b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutorSuite.scala new file mode 100644 index 00000000000..a8acf240878 --- /dev/null +++ b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/ThrottlingExecutorSuite.scala @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.util.concurrent.{Callable, CountDownLatch, ExecutionException, Executors, Future, RejectedExecutionException, TimeUnit} + +import org.scalatest.BeforeAndAfterEach +import org.scalatest.funsuite.AnyFunSuite + +class ThrottlingExecutorSuite extends AnyFunSuite with BeforeAndAfterEach { + + // Some tests might take longer than usual in the limited CI environment. + // Use a long timeout to avoid flakiness. 
+ val longTimeoutSec = 5 + + var throttle: HostMemoryThrottle = _ + var trafficController: TrafficController = _ + var executor: ThrottlingExecutor = _ + + class TestTask extends Callable[Unit] { + val latch = new CountDownLatch(1) + override def call(): Unit = { + latch.await() + } + } + + override def beforeEach(): Unit = { + throttle = new HostMemoryThrottle(100) + trafficController = new TrafficController(throttle) + executor = new ThrottlingExecutor( + Executors.newSingleThreadExecutor(), + trafficController + ) + } + + override def afterEach(): Unit = { + executor.shutdownNow(longTimeoutSec, TimeUnit.SECONDS) + } + + test("tasks submitted should update the state") { + val task1 = new TestTask + val future1 = executor.submit(task1, 10) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(10)(throttle.getTotalHostMemoryBytes) + + val task2 = new TestTask + val future2 = executor.submit(task2, 20) + assertResult(2)(trafficController.numScheduledTasks) + assertResult(30)(throttle.getTotalHostMemoryBytes) + + task1.latch.countDown() + future1.get(longTimeoutSec, TimeUnit.SECONDS) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(20)(throttle.getTotalHostMemoryBytes) + + task2.latch.countDown() + future2.get(longTimeoutSec, TimeUnit.SECONDS) + assertResult(0)(trafficController.numScheduledTasks) + assertResult(0)(throttle.getTotalHostMemoryBytes) + } + + test("tasks submission fails if total weight exceeds maxWeight") { + val task1 = new TestTask + val future1 = executor.submit(task1, 10) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(10)(throttle.getTotalHostMemoryBytes) + + val task2 = new TestTask + val task2Weight = 100 + val exec = Executors.newSingleThreadExecutor() + val future2 = exec.submit(new Runnable { + override def run(): Unit = executor.submit(task2, task2Weight) + }) + Thread.sleep(100) + assert(!future2.isDone) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(10)(throttle.getTotalHostMemoryBytes) + + task1.latch.countDown() + future1.get(longTimeoutSec, TimeUnit.SECONDS) + future2.get(longTimeoutSec, TimeUnit.SECONDS) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(task2Weight)(throttle.getTotalHostMemoryBytes) + } + + test("submit one task heavier than maxWeight") { + val future = executor.submit(() => Thread.sleep(10), throttle.maxInFlightHostMemoryBytes + 1) + future.get(longTimeoutSec, TimeUnit.SECONDS) + assert(future.isDone) + assertResult(0)(trafficController.numScheduledTasks) + assertResult(0)(throttle.getTotalHostMemoryBytes) + } + + test("submit multiple tasks such that total weight does not exceed maxWeight") { + val numTasks = 10 + val taskRunTime = 10 + var future: Future[Unit] = null + for (_ <- 0 to numTasks) { + future = executor.submit(() => Thread.sleep(taskRunTime), 1) + } + // Give enough time for all tasks to complete + future.get(numTasks * taskRunTime * 5, TimeUnit.MILLISECONDS) + assertResult(0)(trafficController.numScheduledTasks) + assertResult(0)(throttle.getTotalHostMemoryBytes) + } + + test("shutdown while a task is blocked") { + val task1 = new TestTask + val future1 = executor.submit(task1, 10) + assertResult(1)(trafficController.numScheduledTasks) + assertResult(10)(throttle.getTotalHostMemoryBytes) + + val task2 = new TestTask + val task2Weight = 100 + val exec = Executors.newSingleThreadExecutor() + val future2 = exec.submit(new Runnable { + override def run(): Unit = executor.submit(task2, task2Weight) + }) + 
executor.shutdownNow(longTimeoutSec, TimeUnit.SECONDS) + + def assertCause(t: Throwable, cause: Class[_]): Unit = { + assert(t.getCause != null) + assert(cause.isInstance(t.getCause)) + } + + val e1 = intercept[ExecutionException](future1.get()) + assertCause(e1, classOf[InterruptedException]) + val e2 = intercept[ExecutionException](future2.get()) + assertCause(e2, classOf[RejectedExecutionException]) + } +} diff --git a/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/TrafficControllerSuite.scala b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/TrafficControllerSuite.scala new file mode 100644 index 00000000000..32868ff6055 --- /dev/null +++ b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/io/async/TrafficControllerSuite.scala @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.io.async + +import java.util.concurrent.{ExecutionException, Executors, ExecutorService, TimeUnit} + +import org.scalatest.BeforeAndAfterEach +import org.scalatest.funsuite.AnyFunSuite + +class TrafficControllerSuite extends AnyFunSuite with BeforeAndAfterEach { + + private var throttle: HostMemoryThrottle = _ + private var controller: TrafficController = _ + private var executor: ExecutorService = _ + + override def beforeEach(): Unit = { + throttle = new HostMemoryThrottle(100) + controller = new TrafficController(throttle) + executor = Executors.newSingleThreadExecutor() + } + + override def afterEach(): Unit = { + executor.shutdownNow() + executor.awaitTermination(1, TimeUnit.SECONDS) + } + + class TestTask(taskMemoryBytes: Long) extends Task[Unit](taskMemoryBytes, () => {}) {} + + test("schedule tasks without blocking") { + val taskMemoryBytes = 50 + val t1 = new TestTask(taskMemoryBytes) + controller.blockUntilRunnable(t1) + assertResult(1)(controller.numScheduledTasks) + assertResult(taskMemoryBytes)(throttle.getTotalHostMemoryBytes) + + val t2 = new TestTask(50) + controller.blockUntilRunnable(t2) + assertResult(2)(controller.numScheduledTasks) + assertResult(2 * taskMemoryBytes)(throttle.getTotalHostMemoryBytes) + + controller.taskCompleted(t1) + assertResult(1)(controller.numScheduledTasks) + assertResult(taskMemoryBytes)(throttle.getTotalHostMemoryBytes) + } + + test("schedule task with blocking") { + val taskMemoryBytes = 50 + val t1 = new TestTask(taskMemoryBytes) + controller.blockUntilRunnable(t1) + + val t2 = new TestTask(taskMemoryBytes) + controller.blockUntilRunnable(t2) + + val t3 = new TestTask(taskMemoryBytes) + val f = executor.submit(new Runnable { + override def run(): Unit = controller.blockUntilRunnable(t3) + }) + Thread.sleep(100) + assert(!f.isDone) + + controller.taskCompleted(t1) + f.get(1, TimeUnit.SECONDS) + } + + test("shutdown while blocking") { + val t1 = new TestTask(10) + controller.blockUntilRunnable(t1) + + val t2 = new TestTask(110) + + val f = executor.submit(new Runnable { + override def run(): Unit = { + controller.blockUntilRunnable(t2) 
+ } + }) + + executor.shutdownNow() + try { + f.get(1, TimeUnit.SECONDS) + fail("Should be interrupted") + } catch { + case ee: ExecutionException => + assert(ee.getCause.isInstanceOf[InterruptedException]) + case _: Throwable => fail("Should be interrupted") + } + } +} From abc3654ffda125b474441ba2cf3dd43ccdfb2483 Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Tue, 26 Nov 2024 08:22:44 +0800 Subject: [PATCH 05/37] remove excluded release shim and TODO (#11756) * remove excluded release shim and TODO Signed-off-by: YanxuanLiu * remove shim from 2.13 properties Signed-off-by: YanxuanLiu * Fix error: 'NoneType' object has no attribute 'split' for excluded_shims Signed-off-by: timl --------- Signed-off-by: YanxuanLiu Signed-off-by: timl Co-authored-by: timl --- build/get_buildvers.py | 2 +- pom.xml | 3 +-- scala2.13/pom.xml | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/build/get_buildvers.py b/build/get_buildvers.py index 5fe864670b5..263003ea99f 100644 --- a/build/get_buildvers.py +++ b/build/get_buildvers.py @@ -34,7 +34,7 @@ def _get_buildvers(buildvers, pom_file, logger=None): else: no_snapshots.append(release) excluded_shims = pom.find(".//pom:dyn.shim.excluded.releases", ns) - if excluded_shims is not None: + if excluded_shims is not None and excluded_shims.text: for removed_shim in [x.strip() for x in excluded_shims.text.split(",")]: if removed_shim in snapshots: snapshots.remove(removed_shim) diff --git a/pom.xml b/pom.xml index 79a6a765470..7409b849968 100644 --- a/pom.xml +++ b/pom.xml @@ -813,8 +813,7 @@ - - 350db143 + . diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index d1368d81d97..9c00390f6e5 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -813,8 +813,7 @@ - - 350db143 + . 
From f5be35e2f50c6ebf64d7914f34fda36772c87729 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Mon, 25 Nov 2024 20:26:45 -0600 Subject: [PATCH 06/37] Fix Kudo batch serializer to only read header in hasNext (#11766) Signed-off-by: Jason Lowe --- .../rapids/GpuColumnarBatchSerializer.scala | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala index 116b8b97504..44a58370c33 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala @@ -27,7 +27,7 @@ import ai.rapids.cudf.JCudfSerialization.SerializedTableHeader import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion -import com.nvidia.spark.rapids.jni.kudo.{KudoSerializer, KudoTable} +import com.nvidia.spark.rapids.jni.kudo.{KudoSerializer, KudoTable, KudoTableHeader} import org.apache.spark.TaskContext import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer, SerializerInstance} @@ -495,47 +495,52 @@ object KudoSerializedTableColumn { class KudoSerializedBatchIterator(dIn: DataInputStream) extends BaseSerializedTableIterator { - private[this] var nextTable: Option[KudoTable] = None + private[this] var nextHeader: Option[KudoTableHeader] = None private[this] var streamClosed: Boolean = false // Don't install the callback if in a unit test Option(TaskContext.get()).foreach { tc => onTaskCompletion(tc) { - nextTable.foreach(_.close()) - nextTable = None dIn.close() } } - private def tryReadNext(): Unit = { + private def tryReadNextHeader(): Unit = { if (!streamClosed) { - withResource(new NvtxRange("Read Kudo Table", NvtxColor.YELLOW)) { _ => - val kudoTable = KudoTable.from(dIn) - if (kudoTable.isPresent) { - nextTable = Some(kudoTable.get()) - } else { + withResource(new NvtxRange("Read Kudo Header", NvtxColor.YELLOW)) { _ => + require(nextHeader.isEmpty) + nextHeader = Option(KudoTableHeader.readFrom(dIn).orElse(null)) + if (nextHeader.isEmpty) { dIn.close() streamClosed = true - nextTable = None } } } } override def hasNext: Boolean = { - nextTable match { - case Some(_) => true - case None => - tryReadNext() - nextTable.isDefined + if (nextHeader.isEmpty) { + tryReadNextHeader() } + nextHeader.isDefined } override def next(): (Int, ColumnarBatch) = { if (hasNext) { - val ret = KudoSerializedTableColumn.from(nextTable.get) - nextTable = None - (0, ret) + val header = nextHeader.get + nextHeader = None + val buffer = if (header.getNumColumns == 0) { + null + } else { + withResource(new NvtxRange("Read Kudo Body", NvtxColor.YELLOW)) { _ => + val buffer = HostMemoryBuffer.allocate(header.getTotalDataLen, false) + closeOnExcept(buffer) { _ => + buffer.copyFromStream(0, dIn, header.getTotalDataLen) + } + buffer + } + } + (0, KudoSerializedTableColumn.from(new KudoTable(header, buffer))) } else { throw new NoSuchElementException("Walked off of the end...") } @@ -547,7 +552,9 @@ class KudoSerializedBatchIterator(dIn: DataInputStream) * @return the length of the data to read, or None if the stream is closed or ended */ override def peekNextBatchSize(): Option[Long] = { - tryReadNext() - nextTable.flatMap(t => Option(t.getBuffer)).map(_.getLength) + if 
(nextHeader.isEmpty) { + tryReadNextHeader() + } + nextHeader.map(_.getTotalDataLen) } -} \ No newline at end of file +} From 2b6ac118112c973a7848cb4fc7a26ab68797fb4b Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Tue, 26 Nov 2024 06:36:21 -0800 Subject: [PATCH 07/37] Avoid using StringBuffer in single-threaded methods. (#11759) Signed-off-by: Gera Shegalov --- .../org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala index 97d271b076f..0dd048967a8 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/test/cpuJsonExpressions.scala @@ -203,7 +203,7 @@ object GetJsonObjectMask { oneToOneMap: Map[Char, Char], digitMap: Map[Char, Char]): String = { if (originStr != null) { - val buf = new StringBuffer(originStr.length) + val buf = new StringBuilder(originStr.length) var idx = 0 while (idx < originStr.length) { val originChar = originStr(idx) From e3dce9ec393d84f68c00da6e5631f67abffe94e0 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Tue, 26 Nov 2024 23:11:24 +0800 Subject: [PATCH 08/37] Fix query hang when using rapids multithread shuffle manager with kudo (#11771) * Fix query hang when using kudo and multi thread shuffle manager Signed-off-by: liurenjie1024 * Fix NPE --------- Signed-off-by: liurenjie1024 --- .../rapids/GpuColumnarBatchSerializer.scala | 94 +++++++++++-------- .../RapidsShuffleInternalManagerBase.scala | 1 - 2 files changed, 53 insertions(+), 42 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala index 44a58370c33..54252253d38 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala @@ -319,10 +319,12 @@ object SerializedTableColumn { if (batch.numCols == 1) { val cv = batch.column(0) cv match { - case serializedTableColumn: SerializedTableColumn - if serializedTableColumn.hostBuffer != null => - sum += serializedTableColumn.hostBuffer.getLength + case serializedTableColumn: SerializedTableColumn => + sum += Option(serializedTableColumn.hostBuffer).map(_.getLength).getOrElse(0L) + case kudo: KudoSerializedTableColumn => + sum += Option(kudo.kudoTable.getBuffer).map(_.getLength).getOrElse(0L) case _ => + throw new IllegalStateException(s"Unexpected column type: ${cv.getClass}" ) } } sum @@ -496,65 +498,75 @@ object KudoSerializedTableColumn { class KudoSerializedBatchIterator(dIn: DataInputStream) extends BaseSerializedTableIterator { private[this] var nextHeader: Option[KudoTableHeader] = None + private[this] var toBeReturned: Option[ColumnarBatch] = None private[this] var streamClosed: Boolean = false // Don't install the callback if in a unit test Option(TaskContext.get()).foreach { tc => onTaskCompletion(tc) { + toBeReturned.foreach(_.close()) + toBeReturned = None dIn.close() } } - private def tryReadNextHeader(): Unit = { - if (!streamClosed) { - withResource(new NvtxRange("Read Kudo Header", NvtxColor.YELLOW)) { _ => - require(nextHeader.isEmpty) - nextHeader = Option(KudoTableHeader.readFrom(dIn).orElse(null)) - if (nextHeader.isEmpty) { - dIn.close() - 
streamClosed = true + override def peekNextBatchSize(): Option[Long] = { + if (streamClosed) { + None + } else { + if (nextHeader.isEmpty) { + withResource(new NvtxRange("Read Header", NvtxColor.YELLOW)) { _ => + val header = Option(KudoTableHeader.readFrom(dIn).orElse(null)) + if (header.isDefined) { + nextHeader = header + } else { + dIn.close() + streamClosed = true + nextHeader = None + } } } + nextHeader.map(_.getTotalDataLen) } } - override def hasNext: Boolean = { + private def tryReadNext(): Option[ColumnarBatch] = { if (nextHeader.isEmpty) { - tryReadNextHeader() - } - nextHeader.isDefined - } - - override def next(): (Int, ColumnarBatch) = { - if (hasNext) { - val header = nextHeader.get - nextHeader = None - val buffer = if (header.getNumColumns == 0) { - null - } else { - withResource(new NvtxRange("Read Kudo Body", NvtxColor.YELLOW)) { _ => - val buffer = HostMemoryBuffer.allocate(header.getTotalDataLen, false) - closeOnExcept(buffer) { _ => - buffer.copyFromStream(0, dIn, header.getTotalDataLen) + None + } else { + withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => + val header = nextHeader.get + if (header.getNumColumns > 0) { + // This buffer will later be concatenated into another host buffer before being + // sent to the GPU, so no need to use pinned memory for these buffers. + closeOnExcept(HostMemoryBuffer.allocate(header.getTotalDataLen, false)) { hostBuffer => + hostBuffer.copyFromStream(0, dIn, header.getTotalDataLen) + val kudoTable = new KudoTable(header, hostBuffer) + Some(KudoSerializedTableColumn.from(kudoTable)) } - buffer + } else { + Some(KudoSerializedTableColumn.from(new KudoTable(header, null))) } } - (0, KudoSerializedTableColumn.from(new KudoTable(header, buffer))) - } else { - throw new NoSuchElementException("Walked off of the end...") } } - /** - * Attempt to read the next header from the stream. 
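The rewrite above keeps `peekNextBatchSize`/`hasNext` cheap by reading only the Kudo header and deferring the body allocation and copy to `next()`. A self-contained sketch of that header-then-body pattern with plain `java.io` types; the 4-byte length prefix is only an illustration, not the real Kudo header layout.

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

object PeekThenReadSketch {
  def main(args: Array[String]): Unit = {
    // Build a tiny "block": a 4-byte length header followed by the body.
    val bos = new ByteArrayOutputStream()
    val out = new DataOutputStream(bos)
    val body = Array.fill[Byte](16)(1)
    out.writeInt(body.length)
    out.write(body)
    out.close()

    val in = new DataInputStream(new ByteArrayInputStream(bos.toByteArray))
    // Peeking the size only needs the header, which is cheap to read.
    val peekedLen = in.readInt()
    println(s"next block is $peekedLen bytes")
    // The body is only materialized when the block is actually consumed.
    val payload = new Array[Byte](peekedLen)
    in.readFully(payload)
    println(payload.length) // 16
  }
}
```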
- * - * @return the length of the data to read, or None if the stream is closed or ended - */ - override def peekNextBatchSize(): Option[Long] = { - if (nextHeader.isEmpty) { - tryReadNextHeader() + override def hasNext: Boolean = { + peekNextBatchSize() + nextHeader.isDefined + } + + override def next(): (Int, ColumnarBatch) = { + if (toBeReturned.isEmpty) { + peekNextBatchSize() + toBeReturned = tryReadNext() + if (nextHeader.isEmpty || toBeReturned.isEmpty) { + throw new NoSuchElementException("Walked off of the end...") + } } - nextHeader.map(_.getTotalDataLen) + val ret = toBeReturned.get + toBeReturned = None + nextHeader = None + (0, ret) } } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala index a44580c3bf5..05bc76c3fab 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/RapidsShuffleInternalManagerBase.scala @@ -21,7 +21,6 @@ import java.util.Optional import java.util.concurrent.{Callable, ConcurrentHashMap, ExecutionException, Executors, Future, LinkedBlockingQueue, TimeUnit} import java.util.concurrent.atomic.{AtomicInteger, AtomicLong} -import scala.collection import scala.collection.mutable import scala.collection.mutable.ListBuffer From 4fa0a1dee986e05733dbdbf4971c42ad5e0e84ec Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Tue, 26 Nov 2024 23:44:45 +0800 Subject: [PATCH 09/37] repartition-based fallback for hash aggregate v3 (#11712) Signed-off-by: Hongbin Ma (Mahone) Signed-off-by: Firestarman Co-authored-by: Firestarman --- .../scala/com/nvidia/spark/rapids/Arm.scala | 16 +- .../rapids/AutoClosableArrayBuffer.scala | 54 ++ .../spark/rapids/GpuAggregateExec.scala | 725 ++++++++++-------- .../com/nvidia/spark/rapids/GpuExec.scala | 6 + ...GpuUnboundedToUnboundedAggWindowExec.scala | 29 +- 5 files changed, 476 insertions(+), 354 deletions(-) create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/AutoClosableArrayBuffer.scala diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Arm.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Arm.scala index 926f770a683..b0cd798c179 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Arm.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Arm.scala @@ -16,7 +16,7 @@ package com.nvidia.spark.rapids import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.util.control.ControlThrowable import com.nvidia.spark.rapids.RapidsPluginImplicits._ @@ -134,6 +134,20 @@ object Arm extends ArmScalaSpecificImpl { } } + /** Executes the provided code block, closing the resources only if an exception occurs */ + def closeOnExcept[T <: AutoCloseable, V](r: ListBuffer[T])(block: ListBuffer[T] => V): V = { + try { + block(r) + } catch { + case t: ControlThrowable => + // Don't close for these cases.. 
+ throw t + case t: Throwable => + r.safeClose(t) + throw t + } + } + /** Executes the provided code block, closing the resources only if an exception occurs */ def closeOnExcept[T <: AutoCloseable, V](r: mutable.Queue[T])(block: mutable.Queue[T] => V): V = { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AutoClosableArrayBuffer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AutoClosableArrayBuffer.scala new file mode 100644 index 00000000000..fb1e10b9c9e --- /dev/null +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AutoClosableArrayBuffer.scala @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.nvidia.spark.rapids + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag + +/** + * Just a simple wrapper to make working with buffers of AutoClosable things play + * nicely with withResource. + */ +class AutoClosableArrayBuffer[T <: AutoCloseable] extends AutoCloseable { + val data = new ArrayBuffer[T]() + + def append(scb: T): Unit = data.append(scb) + + def last: T = data.last + + def removeLast(): T = data.remove(data.length - 1) + + def foreach[U](f: T => U): Unit = data.foreach(f) + + def map[U](f: T => U): Seq[U] = data.map(f).toSeq + + def toArray[B >: T : ClassTag]: Array[B] = data.toArray + + def size(): Int = data.size + + def clear(): Unit = data.clear() + + def forall(p: T => Boolean): Boolean = data.forall(p) + + def iterator: Iterator[T] = data.iterator + + override def toString: String = s"AutoCloseable(${super.toString})" + + override def close(): Unit = { + data.foreach(_.close()) + data.clear() + } +} diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala index b5360a62f94..60f6dd68509 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala @@ -16,11 +16,9 @@ package com.nvidia.spark.rapids -import java.util - import scala.annotation.tailrec -import scala.collection.JavaConverters.collectionAsScalaIterableConverter import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import ai.rapids.cudf import ai.rapids.cudf.{NvtxColor, NvtxRange} @@ -37,7 +35,7 @@ import org.apache.spark.TaskContext import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, Attribute, AttributeReference, AttributeSeq, AttributeSet, Expression, ExprId, If, NamedExpression, NullsFirst, SortOrder} +import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, Attribute, AttributeReference, AttributeSeq, AttributeSet, Expression, ExprId, If, NamedExpression, SortOrder} import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import 
org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, HashPartitioning, Partitioning, UnspecifiedDistribution} @@ -47,11 +45,11 @@ import org.apache.spark.sql.execution.{ExplainUtils, SortExec, SparkPlan} import org.apache.spark.sql.execution.aggregate.{BaseAggregateExec, HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.rapids.aggregate.{CpuToGpuAggregateBufferConverter, CudfAggregate, GpuAggregateExpression, GpuToCpuAggregateBufferConverter} -import org.apache.spark.sql.rapids.execution.{GpuShuffleMeta, TrampolineUtil} +import org.apache.spark.sql.rapids.execution.{GpuBatchSubPartitioner, GpuShuffleMeta, TrampolineUtil} import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarBatch -object AggregateUtils { +object AggregateUtils extends Logging { private val aggs = List("min", "max", "avg", "sum", "count", "first", "last") @@ -98,8 +96,10 @@ object AggregateUtils { inputTypes: Seq[DataType], outputTypes: Seq[DataType], isReductionOnly: Boolean): Long = { + def typesToSize(types: Seq[DataType]): Long = types.map(GpuBatchUtils.estimateGpuMemory(_, nullable = false, rowCount = 1)).sum + val inputRowSize = typesToSize(inputTypes) val outputRowSize = typesToSize(outputTypes) // The cudf hash table implementation allocates four 32-bit integers per input row. @@ -120,22 +120,198 @@ object AggregateUtils { } // Calculate the max rows that can be processed during computation within the budget - val maxRows = totalBudget / computationBytesPerRow + // Make sure it's not less than 1, otherwise some corner test cases may fail + val maxRows = Math.max(totalBudget / computationBytesPerRow, 1) // Finally compute the input target batching size taking into account the cudf row limits Math.min(inputRowSize * maxRows, Int.MaxValue) } + + /** + * Concatenate batches together and perform a merge aggregation on the result. The input batches + * will be closed as part of this operation. + * + * @param batches batches to concatenate and merge aggregate + * @return lazy spillable batch which has NOT been marked spillable + */ + def concatenateAndMerge( + batches: mutable.ArrayBuffer[SpillableColumnarBatch], + metrics: GpuHashAggregateMetrics, + concatAndMergeHelper: AggHelper): SpillableColumnarBatch = { + // TODO: concatenateAndMerge (and calling code) could output a sequence + // of batches for the partial aggregate case. This would be done in case + // a retry failed a certain number of times. + val concatBatch = withResource(batches) { _ => + val concatSpillable = concatenateBatches(metrics, batches.toSeq) + withResource(concatSpillable) { + _.getColumnarBatch() + } + } + computeAggregateAndClose(metrics, concatBatch, concatAndMergeHelper) + } + + /** + * Try to concat and merge neighbour input batches to reduce the number of output batches. + * For some cases where input is highly aggregate-able, we can merge multiple input batches + * into a single output batch. In such cases we can skip repartition at all. 
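A worked example of the budget arithmetic in `computeTargetBatchSize`, with made-up byte counts, showing why the new floor of one row matters once the estimated per-row working set exceeds the budget.

```scala
object TargetBatchSizeSketch {
  def main(args: Array[String]): Unit = {
    // All numbers are illustrative only.
    val inputRowSize = 64L             // estimated bytes per input row
    val computationBytesPerRow = 1024L // estimated working-set bytes per row while aggregating
    val totalBudget = 512L             // budget smaller than a single row's working set

    // Plain integer division would give 0 rows; the floor of 1 keeps the target positive.
    val maxRows = math.max(totalBudget / computationBytesPerRow, 1)
    val targetSize = math.min(inputRowSize * maxRows, Int.MaxValue)
    println(s"maxRows=$maxRows targetSize=$targetSize") // maxRows=1 targetSize=64
  }
}
```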
+ */ + def streamAggregateNeighours( + aggregatedBatches: CloseableBufferedIterator[SpillableColumnarBatch], + metrics: GpuHashAggregateMetrics, + targetMergeBatchSize: Long, + configuredTargetBatchSize: Long, + helper: AggHelper + ): Iterator[SpillableColumnarBatch] = { + new Iterator[SpillableColumnarBatch] { + + override def hasNext: Boolean = aggregatedBatches.hasNext + + override def next(): SpillableColumnarBatch = { + closeOnExcept(new ArrayBuffer[SpillableColumnarBatch]) { stagingBatches => { + var currentSize = 0L + while (aggregatedBatches.hasNext) { + val nextBatch = aggregatedBatches.head + if (currentSize + nextBatch.sizeInBytes > targetMergeBatchSize) { + if (stagingBatches.size == 1) { + return stagingBatches.head + } else if (stagingBatches.isEmpty) { + aggregatedBatches.next + return nextBatch + } + val merged = concatenateAndMerge(stagingBatches, metrics, helper) + stagingBatches.clear + currentSize = 0L + if (merged.sizeInBytes < configuredTargetBatchSize * 0.5) { + stagingBatches += merged + currentSize += merged.sizeInBytes + } else { + return merged + } + } else { + stagingBatches.append(nextBatch) + currentSize += nextBatch.sizeInBytes + aggregatedBatches.next + } + } + + if (stagingBatches.size == 1) { + return stagingBatches.head + } + concatenateAndMerge(stagingBatches, metrics, helper) + } + } + } + } + } + + /** + * Read the input batches and repartition them into buckets. + */ + def iterateAndRepartition( + aggregatedBatches: Iterator[SpillableColumnarBatch], + metrics: GpuHashAggregateMetrics, + targetMergeBatchSize: Long, + helper: AggHelper, + hashKeys: Seq[GpuExpression], + hashBucketNum: Int, + hashSeed: Int, + batchesByBucket: ArrayBuffer[AutoClosableArrayBuffer[SpillableColumnarBatch]] + ): Boolean = { + + var repartitionHappened = false + if (hashSeed > 200) { + throw new IllegalStateException("Too many times of repartition, may hit a bug?") + } + + def repartitionAndClose(batch: SpillableColumnarBatch): Unit = { + + // OPTIMIZATION + if (!aggregatedBatches.hasNext && batchesByBucket.forall(_.size() == 0)) { + // If this is the only batch (after merging neighbours) to be repartitioned, + // we can just add it to the first bucket and skip repartitioning. + // This is a common case when total input size can fit into a single batch. 
+ batchesByBucket.head.append(batch) + return + } + + withResource(new NvtxWithMetrics("agg repartition", + NvtxColor.CYAN, metrics.repartitionTime)) { _ => + + withResource(new GpuBatchSubPartitioner( + Seq(batch).map(batch => { + withResource(batch) { _ => + batch.getColumnarBatch() + } + }).iterator, + hashKeys, hashBucketNum, hashSeed, "aggRepartition")) { + partitioner => { + (0 until partitioner.partitionsCount).foreach { id => + closeOnExcept(batchesByBucket) { _ => { + val newBatches = partitioner.releaseBatchesByPartition(id) + newBatches.foreach { newBatch => + if (newBatch.numRows() > 0) { + batchesByBucket(id).append(newBatch) + } else { + newBatch.safeClose() + } + } + } + } + } + } + } + } + repartitionHappened = true + } + + while (aggregatedBatches.hasNext) { + repartitionAndClose(aggregatedBatches.next) + } + + // Deal with the over sized buckets + def needRepartitionAgain(bucket: AutoClosableArrayBuffer[SpillableColumnarBatch]) = { + bucket.map(_.sizeInBytes).sum > targetMergeBatchSize && + bucket.size() != 1 && + !bucket.forall(_.numRows() == 1) // this is for test + } + + if (repartitionHappened && batchesByBucket.exists(needRepartitionAgain)) { + logDebug("Some of the repartition buckets are over sized, trying to split them") + + val newBuckets = batchesByBucket.flatMap(bucket => { + if (needRepartitionAgain(bucket)) { + val nextLayerBuckets = + ArrayBuffer.fill(hashBucketNum)(new AutoClosableArrayBuffer[SpillableColumnarBatch]()) + // Recursively merge and repartition the over sized bucket + repartitionHappened = + iterateAndRepartition( + new CloseableBufferedIterator(bucket.iterator), metrics, targetMergeBatchSize, + helper, hashKeys, hashBucketNum, hashSeed + 7, + nextLayerBuckets) || repartitionHappened + nextLayerBuckets + } else { + ArrayBuffer.apply(bucket) + } + }) + batchesByBucket.clear() + batchesByBucket.appendAll(newBuckets) + } + + repartitionHappened + } } /** Utility class to hold all of the metrics related to hash aggregation */ case class GpuHashAggregateMetrics( numOutputRows: GpuMetric, numOutputBatches: GpuMetric, - numTasksFallBacked: GpuMetric, + numTasksRepartitioned: GpuMetric, + numTasksSkippedAgg: GpuMetric, opTime: GpuMetric, computeAggTime: GpuMetric, concatTime: GpuMetric, sortTime: GpuMetric, + repartitionTime: GpuMetric, numAggOps: GpuMetric, numPreSplits: GpuMetric, singlePassTasks: GpuMetric, @@ -208,7 +384,7 @@ class AggHelper( private val groupingAttributes = groupingExpressions.map(_.toAttribute) private val aggBufferAttributes = groupingAttributes ++ - aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) // `GpuAggregateFunction` can add a pre and post step for update // and merge aggregates. 
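A toy model of `iterateAndRepartition` on plain integers instead of spillable batches: hash the keys into a fixed number of buckets and re-split any bucket that is still too large using a shifted seed, mirroring the `hashSeed + 7` recursion above (the real code additionally gives up once the seed grows past 200).

```scala
import scala.collection.mutable.ArrayBuffer
import scala.util.hashing.MurmurHash3

object BucketSplitSketch {
  val bucketNum = 16

  // Buckets holding a single distinct key are left alone, since re-hashing them
  // with any seed would put all of their rows back into one bucket.
  def split(keys: Seq[Int], maxPerBucket: Int, seed: Int): Seq[Seq[Int]] = {
    val buckets = ArrayBuffer.fill(bucketNum)(ArrayBuffer.empty[Int])
    keys.foreach { k =>
      val h = MurmurHash3.stringHash(k.toString, seed)
      buckets(((h % bucketNum) + bucketNum) % bucketNum) += k
    }
    buckets.toSeq.flatMap { b =>
      if (b.size > maxPerBucket && b.distinct.size > 1) {
        split(b.toSeq, maxPerBucket, seed + 7) // recurse with a shifted seed
      } else {
        Seq(b.toSeq)
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val buckets = split(0 until 10000, maxPerBucket = 200, seed = 107)
    println(buckets.map(_.size).max) // every bucket ends up at or below the cap
  }
}
```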
@@ -228,7 +404,7 @@ class AggHelper( postStep ++= groupingAttributes postStepAttr ++= groupingAttributes postStepDataTypes ++= - groupingExpressions.map(_.dataType) + groupingExpressions.map(_.dataType) private var ix = groupingAttributes.length for (aggExp <- aggregateExpressions) { @@ -380,9 +556,9 @@ class AggHelper( withResource(new NvtxRange("groupby", NvtxColor.BLUE)) { _ => withResource(GpuColumnVector.from(preProcessed)) { preProcessedTbl => val groupOptions = cudf.GroupByOptions.builder() - .withIgnoreNullKeys(false) - .withKeysSorted(doSortAgg) - .build() + .withIgnoreNullKeys(false) + .withKeysSorted(doSortAgg) + .build() val cudfAggsOnColumn = cudfAggregates.zip(aggOrdinals).map { case (cudfAgg, ord) => cudfAgg.groupByAggregate.onColumn(ord) @@ -390,8 +566,8 @@ class AggHelper( // perform the aggregate val aggTbl = preProcessedTbl - .groupBy(groupOptions, groupingOrdinals: _*) - .aggregate(cudfAggsOnColumn.toSeq: _*) + .groupBy(groupOptions, groupingOrdinals: _*) + .aggregate(cudfAggsOnColumn.toSeq: _*) withResource(aggTbl) { _ => GpuColumnVector.from(aggTbl, postStepDataTypes.toArray) @@ -555,8 +731,8 @@ object GpuAggFirstPassIterator { metrics: GpuHashAggregateMetrics ): Iterator[SpillableColumnarBatch] = { val preprocessProjectIter = cbIter.map { cb => - val sb = SpillableColumnarBatch (cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY) - aggHelper.preStepBound.projectAndCloseWithRetrySingleBatch (sb) + val sb = SpillableColumnarBatch(cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY) + aggHelper.preStepBound.projectAndCloseWithRetrySingleBatch(sb) } computeAggregateWithoutPreprocessAndClose(metrics, preprocessProjectIter, aggHelper) } @@ -597,18 +773,18 @@ object GpuAggFinalPassIterator { modeInfo: AggregateModeInfo): BoundExpressionsModeAggregates = { val groupingAttributes = groupingExpressions.map(_.toAttribute) val aggBufferAttributes = groupingAttributes ++ - aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) val boundFinalProjections = if (modeInfo.hasFinalMode || modeInfo.hasCompleteMode) { val finalProjections = groupingAttributes ++ - aggregateExpressions.map(_.aggregateFunction.evaluateExpression) + aggregateExpressions.map(_.aggregateFunction.evaluateExpression) Some(GpuBindReferences.bindGpuReferences(finalProjections, aggBufferAttributes)) } else { None } // allAttributes can be different things, depending on aggregation mode: - // - Partial mode: grouping key + cudf aggregates (e.g. no avg, intead sum::count + // - Partial mode: grouping key + cudf aggregates (e.g. no avg, instead sum::count // - Final mode: grouping key + spark aggregates (e.g. avg) val finalAttributes = groupingAttributes ++ aggregateAttributes @@ -689,17 +865,22 @@ object GpuAggFinalPassIterator { /** * Iterator that takes another columnar batch iterator as input and emits new columnar batches that * are aggregated based on the specified grouping and aggregation expressions. This iterator tries - * to perform a hash-based aggregation but is capable of falling back to a sort-based aggregation - * which can operate on data that is either larger than can be represented by a cudf column or - * larger than can fit in GPU memory. + * to perform a hash-based aggregation but is capable of falling back to a repartition-based + * aggregation which can operate on data that is either larger than can be represented by a cudf + * column or larger than can fit in GPU memory. 
+ * + * In general, GpuMergeAggregateIterator works in this flow: * - * The iterator starts by pulling all batches from the input iterator, performing an initial - * projection and aggregation on each individual batch via `aggregateInputBatches()`. The resulting - * aggregated batches are cached in memory as spillable batches. Once all input batches have been - * aggregated, `tryMergeAggregatedBatches()` is called to attempt a merge of the aggregated batches - * into a single batch. If this is successful then the resulting batch can be returned, otherwise - * `buildSortFallbackIterator` is used to sort the aggregated batches by the grouping keys and - * performs a final merge aggregation pass on the sorted batches. + * (1) The iterator starts by pulling all batches from the input iterator, performing an initial + * projection and aggregation on each individual batch via `GpuAggFirstPassIterator`, we call it + * "First Pass Aggregate". + * (2) Then the batches after first pass agg is sent to "streamAggregateNeighours", where it tries + * to concat & merge the neighbour batches into fewer batches, then "iterateAndRepartition" + * repartition the batch into fixed size buckets. Recursive repartition will be applied on + * over-sized buckets until each bucket * is within the target size. + * We call this phase "Second Pass Aggregate". + * (3) At "Third Pass Aggregate", we take each bucket and perform a final aggregation on all batches + * in the bucket, check "RepartitionAggregateIterator" for details. * * @param firstPassIter iterator that has done a first aggregation pass over the input data. * @param inputAttributes input attributes to identify the input columns from the input batches @@ -710,13 +891,12 @@ object GpuAggFinalPassIterator { * @param modeInfo identifies which aggregation modes are being used * @param metrics metrics that will be updated during aggregation * @param configuredTargetBatchSize user-specified value for the targeted input batch size - * @param useTieredProject user-specified option to enable tiered projections * @param allowNonFullyAggregatedOutput if allowed to skip third pass Agg * @param skipAggPassReductionRatio skip if the ratio of rows after a pass is bigger than this value * @param localInputRowsCount metric to track the number of input rows processed locally */ class GpuMergeAggregateIterator( - firstPassIter: Iterator[SpillableColumnarBatch], + firstPassIter: CloseableBufferedIterator[SpillableColumnarBatch], inputAttributes: Seq[Attribute], groupingExpressions: Seq[NamedExpression], aggregateExpressions: Seq[GpuAggregateExpression], @@ -728,18 +908,22 @@ class GpuMergeAggregateIterator( conf: SQLConf, allowNonFullyAggregatedOutput: Boolean, skipAggPassReductionRatio: Double, - localInputRowsCount: LocalGpuMetric) - extends Iterator[ColumnarBatch] with AutoCloseable with Logging { + localInputRowsCount: LocalGpuMetric +) + extends Iterator[ColumnarBatch] with AutoCloseable with Logging { private[this] val isReductionOnly = groupingExpressions.isEmpty private[this] val targetMergeBatchSize = computeTargetMergeBatchSize(configuredTargetBatchSize) - private[this] val aggregatedBatches = new util.ArrayDeque[SpillableColumnarBatch] - private[this] var outOfCoreIter: Option[GpuOutOfCoreSortIterator] = None - /** Iterator for fetching aggregated batches either if: - * 1. a sort-based fallback has occurred - * 2. 
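A stripped-down sketch of the three passes described in the comment above, with batches reduced to small in-memory collections so only the control flow remains; the bucket-by-key step stands in for the hash repartition.

```scala
import scala.collection.mutable.ArrayBuffer

object ThreePassSketch {
  def main(args: Array[String]): Unit = {
    // Toy stand-in: each input batch is 100 keys, and aggregation is a count per key.
    val inputBatches: Iterator[Seq[Int]] =
      Iterator.fill(8)(Seq.tabulate(100)(_ % 10))

    // Pass 1: aggregate each input batch on its own (partial counts per key).
    val firstPass: Iterator[Map[Int, Long]] =
      inputBatches.map(_.groupBy(identity).map { case (k, v) => k -> v.size.toLong })

    // Pass 2: repartition the partial results into buckets by key, without merging yet;
    // every occurrence of a key lands in exactly one bucket.
    val bucketNum = 4
    val buckets = ArrayBuffer.fill(bucketNum)(ArrayBuffer.empty[(Int, Long)])
    firstPass.foreach(_.foreach { case (k, c) => buckets(k % bucketNum).append((k, c)) })

    // Pass 3: final merge aggregation one bucket at a time, so no single step has to
    // hold all of the data concatenated at once.
    val merged = buckets.flatMap(_.groupBy(_._1).map { case (k, kv) => k -> kv.map(_._2).sum })
    println(merged.sortBy(_._1).toList) // each key 0..9 counted 80 times
  }
}
```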
skip third pass agg has occurred - **/ - private[this] var fallbackIter: Option[Iterator[ColumnarBatch]] = None + private[this] val defaultHashBucketNum = 16 + private[this] val defaultHashSeed = 107 + private[this] var batchesByBucket = + ArrayBuffer.fill(defaultHashBucketNum)(new AutoClosableArrayBuffer[SpillableColumnarBatch]()) + + private[this] var firstBatchChecked = false + + private[this] var bucketIter: Option[RepartitionAggregateIterator] = None + + private[this] var realIter: Option[Iterator[ColumnarBatch]] = None /** Whether a batch is pending for a reduction-only aggregation */ private[this] var hasReductionOnlyBatch: Boolean = isReductionOnly @@ -752,286 +936,168 @@ class GpuMergeAggregateIterator( } override def hasNext: Boolean = { - fallbackIter.map(_.hasNext).getOrElse { + realIter.map(_.hasNext).getOrElse { // reductions produce a result even if the input is empty - hasReductionOnlyBatch || !aggregatedBatches.isEmpty || firstPassIter.hasNext + hasReductionOnlyBatch || firstPassIter.hasNext } } override def next(): ColumnarBatch = { - fallbackIter.map(_.next()).getOrElse { - var shouldSkipThirdPassAgg = false - - // aggregate and merge all pending inputs - if (firstPassIter.hasNext) { - // first pass agg - val rowsAfterFirstPassAgg = aggregateInputBatches() - - // by now firstPassIter has been traversed, so localInputRowsCount is finished updating - if (isReductionOnly || - skipAggPassReductionRatio * localInputRowsCount.value >= rowsAfterFirstPassAgg) { - // second pass agg - tryMergeAggregatedBatches() - - val rowsAfterSecondPassAgg = aggregatedBatches.asScala.foldLeft(0L) { - (totalRows, batch) => totalRows + batch.numRows() - } - shouldSkipThirdPassAgg = - rowsAfterSecondPassAgg > skipAggPassReductionRatio * rowsAfterFirstPassAgg - } else { - shouldSkipThirdPassAgg = true - logInfo(s"Rows after first pass aggregation $rowsAfterFirstPassAgg exceeds " + - s"${skipAggPassReductionRatio * 100}% of " + - s"localInputRowsCount ${localInputRowsCount.value}, skip the second pass agg") - } - } + realIter.map(_.next()).getOrElse { - if (aggregatedBatches.size() > 1) { - // Unable to merge to a single output, so must fall back - if (allowNonFullyAggregatedOutput && shouldSkipThirdPassAgg) { - // skip third pass agg, return the aggregated batches directly - logInfo(s"Rows after second pass aggregation exceeds " + - s"${skipAggPassReductionRatio * 100}% of " + - s"rows after first pass, skip the third pass agg") - fallbackIter = Some(new Iterator[ColumnarBatch] { - override def hasNext: Boolean = !aggregatedBatches.isEmpty - - override def next(): ColumnarBatch = { - withResource(aggregatedBatches.pop()) { spillableBatch => - spillableBatch.getColumnarBatch() - } - } - }) - } else { - // fallback to sort agg, this is the third pass agg - fallbackIter = Some(buildSortFallbackIterator()) + // Handle reduction-only aggregation + if (isReductionOnly) { + val batches = ArrayBuffer.apply[SpillableColumnarBatch]() + while (firstPassIter.hasNext) { + batches += firstPassIter.next() } - fallbackIter.get.next() - } else if (aggregatedBatches.isEmpty) { - if (hasReductionOnlyBatch) { + + if (batches.isEmpty || batches.forall(_.numRows() == 0)) { hasReductionOnlyBatch = false - generateEmptyReductionBatch() + return generateEmptyReductionBatch() } else { - throw new NoSuchElementException("batches exhausted") + hasReductionOnlyBatch = false + val concat = AggregateUtils.concatenateAndMerge(batches, metrics, concatAndMergeHelper) + return withResource(concat) { cb => + cb.getColumnarBatch() + 
} } - } else { - // this will be the last batch - hasReductionOnlyBatch = false - withResource(aggregatedBatches.pop()) { spillableBatch => - spillableBatch.getColumnarBatch() + } + + // Handle the case of skipping second and third pass of aggregation + // This only work when spark.rapids.sql.agg.skipAggPassReductionRatio < 1 + if (!firstBatchChecked && firstPassIter.hasNext + && allowNonFullyAggregatedOutput) { + firstBatchChecked = true + + val peek = firstPassIter.head + // It's only based on first batch of first pass agg, so it's an estimate + val firstPassReductionRatioEstimate = 1.0 * peek.numRows() / localInputRowsCount.value + if (firstPassReductionRatioEstimate > skipAggPassReductionRatio) { + logDebug("Skipping second and third pass aggregation due to " + + "too high reduction ratio in first pass: " + + s"$firstPassReductionRatioEstimate") + // if so, skip any aggregation, return the origin batch directly + + realIter = Some(ConcatIterator.apply(firstPassIter, configuredTargetBatchSize)) + metrics.numTasksSkippedAgg += 1 + return realIter.get.next() + } else { + logInfo(s"The reduction ratio in first pass is not high enough to skip " + + s"second and third pass aggregation: peek.numRows: ${peek.numRows()}, " + + s"localInputRowsCount.value: ${localInputRowsCount.value}") } } + firstBatchChecked = true + + val groupingAttributes = groupingExpressions.map(_.toAttribute) + val aggBufferAttributes = groupingAttributes ++ + aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + val hashKeys: Seq[GpuExpression] = + GpuBindReferences.bindGpuReferences(groupingAttributes, aggBufferAttributes.toSeq) + + val repartitionHappened = AggregateUtils.iterateAndRepartition( + AggregateUtils.streamAggregateNeighours( + firstPassIter, + metrics, + targetMergeBatchSize, + configuredTargetBatchSize, + concatAndMergeHelper) + , metrics, targetMergeBatchSize, concatAndMergeHelper, + hashKeys, defaultHashBucketNum, defaultHashSeed, batchesByBucket) + if (repartitionHappened) { + metrics.numTasksRepartitioned += 1 + } + + realIter = Some(ConcatIterator.apply( + new CloseableBufferedIterator(buildBucketIterator()), configuredTargetBatchSize)) + realIter.get.next() } } override def close(): Unit = { - aggregatedBatches.forEach(_.safeClose()) - aggregatedBatches.clear() - outOfCoreIter.foreach(_.close()) - outOfCoreIter = None - fallbackIter = None + batchesByBucket.foreach(_.close()) + batchesByBucket.clear() hasReductionOnlyBatch = false } private def computeTargetMergeBatchSize(confTargetSize: Long): Long = { val mergedTypes = groupingExpressions.map(_.dataType) ++ aggregateExpressions.map(_.dataType) - AggregateUtils.computeTargetBatchSize(confTargetSize, mergedTypes, mergedTypes,isReductionOnly) + AggregateUtils.computeTargetBatchSize(confTargetSize, mergedTypes, mergedTypes, isReductionOnly) } - /** Aggregate all input batches and place the results in the aggregatedBatches queue. 
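A numeric illustration of the skip decision in `next()` above; the threshold shown is made up rather than the shipped default, and the shortcut is only taken when the plan can accept partially aggregated output (`allowNonFullyAggregatedOutput`).

```scala
object SkipAggSketch {
  def main(args: Array[String]): Unit = {
    // Made-up numbers: the first pre-aggregated batch kept 9,500 of 10,000 input rows.
    val inputRows = 10000L
    val rowsAfterFirstBatchAgg = 9500L
    val skipAggPassReductionRatio = 0.9 // illustrative threshold, not necessarily the default

    val firstPassReductionRatioEstimate = rowsAfterFirstBatchAgg.toDouble / inputRows
    // The keys barely repeat, so repartition plus a final merge would mostly waste work:
    // emit the partially aggregated batches and let the downstream stage finish the job.
    val skip = firstPassReductionRatioEstimate > skipAggPassReductionRatio
    println(s"estimate=$firstPassReductionRatioEstimate skip=$skip") // estimate=0.95 skip=true
  }
}
```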
*/ - private def aggregateInputBatches(): Long = { - var rowsAfter = 0L - // cache everything in the first pass - while (firstPassIter.hasNext) { - val batch = firstPassIter.next() - rowsAfter += batch.numRows() - aggregatedBatches.add(batch) - } - rowsAfter - } + private lazy val concatAndMergeHelper = + new AggHelper(inputAttributes, groupingExpressions, aggregateExpressions, + forceMerge = true, conf, isSorted = false) + + private case class ConcatIterator( + input: CloseableBufferedIterator[SpillableColumnarBatch], + targetSize: Long) + extends Iterator[ColumnarBatch] { + + override def hasNext: Boolean = input.hasNext + + override def next(): ColumnarBatch = { + // combine all the data into a single batch + val spillCbs = ArrayBuffer[SpillableColumnarBatch]() + var totalBytes = 0L + closeOnExcept(spillCbs) { _ => + while (input.hasNext && (spillCbs.isEmpty || + (totalBytes + input.head.sizeInBytes) < targetSize)) { + val tmp = input.next + totalBytes += tmp.sizeInBytes + spillCbs += tmp + } - /** - * Attempt to merge adjacent batches in the aggregatedBatches queue until either there is only - * one batch or merging adjacent batches would exceed the target batch size. - */ - private def tryMergeAggregatedBatches(): Unit = { - while (aggregatedBatches.size() > 1) { - val concatTime = metrics.concatTime - val opTime = metrics.opTime - withResource(new NvtxWithMetrics("agg merge pass", NvtxColor.BLUE, concatTime, - opTime)) { _ => - // continue merging as long as some batches are able to be combined - if (!mergePass()) { - if (aggregatedBatches.size() > 1 && isReductionOnly) { - // We were unable to merge the aggregated batches within the target batch size limit, - // which means normally we would fallback to a sort-based approach. However for - // reduction-only aggregation there are no keys to use for a sort. The only way this - // can work is if all batches are merged. This will exceed the target batch size limit, - // but at this point it is either risk an OOM/cudf error and potentially work or - // not work at all. - logWarning(s"Unable to merge reduction-only aggregated batches within " + - s"target batch limit of $targetMergeBatchSize, attempting to merge remaining " + - s"${aggregatedBatches.size()} batches beyond limit") - withResource(mutable.ArrayBuffer[SpillableColumnarBatch]()) { batchesToConcat => - aggregatedBatches.forEach(b => batchesToConcat += b) - aggregatedBatches.clear() - val batch = concatenateAndMerge(batchesToConcat) - // batch does not need to be marked spillable since it is the last and only batch - // and will be immediately retrieved on the next() call. - aggregatedBatches.add(batch) - } - } - return + val concat = GpuAggregateIterator.concatenateBatches(metrics, spillCbs.toSeq) + withResource(concat) { _ => + concat.getColumnarBatch() } } } } - /** - * Perform a single pass over the aggregated batches attempting to merge adjacent batches. - * @return true if at least one merge operation occurred - */ - private def mergePass(): Boolean = { - val batchesToConcat: mutable.ArrayBuffer[SpillableColumnarBatch] = mutable.ArrayBuffer.empty - var wasBatchMerged = false - // Current size in bytes of the batches targeted for the next concatenation - var concatSize: Long = 0L - var batchesLeftInPass = aggregatedBatches.size() - - while (batchesLeftInPass > 0) { - closeOnExcept(batchesToConcat) { _ => - var isConcatSearchFinished = false - // Old batches are picked up at the front of the queue and freshly merged batches are - // appended to the back of the queue. 
Although tempting to allow the pass to "wrap around" - // and pick up batches freshly merged in this pass, it's avoided to prevent changing the - // order of aggregated batches. - while (batchesLeftInPass > 0 && !isConcatSearchFinished) { - val candidate = aggregatedBatches.getFirst - val potentialSize = concatSize + candidate.sizeInBytes - isConcatSearchFinished = concatSize > 0 && potentialSize > targetMergeBatchSize - if (!isConcatSearchFinished) { - batchesLeftInPass -= 1 - batchesToConcat += aggregatedBatches.removeFirst() - concatSize = potentialSize - } - } - } + private case class RepartitionAggregateIterator(opTime: GpuMetric) + extends Iterator[SpillableColumnarBatch] { - val mergedBatch = if (batchesToConcat.length > 1) { - wasBatchMerged = true - concatenateAndMerge(batchesToConcat) - } else { - // Unable to find a neighboring buffer to produce a valid merge in this pass, - // so simply put this buffer back on the queue for other passes. - batchesToConcat.remove(0) - } + batchesByBucket = batchesByBucket.filter(_.size() > 0) - // Add the merged batch to the end of the aggregated batch queue. Only a single pass over - // the batches is being performed due to the batch count check above, so the single-pass - // loop will terminate before picking up this new batch. - aggregatedBatches.addLast(mergedBatch) - batchesToConcat.clear() - concatSize = 0 - } + override def hasNext: Boolean = batchesByBucket.nonEmpty - wasBatchMerged - } + override def next(): SpillableColumnarBatch = { + withResource(new NvtxWithMetrics("RepartitionAggregateIterator.next", + NvtxColor.BLUE, opTime)) { _ => - private lazy val concatAndMergeHelper = - new AggHelper(inputAttributes, groupingExpressions, aggregateExpressions, - forceMerge = true, conf = conf) - - /** - * Concatenate batches together and perform a merge aggregation on the result. The input batches - * will be closed as part of this operation. - * @param batches batches to concatenate and merge aggregate - * @return lazy spillable batch which has NOT been marked spillable - */ - private def concatenateAndMerge( - batches: mutable.ArrayBuffer[SpillableColumnarBatch]): SpillableColumnarBatch = { - // TODO: concatenateAndMerge (and calling code) could output a sequence - // of batches for the partial aggregate case. This would be done in case - // a retry failed a certain number of times. - val concatBatch = withResource(batches) { _ => - val concatSpillable = concatenateBatches(metrics, batches.toSeq) - withResource(concatSpillable) { _.getColumnarBatch() } - } - computeAggregateAndClose(metrics, concatBatch, concatAndMergeHelper) - } - - /** Build an iterator that uses a sort-based approach to merge aggregated batches together. 
*/ - private def buildSortFallbackIterator(): Iterator[ColumnarBatch] = { - logInfo(s"Falling back to sort-based aggregation with ${aggregatedBatches.size()} batches") - metrics.numTasksFallBacked += 1 - val aggregatedBatchIter = new Iterator[ColumnarBatch] { - override def hasNext: Boolean = !aggregatedBatches.isEmpty + if (batchesByBucket.last.size() == 1) { + batchesByBucket.remove(batchesByBucket.size - 1).removeLast() + } else { + // put as many buckets as possible together to aggregate, to reduce agg times + closeOnExcept(new ArrayBuffer[AutoClosableArrayBuffer[SpillableColumnarBatch]]) { + toAggregateBuckets => + var currentSize = 0L + while (batchesByBucket.nonEmpty && + batchesByBucket.last.size() + currentSize < targetMergeBatchSize) { + val bucket = batchesByBucket.remove(batchesByBucket.size - 1) + currentSize += bucket.map(_.sizeInBytes).sum + toAggregateBuckets += bucket + } - override def next(): ColumnarBatch = { - withResource(aggregatedBatches.removeFirst()) { spillable => - spillable.getColumnarBatch() + AggregateUtils.concatenateAndMerge( + toAggregateBuckets.flatMap(_.data), metrics, concatAndMergeHelper) + } } } } + } - if (isReductionOnly) { - // Normally this should never happen because `tryMergeAggregatedBatches` should have done - // a last-ditch effort to concatenate all batches together regardless of target limits. - throw new IllegalStateException("Unable to fallback to sort-based aggregation " + - "without grouping keys") - } - - val groupingAttributes = groupingExpressions.map(_.toAttribute) - val ordering = groupingAttributes.map(SortOrder(_, Ascending, NullsFirst, Seq.empty)) - val aggBufferAttributes = groupingAttributes ++ - aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) - val sorter = new GpuSorter(ordering, aggBufferAttributes) - val aggBatchTypes = aggBufferAttributes.map(_.dataType) - // Use the out of core sort iterator to sort the batches by grouping key - outOfCoreIter = Some(GpuOutOfCoreSortIterator( - aggregatedBatchIter, - sorter, - configuredTargetBatchSize, - opTime = metrics.opTime, - sortTime = metrics.sortTime, - outputBatches = NoopMetric, - outputRows = NoopMetric)) - - // The out of core sort iterator does not guarantee that a batch contains all of the values - // for a particular key, so add a key batching iterator to enforce this. That allows each batch - // to be merge-aggregated safely since all values associated with a particular key are - // guaranteed to be in the same batch. - val keyBatchingIter = new GpuKeyBatchingIterator( - outOfCoreIter.get, - sorter, - aggBatchTypes.toArray, - configuredTargetBatchSize, - numInputRows = NoopMetric, - numInputBatches = NoopMetric, - numOutputRows = NoopMetric, - numOutputBatches = NoopMetric, - concatTime = metrics.concatTime, - opTime = metrics.opTime) - - // Finally wrap the key batching iterator with a merge aggregation on the output batches. - new Iterator[ColumnarBatch] { - override def hasNext: Boolean = keyBatchingIter.hasNext - - private val mergeSortedHelper = - new AggHelper(inputAttributes, groupingExpressions, aggregateExpressions, - forceMerge = true, conf, isSorted = true) - - override def next(): ColumnarBatch = { - // batches coming out of the sort need to be merged - val resultSpillable = - computeAggregateAndClose(metrics, keyBatchingIter.next(), mergeSortedHelper) - withResource(resultSpillable) { _ => - resultSpillable.getColumnarBatch() - } - } - } + /** Build an iterator merging aggregated batches in each bucket. 
*/ + private def buildBucketIterator(): Iterator[SpillableColumnarBatch] = { + bucketIter = Some(RepartitionAggregateIterator(opTime = metrics.opTime)) + bucketIter.get } + /** * Generates the result of a reduction-only aggregation on empty input by emitting the * initial value of each aggregator. @@ -1117,13 +1183,13 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( ) if (arrayWithStructsGroupings) { willNotWorkOnGpu("ArrayTypes with Struct children in grouping expressions are not " + - "supported") + "supported") } tagForReplaceMode() if (agg.aggregateExpressions.exists(expr => expr.isDistinct) - && agg.aggregateExpressions.exists(expr => expr.filter.isDefined)) { + && agg.aggregateExpressions.exists(expr => expr.filter.isDefined)) { // Distinct with Filter is not supported on the GPU currently, // This makes sure that if we end up here, the plan falls back to the CPU // which will do the right thing. @@ -1195,15 +1261,15 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( // (due to First). Fall back to CPU in this case. if (AggregateUtils.shouldFallbackMultiDistinct(agg.aggregateExpressions)) { willNotWorkOnGpu("Aggregates of non-distinct functions with multiple distinct " + - "functions are non-deterministic for non-distinct functions as it is " + - "computed using First.") + "functions are non-deterministic for non-distinct functions as it is " + + "computed using First.") } } } if (!conf.partialMergeDistinctEnabled && aggPattern.contains(PartialMerge)) { willNotWorkOnGpu("Replacing Partial Merge aggregates disabled. " + - s"Set ${conf.partialMergeDistinctEnabled} to true if desired") + s"Set ${conf.partialMergeDistinctEnabled} to true if desired") } } @@ -1256,11 +1322,11 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( // This is a short term heuristic until we can better understand the cost // of sort vs the cost of doing aggregations so we can better decide. 
lazy val hasSingleBasicGroupingKey = agg.groupingExpressions.length == 1 && - agg.groupingExpressions.headOption.map(_.dataType).exists { - case StringType | BooleanType | ByteType | ShortType | IntegerType | - LongType | _: DecimalType | DateType | TimestampType => true - case _ => false - } + agg.groupingExpressions.headOption.map(_.dataType).exists { + case StringType | BooleanType | ByteType | ShortType | IntegerType | + LongType | _: DecimalType | DateType | TimestampType => true + case _ => false + } val gpuChild = childPlans.head.convertIfNeeded() val gpuAggregateExpressions = @@ -1314,11 +1380,11 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( } val allowSinglePassAgg = (conf.forceSinglePassPartialSortAgg || - (conf.allowSinglePassPartialSortAgg && - hasSingleBasicGroupingKey && - estimatedPreProcessGrowth > 1.1)) && - canUsePartialSortAgg && - groupingCanBeSorted + (conf.allowSinglePassPartialSortAgg && + hasSingleBasicGroupingKey && + estimatedPreProcessGrowth > 1.1)) && + canUsePartialSortAgg && + groupingCanBeSorted GpuHashAggregateExec( aggRequiredChildDistributionExpressions, @@ -1332,7 +1398,8 @@ abstract class GpuBaseAggregateMeta[INPUT <: SparkPlan]( conf.forceSinglePassPartialSortAgg, allowSinglePassAgg, allowNonFullyAggregatedOutput, - conf.skipAggPassReductionRatio) + conf.skipAggPassReductionRatio + ) } } @@ -1351,7 +1418,7 @@ abstract class GpuTypedImperativeSupportedAggregateExecMeta[INPUT <: BaseAggrega private val mayNeedAggBufferConversion: Boolean = agg.aggregateExpressions.exists { expr => expr.aggregateFunction.isInstanceOf[TypedImperativeAggregate[_]] && - (expr.mode == Partial || expr.mode == PartialMerge) + (expr.mode == Partial || expr.mode == PartialMerge) } // overriding data types of Aggregation Buffers if necessary @@ -1420,6 +1487,7 @@ abstract class GpuTypedImperativeSupportedAggregateExecMeta[INPUT <: BaseAggrega allowSinglePassAgg = false, allowNonFullyAggregatedOutput = false, 1) + } else { super.convertToGpu() } @@ -1523,8 +1591,8 @@ object GpuTypedImperativeSupportedAggregateExecMeta { // [A]. there will be a R2C or C2R transition between them // [B]. there exists TypedImperativeAggregate functions in each of them (stages(i).canThisBeReplaced ^ stages(i + 1).canThisBeReplaced) && - containTypedImperativeAggregate(stages(i)) && - containTypedImperativeAggregate(stages(i + 1)) + containTypedImperativeAggregate(stages(i)) && + containTypedImperativeAggregate(stages(i + 1)) } // Return if all internal aggregation buffers are compatible with GPU Overrides. 
@@ -1602,10 +1670,10 @@ object GpuTypedImperativeSupportedAggregateExecMeta { fromCpuToGpu: Boolean): Seq[NamedExpression] = { val converters = mutable.Queue[Either[ - CpuToGpuAggregateBufferConverter, GpuToCpuAggregateBufferConverter]]() + CpuToGpuAggregateBufferConverter, GpuToCpuAggregateBufferConverter]]() mergeAggMeta.childExprs.foreach { case e if e.childExprs.length == 1 && - e.childExprs.head.isInstanceOf[TypedImperativeAggExprMeta[_]] => + e.childExprs.head.isInstanceOf[TypedImperativeAggExprMeta[_]] => e.wrapped.asInstanceOf[AggregateExpression].mode match { case Final | PartialMerge => val typImpAggMeta = e.childExprs.head.asInstanceOf[TypedImperativeAggExprMeta[_]] @@ -1660,16 +1728,16 @@ class GpuHashAggregateMeta( conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]], rule: DataFromReplacementRule) - extends GpuBaseAggregateMeta(agg, agg.requiredChildDistributionExpressions, - conf, parent, rule) + extends GpuBaseAggregateMeta(agg, agg.requiredChildDistributionExpressions, + conf, parent, rule) class GpuSortAggregateExecMeta( override val agg: SortAggregateExec, conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]], rule: DataFromReplacementRule) - extends GpuTypedImperativeSupportedAggregateExecMeta(agg, - agg.requiredChildDistributionExpressions, conf, parent, rule) { + extends GpuTypedImperativeSupportedAggregateExecMeta(agg, + agg.requiredChildDistributionExpressions, conf, parent, rule) { override def tagPlanForGpu(): Unit = { super.tagPlanForGpu() @@ -1716,14 +1784,14 @@ class GpuObjectHashAggregateExecMeta( conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]], rule: DataFromReplacementRule) - extends GpuTypedImperativeSupportedAggregateExecMeta(agg, - agg.requiredChildDistributionExpressions, conf, parent, rule) + extends GpuTypedImperativeSupportedAggregateExecMeta(agg, + agg.requiredChildDistributionExpressions, conf, parent, rule) object GpuHashAggregateExecBase { def calcInputAttributes(aggregateExpressions: Seq[GpuAggregateExpression], - childOutput: Seq[Attribute], - inputAggBufferAttributes: Seq[Attribute]): Seq[Attribute] = { + childOutput: Seq[Attribute], + inputAggBufferAttributes: Seq[Attribute]): Seq[Attribute] = { val modes = aggregateExpressions.map(_.mode).distinct if (modes.contains(Final) || modes.contains(PartialMerge)) { // SPARK-31620: when planning aggregates, the partial aggregate uses aggregate function's @@ -1754,7 +1822,7 @@ object GpuHashAggregateExecBase { } /** - * The GPU version of SortAggregateExec that is intended for partial aggregations that are not + * The GPU version of AggregateExec that is intended for partial aggregations that are not * reductions and so it sorts the input data ahead of time to do it in a single pass. * * @param requiredChildDistributionExpressions this is unchanged by the GPU. 
It is used in @@ -1767,7 +1835,6 @@ object GpuHashAggregateExecBase { * node should project) * @param child incoming plan (where we get input columns from) * @param configuredTargetBatchSize user-configured maximum device memory size of a batch - * @param configuredTieredProjectEnabled configurable optimization to use tiered projections * @param allowNonFullyAggregatedOutput whether we can skip the third pass of aggregation * (can omit non fully aggregated data for non-final * stage of aggregation) @@ -1802,11 +1869,13 @@ case class GpuHashAggregateExec( protected override val outputRowsLevel: MetricsLevel = ESSENTIAL_LEVEL protected override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL override lazy val additionalMetrics: Map[String, GpuMetric] = Map( - NUM_TASKS_FALL_BACKED -> createMetric(MODERATE_LEVEL, DESCRIPTION_NUM_TASKS_FALL_BACKED), + NUM_TASKS_REPARTITIONED -> createMetric(MODERATE_LEVEL, DESCRIPTION_NUM_TASKS_REPARTITIONED), + NUM_TASKS_SKIPPED_AGG -> createMetric(MODERATE_LEVEL, DESCRIPTION_NUM_TASKS_SKIPPED_AGG), OP_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_OP_TIME), AGG_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_AGG_TIME), CONCAT_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_CONCAT_TIME), SORT_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_SORT_TIME), + REPARTITION_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_REPARTITION_TIME), "NUM_AGGS" -> createMetric(DEBUG_LEVEL, "num agg operations"), "NUM_PRE_SPLITS" -> createMetric(DEBUG_LEVEL, "num pre splits"), "NUM_TASKS_SINGLE_PASS" -> createMetric(MODERATE_LEVEL, "number of single pass tasks"), @@ -1833,11 +1902,13 @@ case class GpuHashAggregateExec( val aggMetrics = GpuHashAggregateMetrics( numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS), numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES), - numTasksFallBacked = gpuLongMetric(NUM_TASKS_FALL_BACKED), + numTasksRepartitioned = gpuLongMetric(NUM_TASKS_REPARTITIONED), + numTasksSkippedAgg = gpuLongMetric(NUM_TASKS_SKIPPED_AGG), opTime = gpuLongMetric(OP_TIME), computeAggTime = gpuLongMetric(AGG_TIME), concatTime = gpuLongMetric(CONCAT_TIME), sortTime = gpuLongMetric(SORT_TIME), + repartitionTime = gpuLongMetric(REPARTITION_TIME), numAggOps = gpuLongMetric("NUM_AGGS"), numPreSplits = gpuLongMetric("NUM_PRE_SPLITS"), singlePassTasks = gpuLongMetric("NUM_TASKS_SINGLE_PASS"), @@ -1867,11 +1938,12 @@ case class GpuHashAggregateExec( val postBoundReferences = GpuAggFinalPassIterator.setupReferences(groupingExprs, aggregateExprs, aggregateAttrs, resultExprs, modeInfo) - new DynamicGpuPartialSortAggregateIterator(cbIter, inputAttrs, groupingExprs, + new DynamicGpuPartialAggregateIterator(cbIter, inputAttrs, groupingExprs, boundGroupExprs, aggregateExprs, aggregateAttrs, resultExprs, modeInfo, localEstimatedPreProcessGrowth, alreadySorted, expectedOrdering, postBoundReferences, targetBatchSize, aggMetrics, conf, - localForcePre, localAllowPre, allowNonFullyAggregatedOutput, skipAggPassReductionRatio) + localForcePre, localAllowPre, allowNonFullyAggregatedOutput, skipAggPassReductionRatio + ) } } @@ -1914,8 +1986,8 @@ case class GpuHashAggregateExec( // Used in de-duping and optimizer rules override def producedAttributes: AttributeSet = AttributeSet(aggregateAttributes) ++ - AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++ - AttributeSet(aggregateBufferAttributes) + AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++ + AttributeSet(aggregateBufferAttributes) // 
AllTuples = distribution with a single partition and all tuples of the dataset are co-located. // Clustered = dataset with tuples co-located in the same partition if they share a specific value @@ -1938,7 +2010,7 @@ case class GpuHashAggregateExec( */ override lazy val allAttributes: AttributeSeq = child.output ++ aggregateBufferAttributes ++ aggregateAttributes ++ - aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) override def verboseString(maxFields: Int): String = toString(verbose = true, maxFields) @@ -1957,8 +2029,8 @@ case class GpuHashAggregateExec( s"""${loreArgs.mkString(", ")}""" } else { s"$nodeName (keys=$keyString, functions=$functionString)," + - s" filters=${aggregateExpressions.map(_.filter)})" + - s""" ${loreArgs.mkString(", ")}""" + s" filters=${aggregateExpressions.map(_.filter)})" + + s""" ${loreArgs.mkString(", ")}""" } } // @@ -1972,7 +2044,7 @@ case class GpuHashAggregateExec( } } -class DynamicGpuPartialSortAggregateIterator( +class DynamicGpuPartialAggregateIterator( cbIter: Iterator[ColumnarBatch], inputAttrs: Seq[Attribute], groupingExprs: Seq[NamedExpression], @@ -1999,7 +2071,7 @@ class DynamicGpuPartialSortAggregateIterator( // When doing a reduction we don't have the aggIter setup for the very first time // so we have to match what happens for the normal reduction operations. override def hasNext: Boolean = aggIter.map(_.hasNext) - .getOrElse(isReductionOnly || cbIter.hasNext) + .getOrElse(isReductionOnly || cbIter.hasNext) private[this] def estimateCardinality(cb: ColumnarBatch): Int = { withResource(boundGroupExprs.project(cb)) { groupingKeys => @@ -2052,7 +2124,8 @@ class DynamicGpuPartialSortAggregateIterator( inputAttrs.map(_.dataType).toArray, preProcessAggHelper.preStepBound, metrics.opTime, metrics.numPreSplits) - val firstPassIter = GpuAggFirstPassIterator(sortedSplitIter, preProcessAggHelper, metrics) + val firstPassIter = GpuAggFirstPassIterator(sortedSplitIter, preProcessAggHelper, + metrics) // Technically on a partial-agg, which this only works for, this last iterator should // be a noop except for some metrics. 
But for consistency between all of the @@ -2071,6 +2144,7 @@ class DynamicGpuPartialSortAggregateIterator( metrics.opTime, metrics.numPreSplits) val localInputRowsMetrics = new LocalGpuMetric + val firstPassIter = GpuAggFirstPassIterator( splitInputIter.map(cb => { localInputRowsMetrics += cb.numRows() @@ -2080,7 +2154,7 @@ class DynamicGpuPartialSortAggregateIterator( metrics) val mergeIter = new GpuMergeAggregateIterator( - firstPassIter, + new CloseableBufferedIterator(firstPassIter), inputAttrs, groupingExprs, aggregateExprs, @@ -2092,7 +2166,8 @@ class DynamicGpuPartialSortAggregateIterator( conf, allowNonFullyAggregatedOutput, skipAggPassReductionRatio, - localInputRowsMetrics) + localInputRowsMetrics + ) GpuAggFinalPassIterator.makeIter(mergeIter, postBoundReferences, metrics) } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExec.scala index 0ffead09de6..3d9b6285a91 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuExec.scala @@ -66,6 +66,7 @@ object GpuMetric extends Logging { val COLLECT_TIME = "collectTime" val CONCAT_TIME = "concatTime" val SORT_TIME = "sortTime" + val REPARTITION_TIME = "repartitionTime" val AGG_TIME = "computeAggTime" val JOIN_TIME = "joinTime" val FILTER_TIME = "filterTime" @@ -73,6 +74,8 @@ object GpuMetric extends Logging { val BUILD_TIME = "buildTime" val STREAM_TIME = "streamTime" val NUM_TASKS_FALL_BACKED = "numTasksFallBacked" + val NUM_TASKS_REPARTITIONED = "numTasksRepartitioned" + val NUM_TASKS_SKIPPED_AGG = "numTasksSkippedAgg" val READ_FS_TIME = "readFsTime" val WRITE_BUFFER_TIME = "writeBufferTime" val FILECACHE_FOOTER_HITS = "filecacheFooterHits" @@ -104,6 +107,7 @@ object GpuMetric extends Logging { val DESCRIPTION_COLLECT_TIME = "collect batch time" val DESCRIPTION_CONCAT_TIME = "concat batch time" val DESCRIPTION_SORT_TIME = "sort time" + val DESCRIPTION_REPARTITION_TIME = "repartition time" val DESCRIPTION_AGG_TIME = "aggregation time" val DESCRIPTION_JOIN_TIME = "join time" val DESCRIPTION_FILTER_TIME = "filter time" @@ -111,6 +115,8 @@ object GpuMetric extends Logging { val DESCRIPTION_BUILD_TIME = "build time" val DESCRIPTION_STREAM_TIME = "stream time" val DESCRIPTION_NUM_TASKS_FALL_BACKED = "number of sort fallback tasks" + val DESCRIPTION_NUM_TASKS_REPARTITIONED = "number of tasks repartitioned for agg" + val DESCRIPTION_NUM_TASKS_SKIPPED_AGG = "number of tasks skipped aggregation" val DESCRIPTION_READ_FS_TIME = "time to read fs data" val DESCRIPTION_WRITE_BUFFER_TIME = "time to write data to buffer" val DESCRIPTION_FILECACHE_FOOTER_HITS = "cached footer hits" diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/window/GpuUnboundedToUnboundedAggWindowExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/window/GpuUnboundedToUnboundedAggWindowExec.scala index d685efe68e0..7c5b55cd0bd 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/window/GpuUnboundedToUnboundedAggWindowExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/window/GpuUnboundedToUnboundedAggWindowExec.scala @@ -17,10 +17,9 @@ package com.nvidia.spark.rapids.window import scala.collection.mutable.{ArrayBuffer, ListBuffer} -import scala.reflect.ClassTag import ai.rapids.cudf -import com.nvidia.spark.rapids.{ConcatAndConsumeAll, GpuAlias, GpuBindReferences, GpuBoundReference, GpuColumnVector, GpuExpression, GpuLiteral, GpuMetric, GpuProjectExec, 
SpillableColumnarBatch, SpillPriorities} +import com.nvidia.spark.rapids.{AutoClosableArrayBuffer, ConcatAndConsumeAll, GpuAlias, GpuBindReferences, GpuBoundReference, GpuColumnVector, GpuExpression, GpuLiteral, GpuMetric, GpuProjectExec, SpillableColumnarBatch, SpillPriorities} import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.RapidsPluginImplicits.AutoCloseableProducingSeq import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRetry, withRetryNoSplit} @@ -36,32 +35,6 @@ import org.apache.spark.sql.rapids.aggregate.{CudfAggregate, GpuAggregateExpress import org.apache.spark.sql.types.{DataType, IntegerType, LongType} import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} - -/** - * Just a simple wrapper to make working with buffers of AutoClosable things play - * nicely with withResource. - */ -class AutoClosableArrayBuffer[T <: AutoCloseable]() extends AutoCloseable { - private val data = new ArrayBuffer[T]() - - def append(scb: T): Unit = data.append(scb) - - def last: T = data.last - - def removeLast(): T = data.remove(data.length - 1) - - def foreach[U](f: T => U): Unit = data.foreach(f) - - def toArray[B >: T : ClassTag]: Array[B] = data.toArray - - override def toString: String = s"AutoCloseable(${super.toString})" - - override def close(): Unit = { - data.foreach(_.close()) - data.clear() - } -} - /** * Utilities for conversion between SpillableColumnarBatch, ColumnarBatch, and cudf.Table. */ From 82c26f1de2cbdb13fa0d9e041baa4b738ca85d5e Mon Sep 17 00:00:00 2001 From: knoguchi22 Date: Tue, 26 Nov 2024 13:53:03 -0500 Subject: [PATCH 10/37] Append knoguchi22 to blossom-ci whitelist [skip ci] (#11777) * Append knoguchi to blossom-ci whitelist [skip ci] * Fixing the typo in username. 
Signed-off-by: Koji Noguchi --------- Signed-off-by: Koji Noguchi --- .github/workflows/blossom-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 93557017b08..1d7b0ab8e0b 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -77,7 +77,8 @@ jobs: github.actor == 'Feng-Jiang28' || github.actor == 'SurajAralihalli' || github.actor == 'jihoonson' || - github.actor == 'ustcfy' + github.actor == 'ustcfy' || + github.actor == 'knoguchi22' ) steps: - name: Check if comment is issued by authorized person From ff0ca0f4d52b197a644c332b1ffcbe9c0351fb1f Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 26 Nov 2024 15:07:05 -0600 Subject: [PATCH 11/37] Ability to decompress snappy and zstd Parquet files via CPU [databricks] (#11752) * Ability to decompress Parquet data on CPU Signed-off-by: Jason Lowe * Add tests * Refactor to reduce duplicated code * scala2.13 fix * Address review comments * Fix Databricks build * Update scala2.13 poms --------- Signed-off-by: Jason Lowe --- .../src/main/python/parquet_test.py | 9 +- jenkins/databricks/install_deps.py | 2 + scala2.13/shim-deps/databricks/pom.xml | 6 + shim-deps/databricks/pom.xml | 6 + .../iceberg/parquet/GpuParquetReader.java | 2 + .../spark/source/GpuMultiFileBatchReader.java | 6 +- .../nvidia/spark/rapids/GpuParquetScan.scala | 376 ++++++++++++++++-- .../spark/rapids/HostMemoryStreams.scala | 12 + .../com/nvidia/spark/rapids/RapidsConf.scala | 31 ++ 9 files changed, 406 insertions(+), 44 deletions(-) diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index a223d6559ed..6aa234003ba 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -299,12 +299,19 @@ def test_parquet_read_round_trip_binary_as_string(std_input_path, read_func, rea @pytest.mark.parametrize('compress', parquet_compress_options) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, reader_confs): +@pytest.mark.parametrize('cpu_decompress', [True, False]) +def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, reader_confs, cpu_decompress): data_path = spark_tmp_path + '/PARQUET_DATA' with_cpu_session( lambda spark : binary_op_df(spark, long_gen).write.parquet(data_path), conf={'spark.sql.parquet.compression.codec': compress}) all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + if cpu_decompress: + all_confs = copy_and_update(all_confs, { + 'spark.rapids.sql.format.parquet.decompressCpu' : 'true', + 'spark.rapids.sql.format.parquet.decompressCpu.snappy' : 'true', + 'spark.rapids.sql.format.parquet.decompressCpu.zstd' : 'true' + }) assert_gpu_and_cpu_are_equal_collect( lambda spark : spark.read.parquet(data_path), conf=all_confs) diff --git a/jenkins/databricks/install_deps.py b/jenkins/databricks/install_deps.py index 11e2162957e..23453912827 100644 --- a/jenkins/databricks/install_deps.py +++ b/jenkins/databricks/install_deps.py @@ -135,6 +135,8 @@ def define_deps(spark_version, scala_version): f'{prefix_ws_sp_mvn_hadoop}--org.apache.avro--avro-mapred--org.apache.avro__avro-mapred__*.jar'), Artifact('org.apache.avro', 'avro', 
f'{prefix_ws_sp_mvn_hadoop}--org.apache.avro--avro--org.apache.avro__avro__*.jar'), + Artifact('com.github.luben', 'zstd-jni', + f'{prefix_ws_sp_mvn_hadoop}--com.github.luben--zstd-jni--com.github.luben__zstd-jni__*.jar'), ] # Parquet diff --git a/scala2.13/shim-deps/databricks/pom.xml b/scala2.13/shim-deps/databricks/pom.xml index 9d6ff787ef1..484e2896f61 100644 --- a/scala2.13/shim-deps/databricks/pom.xml +++ b/scala2.13/shim-deps/databricks/pom.xml @@ -231,6 +231,12 @@ ${spark.version} compile
+ + com.github.luben + zstd-jni + ${spark.version} + compile + org.apache.arrow arrow-format diff --git a/shim-deps/databricks/pom.xml b/shim-deps/databricks/pom.xml index edfa3d6f896..5f36e529aa7 100644 --- a/shim-deps/databricks/pom.xml +++ b/shim-deps/databricks/pom.xml @@ -231,6 +231,12 @@ ${spark.version} compile + + com.github.luben + zstd-jni + ${spark.version} + compile + org.apache.arrow arrow-format diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/parquet/GpuParquetReader.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/parquet/GpuParquetReader.java index 47b649af6ed..c61f7c6b6f7 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/parquet/GpuParquetReader.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/parquet/GpuParquetReader.java @@ -25,6 +25,7 @@ import scala.collection.Seq; +import com.nvidia.spark.rapids.CpuCompressionConfig$; import com.nvidia.spark.rapids.DateTimeRebaseCorrected$; import com.nvidia.spark.rapids.GpuMetric; import com.nvidia.spark.rapids.GpuParquetUtils; @@ -144,6 +145,7 @@ public org.apache.iceberg.io.CloseableIterator iterator() { partReaderSparkSchema, debugDumpPrefix, debugDumpAlways, maxBatchSizeRows, maxBatchSizeBytes, targetBatchSizeBytes, useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + CpuCompressionConfig$.MODULE$.disabled(), metrics, DateTimeRebaseCorrected$.MODULE$, // dateRebaseMode DateTimeRebaseCorrected$.MODULE$, // timestampRebaseMode diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java index 9c36fe76020..b32e5e755cb 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/iceberg/spark/source/GpuMultiFileBatchReader.java @@ -352,7 +352,8 @@ protected FilePartitionReaderBase createRapidsReader(PartitionedFile[] pFiles, return new MultiFileCloudParquetPartitionReader(conf, pFiles, this::filterParquetBlocks, caseSensitive, parquetDebugDumpPrefix, parquetDebugDumpAlways, maxBatchSizeRows, maxBatchSizeBytes, targetBatchSizeBytes, maxGpuColumnSizeBytes, - useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, metrics, partitionSchema, + useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + CpuCompressionConfig$.MODULE$.disabled(), metrics, partitionSchema, numThreads, maxNumFileProcessed, false, // ignoreMissingFiles false, // ignoreCorruptFiles @@ -411,7 +412,7 @@ protected FilePartitionReaderBase createRapidsReader(PartitionedFile[] pFiles, JavaConverters.asJavaCollection(filteredInfo.parquetBlockMeta.blocks()).stream() .map(b -> ParquetSingleDataBlockMeta.apply( filteredInfo.parquetBlockMeta.filePath(), - ParquetDataBlock.apply(b), + ParquetDataBlock.apply(b, CpuCompressionConfig$.MODULE$.disabled()), InternalRow.empty(), ParquetSchemaWrapper.apply(filteredInfo.parquetBlockMeta.schema()), filteredInfo.parquetBlockMeta.readSchema(), @@ -431,6 +432,7 @@ protected FilePartitionReaderBase createRapidsReader(PartitionedFile[] pFiles, caseSensitive, parquetDebugDumpPrefix, parquetDebugDumpAlways, maxBatchSizeRows, maxBatchSizeBytes, targetBatchSizeBytes, maxGpuColumnSizeBytes, useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + CpuCompressionConfig$.MODULE$.disabled(), metrics, partitionSchema, numThreads, false, // ignoreMissingFiles false, // ignoreCorruptFiles diff --git 
a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala index e38dab50d72..03eb48de6fb 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala @@ -16,7 +16,7 @@ package com.nvidia.spark.rapids -import java.io.{Closeable, EOFException, FileNotFoundException, IOException, OutputStream} +import java.io.{Closeable, EOFException, FileNotFoundException, InputStream, IOException, OutputStream} import java.net.URI import java.nio.ByteBuffer import java.nio.channels.SeekableByteChannel @@ -31,6 +31,7 @@ import scala.collection.mutable.ArrayBuffer import scala.language.implicitConversions import ai.rapids.cudf._ +import com.github.luben.zstd.ZstdDecompressCtx import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource} import com.nvidia.spark.rapids.GpuMetric._ import com.nvidia.spark.rapids.ParquetPartitionReader.{CopyRange, LocalCopy} @@ -47,6 +48,7 @@ import org.apache.parquet.bytes.BytesUtils import org.apache.parquet.bytes.BytesUtils.readIntLittleEndian import org.apache.parquet.column.ColumnDescriptor import org.apache.parquet.filter2.predicate.FilterApi +import org.apache.parquet.format.Util import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat} import org.apache.parquet.hadoop.ParquetFileWriter.MAGIC @@ -54,6 +56,7 @@ import org.apache.parquet.hadoop.metadata._ import org.apache.parquet.io.{InputFile, SeekableInputStream} import org.apache.parquet.schema.{DecimalMetadata, GroupType, MessageType, OriginalType, PrimitiveType, Type} import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName +import org.xerial.snappy.Snappy import org.apache.spark.TaskContext import org.apache.spark.broadcast.Broadcast @@ -1106,6 +1109,7 @@ case class GpuParquetMultiFilePartitionReaderFactory( }.getOrElse(rapidsConf.getMultithreadedReaderKeepOrder) private val alluxioReplacementTaskTime = AlluxioCfgUtils.enabledAlluxioReplacementAlgoTaskTime(rapidsConf) + private val compressCfg = CpuCompressionConfig.forParquet(rapidsConf) // We can't use the coalescing files reader when InputFileName, InputFileBlockStart, // or InputFileBlockLength because we are combining all the files into a single buffer @@ -1137,7 +1141,7 @@ case class GpuParquetMultiFilePartitionReaderFactory( new MultiFileCloudParquetPartitionReader(conf, files, filterFunc, isCaseSensitive, debugDumpPrefix, debugDumpAlways, maxReadBatchSizeRows, maxReadBatchSizeBytes, targetBatchSizeBytes, maxGpuColumnSizeBytes, - useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, compressCfg, metrics, partitionSchema, numThreads, maxNumFileProcessed, ignoreMissingFiles, ignoreCorruptFiles, readUseFieldId, alluxioPathReplacementMap.getOrElse(Map.empty), alluxioReplacementTaskTime, queryUsesInputFile, keepReadsInOrderFromConf, combineConf) @@ -1244,7 +1248,7 @@ case class GpuParquetMultiFilePartitionReaderFactory( clippedBlocks ++= singleFileInfo.blocks.map(block => ParquetSingleDataBlockMeta( singleFileInfo.filePath, - ParquetDataBlock(block), + ParquetDataBlock(block, compressCfg), metaAndFile.file.partitionValues, ParquetSchemaWrapper(singleFileInfo.schema), singleFileInfo.readSchema, @@ -1262,7 +1266,7 @@ case class GpuParquetMultiFilePartitionReaderFactory( new MultiFileParquetPartitionReader(conf, files, 
clippedBlocks.toSeq, isCaseSensitive, debugDumpPrefix, debugDumpAlways, maxReadBatchSizeRows, maxReadBatchSizeBytes, targetBatchSizeBytes, maxGpuColumnSizeBytes, - useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, compressCfg, metrics, partitionSchema, numThreads, ignoreMissingFiles, ignoreCorruptFiles, readUseFieldId) } @@ -1307,6 +1311,7 @@ case class GpuParquetPartitionReaderFactory( private val readUseFieldId = ParquetSchemaClipShims.useFieldId(sqlConf) private val footerReadType = GpuParquetScan.footerReaderHeuristic( rapidsConf.parquetReaderFooterType, dataSchema, readDataSchema, readUseFieldId) + private val compressCfg = CpuCompressionConfig.forParquet(rapidsConf) override def supportColumnarReads(partition: InputPartition): Boolean = true @@ -1335,12 +1340,29 @@ case class GpuParquetPartitionReaderFactory( new ParquetPartitionReader(conf, file, singleFileInfo.filePath, singleFileInfo.blocks, singleFileInfo.schema, isCaseSensitive, readDataSchema, debugDumpPrefix, debugDumpAlways, maxReadBatchSizeRows, maxReadBatchSizeBytes, targetSizeBytes, - useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, + useChunkedReader, maxChunkedReaderMemoryUsageSizeBytes, compressCfg, metrics, singleFileInfo.dateRebaseMode, singleFileInfo.timestampRebaseMode, singleFileInfo.hasInt96Timestamps, readUseFieldId) } } +case class CpuCompressionConfig( + decompressSnappyCpu: Boolean, + decompressZstdCpu: Boolean) { + val decompressAnyCpu: Boolean = decompressSnappyCpu || decompressZstdCpu +} + +object CpuCompressionConfig { + def forParquet(conf: RapidsConf): CpuCompressionConfig = { + val cpuEnable = conf.parquetDecompressCpu + CpuCompressionConfig( + decompressSnappyCpu = cpuEnable && conf.parquetDecompressCpuSnappy, + decompressZstdCpu = cpuEnable && conf.parquetDecompressCpuZstd) + } + + def disabled(): CpuCompressionConfig = CpuCompressionConfig(false, false) +} + trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics with MultiFileReaderFunctions { // the size of Parquet magic (at start+end) and footer length values @@ -1353,6 +1375,8 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics def isSchemaCaseSensitive: Boolean + def compressCfg: CpuCompressionConfig + val copyBufferSize = conf.getInt("parquet.read.allocation.size", 8 * 1024 * 1024) def checkIfNeedToSplitBlocks(currentDateRebaseMode: DateTimeRebaseMode, @@ -1418,13 +1442,8 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics schema: MessageType, handleCoalesceFiles: Boolean): Long = { // start with the size of Parquet magic (at start+end) and footer length values - var size: Long = PARQUET_META_SIZE - - // Calculate the total amount of column data that will be copied - // NOTE: Avoid using block.getTotalByteSize here as that is the - // uncompressed size rather than the size in the file. 
- size += currentChunkedBlocks.flatMap(_.getColumns.asScala.map(_.getTotalSize)).sum - + val headerSize: Long = PARQUET_META_SIZE + val blocksSize = ParquetPartitionReader.computeOutputSize(currentChunkedBlocks, compressCfg) val footerSize = calculateParquetFooterSize(currentChunkedBlocks, schema) val extraMemory = if (handleCoalesceFiles) { val numCols = currentChunkedBlocks.head.getColumns().size() @@ -1432,8 +1451,7 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics } else { 0 } - val totalSize = size + footerSize + extraMemory - totalSize + headerSize + blocksSize + footerSize + extraMemory } protected def writeFooter( @@ -1532,7 +1550,7 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics * metadata but with the file offsets updated to reflect the new position of the column data * as written to the output. * - * @param in the input stream for the original Parquet file + * @param filePath the path to the Parquet file * @param out the output stream to receive the data * @param blocks block metadata from the original file that will appear in the computed file * @param realStartOffset starting file offset of the first block @@ -1575,6 +1593,258 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics computeBlockMetaData(blocks, realStartOffset) } + private class BufferedFileInput( + filePath: Path, + blocks: Seq[BlockMetaData], + metrics: Map[String, GpuMetric]) extends InputStream { + private[this] val in = filePath.getFileSystem(conf).open(filePath) + private[this] val buffer: Array[Byte] = new Array[Byte](copyBufferSize) + private[this] var bufferSize: Int = 0 + private[this] var bufferFilePos: Long = in.getPos + private[this] var bufferPos: Int = 0 + private[this] val columnIter = blocks.flatMap(_.getColumns.asScala).iterator + private[this] var currentColumn: Option[ColumnChunkMetaData] = None + private[this] val readTime: GpuMetric = metrics.getOrElse(READ_FS_TIME, NoopMetric) + + override def read(): Int = { + while (bufferPos == bufferSize) { + fillBuffer() + } + val result = buffer(bufferPos) + bufferPos += 1 + result + } + + override def read(b: Array[Byte]): Int = read(b, 0, b.length) + + override def read(dest: Array[Byte], off: Int, len: Int): Int = { + var bytesLeft = len + while (bytesLeft > 0) { + if (bufferPos == bufferSize) { + fillBuffer() + } + val numBytes = Math.min(bytesLeft, bufferSize - bufferPos) + System.arraycopy(buffer, bufferPos, dest, off + len - bytesLeft, numBytes) + bufferPos += numBytes + bytesLeft -= numBytes + } + len + } + + def read(out: HostMemoryOutputStream, len: Long): Unit = { + var bytesLeft = len + while (bytesLeft > 0) { + if (bufferPos == bufferSize) { + fillBuffer() + } + // downcast is safe because bufferSize is an int + val numBytes = Math.min(bytesLeft, bufferSize - bufferPos).toInt + out.write(buffer, bufferPos, numBytes) + bufferPos += numBytes + bytesLeft -= numBytes + } + } + + def read(out: HostMemoryBuffer, len: Long): Unit = { + var bytesLeft = len + while (bytesLeft > 0) { + if (bufferPos == bufferSize) { + fillBuffer() + } + // downcast is safe because bufferSize is an int + val numBytes = Math.min(bytesLeft, bufferSize - bufferPos).toInt + out.setBytes(len - bytesLeft, buffer, bufferPos, numBytes) + bufferPos += numBytes + bytesLeft -= numBytes + } + } + + override def skip(n: Long): Long = { + seek(getPos + n) + n + } + + def getPos: Long = bufferFilePos + bufferPos + + def seek(desiredPos: Long): Unit = { + require(desiredPos >= getPos, "Only supports seeking 
forward") + val posDiff = desiredPos - bufferFilePos + if (posDiff >= 0 && posDiff < bufferSize) { + bufferPos = posDiff.toInt + } else { + in.seek(desiredPos) + bufferFilePos = desiredPos + bufferSize = 0 + bufferPos = 0 + } + } + + override def close(): Unit = { + readTime.ns { + in.close() + } + } + + private def fillBuffer(): Unit = { + // TODO: Add FileCache support https://github.com/NVIDIA/spark-rapids/issues/11775 + var bytesToCopy = currentColumn.map { c => + Math.max(0, c.getStartingPos + c.getTotalSize - getPos) + }.getOrElse(0L) + var done = bytesToCopy >= buffer.length + while (!done && columnIter.hasNext) { + val column = columnIter.next() + currentColumn = Some(column) + done = if (getPos + bytesToCopy == column.getStartingPos) { + bytesToCopy += column.getTotalSize + bytesToCopy >= buffer.length + } else { + true + } + } + if (bytesToCopy <= 0) { + throw new EOFException("read beyond column data range") + } + bufferFilePos = in.getPos + bufferPos = 0 + bufferSize = Math.min(bytesToCopy, buffer.length).toInt + readTime.ns { + in.readFully(buffer, 0, bufferSize) + } + } + } + + /** + * Copies the data corresponding to the clipped blocks in the original file and compute the + * block metadata for the output. The output blocks will contain the same column chunk + * metadata but with the file offsets updated to reflect the new position of the column data + * as written to the output. + * + * @param filePath the path to the Parquet file + * @param out the output stream to receive the data + * @param blocks block metadata from the original file that will appear in the computed file + * @param realStartOffset starting file offset of the first block + * @return updated block metadata corresponding to the output + */ + protected def copyAndUncompressBlocksData( + filePath: Path, + out: HostMemoryOutputStream, + blocks: Seq[BlockMetaData], + realStartOffset: Long, + metrics: Map[String, GpuMetric], + compressCfg: CpuCompressionConfig): Seq[BlockMetaData] = { + val outStartPos = out.getPos + val writeTime = metrics.getOrElse(WRITE_BUFFER_TIME, NoopMetric) + withResource(new BufferedFileInput(filePath, blocks, metrics)) { in => + val newBlocks = blocks.map { block => + val newColumns = block.getColumns.asScala.map { column => + var columnTotalSize = column.getTotalSize + var columnCodec = column.getCodec + val columnStartingPos = realStartOffset + out.getPos - outStartPos + val columnDictOffset = if (column.getDictionaryPageOffset > 0) { + column.getDictionaryPageOffset + columnStartingPos - column.getStartingPos + } else { + 0 + } + writeTime.ns { + columnCodec match { + case CompressionCodecName.SNAPPY if compressCfg.decompressSnappyCpu => + val columnStartPos = out.getPos + decompressSnappy(in, out, column) + columnCodec = CompressionCodecName.UNCOMPRESSED + columnTotalSize = out.getPos - columnStartPos + case CompressionCodecName.ZSTD if compressCfg.decompressZstdCpu => + val columnStartPos = out.getPos + decompressZstd(in, out, column) + columnCodec = CompressionCodecName.UNCOMPRESSED + columnTotalSize = out.getPos - columnStartPos + case _ => + in.seek(column.getStartingPos) + in.read(out, columnTotalSize) + } + } + ColumnChunkMetaData.get( + column.getPath, + column.getPrimitiveType, + columnCodec, + column.getEncodingStats, + column.getEncodings, + column.getStatistics, + columnStartingPos, + columnDictOffset, + column.getValueCount, + columnTotalSize, + columnTotalSize) + } + GpuParquetUtils.newBlockMeta(block.getRowCount, newColumns.toSeq) + } + newBlocks + } + } + + private 
def decompressSnappy( + in: BufferedFileInput, + out: HostMemoryOutputStream, + column: ColumnChunkMetaData): Unit = { + val endPos = column.getStartingPos + column.getTotalSize + in.seek(column.getStartingPos) + var inData: Option[HostMemoryBuffer] = None + try { + while (in.getPos != endPos) { + val pageHeader = Util.readPageHeader(in) + val compressedSize = pageHeader.getCompressed_page_size + val uncompressedSize = pageHeader.getUncompressed_page_size + pageHeader.unsetCrc() + pageHeader.setCompressed_page_size(uncompressedSize) + Util.writePageHeader(pageHeader, out) + if (inData.map(_.getLength).getOrElse(0L) < compressedSize) { + inData.foreach(_.close()) + inData = Some(HostMemoryBuffer.allocate(compressedSize, false)) + } + inData.foreach { compressedBuffer => + in.read(compressedBuffer, compressedSize) + val bbIn = compressedBuffer.asByteBuffer(0, compressedSize) + val bbOut = out.writeAsByteBuffer(uncompressedSize) + Snappy.uncompress(bbIn, bbOut) + } + } + } finally { + inData.foreach(_.close()) + } + } + + private def decompressZstd( + in: BufferedFileInput, + out: HostMemoryOutputStream, + column: ColumnChunkMetaData): Unit = { + val endPos = column.getStartingPos + column.getTotalSize + in.seek(column.getStartingPos) + var inData: Option[HostMemoryBuffer] = None + try { + withResource(new ZstdDecompressCtx()) { ctx => + while (in.getPos != endPos) { + val pageHeader = Util.readPageHeader(in) + val compressedSize = pageHeader.getCompressed_page_size + val uncompressedSize = pageHeader.getUncompressed_page_size + pageHeader.unsetCrc() + pageHeader.setCompressed_page_size(uncompressedSize) + Util.writePageHeader(pageHeader, out) + if (inData.map(_.getLength).getOrElse(0L) < compressedSize) { + inData.foreach(_.close()) + inData = Some(HostMemoryBuffer.allocate(compressedSize, false)) + } + inData.foreach { compressedBuffer => + in.read(compressedBuffer, compressedSize) + val bbIn = compressedBuffer.asByteBuffer(0, compressedSize) + val bbOut = out.writeAsByteBuffer(uncompressedSize) + ctx.decompress(bbOut, bbIn) + } + } + } + } finally { + inData.foreach(_.close()) + } + } + private def copyRemoteBlocksData( remoteCopies: Seq[CopyRange], filePath: Path, @@ -1666,7 +1936,11 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics closeOnExcept(HostMemoryBuffer.allocate(estTotalSize)) { hmb => val out = new HostMemoryOutputStream(hmb) out.write(ParquetPartitionReader.PARQUET_MAGIC) - val outputBlocks = copyBlocksData(filePath, out, blocks, out.getPos, metrics) + val outputBlocks = if (compressCfg.decompressAnyCpu) { + copyAndUncompressBlocksData(filePath, out, blocks, out.getPos, metrics, compressCfg) + } else { + copyBlocksData(filePath, out, blocks, out.getPos, metrics) + } val footerPos = out.getPos writeFooter(out, outputBlocks, clippedSchema) BytesUtils.writeIntLittleEndian(out, (out.getPos - footerPos).toInt) @@ -1802,7 +2076,7 @@ trait ParquetPartitionReaderBase extends Logging with ScanWithMetrics block.asInstanceOf[ParquetDataBlock].dataBlock implicit def toDataBlockBase(blocks: Seq[BlockMetaData]): Seq[DataBlockBase] = - blocks.map(ParquetDataBlock) + blocks.map(b => ParquetDataBlock(b, compressCfg)) implicit def toBlockMetaDataSeq(blocks: Seq[DataBlockBase]): Seq[BlockMetaData] = blocks.map(_.asInstanceOf[ParquetDataBlock].dataBlock) @@ -1814,10 +2088,14 @@ private case class ParquetSchemaWrapper(schema: MessageType) extends SchemaBase } // Parquet BlockMetaData wrapper -private case class ParquetDataBlock(dataBlock: BlockMetaData) extends DataBlockBase 
{ +private case class ParquetDataBlock( + dataBlock: BlockMetaData, + compressCfg: CpuCompressionConfig) extends DataBlockBase { override def getRowCount: Long = dataBlock.getRowCount override def getReadDataSize: Long = dataBlock.getTotalByteSize - override def getBlockSize: Long = dataBlock.getColumns.asScala.map(_.getTotalSize).sum + override def getBlockSize: Long = { + ParquetPartitionReader.computeOutputSize(dataBlock, compressCfg) + } } /** Parquet extra information containing rebase modes and whether there is int96 timestamp */ @@ -1876,6 +2154,7 @@ class MultiFileParquetPartitionReader( maxGpuColumnSizeBytes: Long, useChunkedReader: Boolean, maxChunkedReaderMemoryUsageSizeBytes: Long, + override val compressCfg: CpuCompressionConfig, override val execMetrics: Map[String, GpuMetric], partitionSchema: StructType, numThreads: Int, @@ -1900,7 +2179,8 @@ class MultiFileParquetPartitionReader( file: Path, outhmb: HostMemoryBuffer, blocks: ArrayBuffer[DataBlockBase], - offset: Long) + offset: Long, + compressCfg: CpuCompressionConfig) extends Callable[(Seq[DataBlockBase], Long)] { override def call(): (Seq[DataBlockBase], Long) = { @@ -1909,7 +2189,11 @@ class MultiFileParquetPartitionReader( val startBytesRead = fileSystemBytesRead() val outputBlocks = withResource(outhmb) { _ => withResource(new HostMemoryOutputStream(outhmb)) { out => - copyBlocksData(file, out, blocks.toSeq, offset, metrics) + if (compressCfg.decompressAnyCpu) { + copyAndUncompressBlocksData(file, out, blocks.toSeq, offset, metrics, compressCfg) + } else { + copyBlocksData(file, out, blocks.toSeq, offset, metrics) + } } } val bytesRead = fileSystemBytesRead() - startBytesRead @@ -1961,7 +2245,7 @@ class MultiFileParquetPartitionReader( blocks: ArrayBuffer[DataBlockBase], offset: Long, batchContext: BatchContext): Callable[(Seq[DataBlockBase], Long)] = { - new ParquetCopyBlocksRunner(taskContext, file, outhmb, blocks, offset) + new ParquetCopyBlocksRunner(taskContext, file, outhmb, blocks, offset, compressCfg) } override final def getFileFormatShortName: String = "Parquet" @@ -2072,6 +2356,7 @@ class MultiFileCloudParquetPartitionReader( maxGpuColumnSizeBytes: Long, useChunkedReader: Boolean, maxChunkedReaderMemoryUsageSizeBytes: Long, + override val compressCfg: CpuCompressionConfig, override val execMetrics: Map[String, GpuMetric], partitionSchema: StructType, numThreads: Int, @@ -2761,6 +3046,7 @@ class ParquetPartitionReader( targetBatchSizeBytes: Long, useChunkedReader: Boolean, maxChunkedReaderMemoryUsageSizeBytes: Long, + override val compressCfg: CpuCompressionConfig, override val execMetrics: Map[String, GpuMetric], dateRebaseMode: DateTimeRebaseMode, timestampRebaseMode: DateTimeRebaseMode, @@ -2873,26 +3159,34 @@ object ParquetPartitionReader { length: Long, outputOffset: Long) extends CopyItem - /** - * Build a new BlockMetaData - * - * @param rowCount the number of rows in this block - * @param columns the new column chunks to reference in the new BlockMetaData - * @return the new BlockMetaData - */ - private[rapids] def newParquetBlock( - rowCount: Long, - columns: Seq[ColumnChunkMetaData]): BlockMetaData = { - val block = new BlockMetaData - block.setRowCount(rowCount) + private[rapids] def computeOutputSize( + blocks: Seq[BlockMetaData], + compressCfg: CpuCompressionConfig): Long = { + blocks.map { block => + computeOutputSize(block, compressCfg) + }.sum + } - var totalSize: Long = 0 - columns.foreach { column => - block.addColumn(column) - totalSize += column.getTotalUncompressedSize + private[rapids] 
def computeOutputSize( + block: BlockMetaData, + compressCfg: CpuCompressionConfig): Long = { + if (compressCfg.decompressAnyCpu) { + block.getColumns.asScala.map { c => + if ((c.getCodec == CompressionCodecName.SNAPPY && compressCfg.decompressSnappyCpu) + || (c.getCodec == CompressionCodecName.ZSTD && compressCfg.decompressZstdCpu)) { + // Page headers need to be rewritten when CPU decompresses, and that may + // increase the size of the page header. Guess how many pages there may be + // and add a fudge factor per page to try to avoid a late realloc+copy. + // NOTE: Avoid using block.getTotalByteSize as that is the + // uncompressed size rather than the size in the file. + val estimatedPageCount = (c.getTotalUncompressedSize / (1024 * 1024)) + 1 + c.getTotalUncompressedSize + estimatedPageCount * 8 + } else { + c.getTotalSize + } + }.sum + } else { + block.getColumns.asScala.map(_.getTotalSize).sum } - block.setTotalByteSize(totalSize) - - block } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostMemoryStreams.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostMemoryStreams.scala index 08fe5be50b2..4be11b13254 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostMemoryStreams.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostMemoryStreams.scala @@ -54,6 +54,12 @@ class HostMemoryOutputStream(val buffer: HostMemoryBuffer) extends OutputStream pos += numBytes } + def writeAsByteBuffer(length: Int): ByteBuffer = { + val bb = buffer.asByteBuffer(pos, length) + pos += length + bb + } + def getPos: Long = pos def seek(newPos: Long): Unit = { @@ -132,6 +138,12 @@ trait HostMemoryInputStreamMixIn extends InputStream { } } + def readByteBuffer(length: Int): ByteBuffer = { + val bb = hmb.asByteBuffer(pos, length) + pos += length + bb + } + override def skip(count: Long): Long = { val oldPos = pos pos = Math.min(pos + count, hmbLength) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index ab7a788d205..406aeb0365b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -1120,6 +1120,31 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern") .checkValues(RapidsReaderType.values.map(_.toString)) .createWithDefault(RapidsReaderType.AUTO.toString) + val PARQUET_DECOMPRESS_CPU = + conf("spark.rapids.sql.format.parquet.decompressCpu") + .doc("If true then the CPU is eligible to decompress Parquet data rather than the GPU. 
" + + s"See other spark.rapids.sql.format.parquet.decompressCpu.* configuration settings " + + "to control this for specific compression codecs.") + .internal() + .booleanConf + .createWithDefault(false) + + val PARQUET_DECOMPRESS_CPU_SNAPPY = + conf("spark.rapids.sql.format.parquet.decompressCpu.snappy") + .doc(s"If true and $PARQUET_DECOMPRESS_CPU is true then the CPU decompresses " + + "Parquet Snappy data rather than the GPU") + .internal() + .booleanConf + .createWithDefault(true) + + val PARQUET_DECOMPRESS_CPU_ZSTD = + conf("spark.rapids.sql.format.parquet.decompressCpu.zstd") + .doc(s"If true and $PARQUET_DECOMPRESS_CPU is true then the CPU decompresses " + + "Parquet Zstandard data rather than the GPU") + .internal() + .booleanConf + .createWithDefault(true) + val READER_MULTITHREADED_COMBINE_THRESHOLD = conf("spark.rapids.sql.reader.multithreaded.combine.sizeBytes") .doc("The target size in bytes to combine multiple small files together when using the " + @@ -2960,6 +2985,12 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val isParquetMultiThreadReadEnabled: Boolean = isParquetAutoReaderEnabled || RapidsReaderType.withName(get(PARQUET_READER_TYPE)) == RapidsReaderType.MULTITHREADED + lazy val parquetDecompressCpu: Boolean = get(PARQUET_DECOMPRESS_CPU) + + lazy val parquetDecompressCpuSnappy: Boolean = get(PARQUET_DECOMPRESS_CPU_SNAPPY) + + lazy val parquetDecompressCpuZstd: Boolean = get(PARQUET_DECOMPRESS_CPU_ZSTD) + lazy val maxNumParquetFilesParallel: Int = get(PARQUET_MULTITHREAD_READ_MAX_NUM_FILES_PARALLEL) lazy val isParquetReadEnabled: Boolean = get(ENABLE_PARQUET_READ) From ed02cfe4f54e3c8531017671fd6ad0388128cb75 Mon Sep 17 00:00:00 2001 From: MithunR Date: Tue, 26 Nov 2024 14:02:54 -0800 Subject: [PATCH 12/37] Fix `dpp_test.py` failures on [databricks] 14.3 (#11768) Fixes #11536. This commit fixes the tests in `dpp_test.py` that were failing on Databricks 14.3. The failures were largely a result of an erroneous shim implementation, that was fixed as part of #11750. This commit accounts for the remaining failures that result from there being a `CollectLimitExec` in certain DPP query plans (that include broadcast joins, for example). The tests have been made more permissive, in allowing the `CollectLimitExec` to run on the CPU. The `CollectLimitExec` based plans will be further explored as part of https://github.com/NVIDIA/spark-rapids/issues/11764. Signed-off-by: MithunR --- integration_tests/src/main/python/dpp_test.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/python/dpp_test.py b/integration_tests/src/main/python/dpp_test.py index b362a4175f3..3d5ee1a5afa 100644 --- a/integration_tests/src/main/python/dpp_test.py +++ b/integration_tests/src/main/python/dpp_test.py @@ -20,7 +20,7 @@ from conftest import spark_tmp_table_factory from data_gen import * from marks import ignore_order, allow_non_gpu, datagen_overrides, disable_ansi_mode -from spark_session import is_before_spark_320, with_cpu_session, is_before_spark_312, is_databricks_runtime, is_databricks113_or_later +from spark_session import is_before_spark_320, with_cpu_session, is_before_spark_312, is_databricks_runtime, is_databricks113_or_later, is_databricks_version_or_later # non-positive values here can produce a degenerative join, so here we ensure that most values are # positive to ensure the join will produce rows. 
See https://github.com/NVIDIA/spark-rapids/issues/10147 @@ -167,10 +167,17 @@ def fn(spark): ''' ] +# On some Databricks versions (>=14.3), some query plans include a `CollectLimitExec`, +# when filtering partitions. This exec falls back to CPU. These tests allow for `CollectLimit` to +# run on the CPU, if everything else in the plan execute as expected. +# Further details are furnished at https://github.com/NVIDIA/spark-rapids/issues/11764. +dpp_fallback_execs=["CollectLimitExec"] if is_databricks_version_or_later(14,3) else [] + @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 # When BroadcastExchangeExec is available on filtering side, and it can be reused: # DynamicPruningExpression(InSubqueryExec(value, GpuSubqueryBroadcastExec))) @ignore_order +@allow_non_gpu(*dpp_fallback_execs) @datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/10147") @pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn) @pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn) @@ -245,6 +252,7 @@ def test_dpp_bypass(spark_tmp_table_factory, store_format, s_index, aqe_enabled) # then Spark will plan an extra Aggregate to collect filtering values: # DynamicPruningExpression(InSubqueryExec(value, SubqueryExec(Aggregate(...)))) @ignore_order +@allow_non_gpu(*dpp_fallback_execs) @pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn) @pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn) @pytest.mark.parametrize('aqe_enabled', [ @@ -285,10 +293,11 @@ def test_dpp_skip(spark_tmp_table_factory, store_format, s_index, aqe_enabled): non_exist_classes='DynamicPruningExpression', conf=dict(_dpp_fallback_conf + [('spark.sql.adaptive.enabled', aqe_enabled)])) +dpp_like_any_fallback_execs=['FilterExec', 'CollectLimitExec'] if is_databricks_version_or_later(14,3) else ['FilterExec'] # GPU verification on https://issues.apache.org/jira/browse/SPARK-34436 @ignore_order -@allow_non_gpu('FilterExec') +@allow_non_gpu(*dpp_like_any_fallback_execs) @pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn) @pytest.mark.parametrize('aqe_enabled', [ 'false', @@ -327,6 +336,7 @@ def create_dim_table_for_like(spark): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 +@allow_non_gpu(*dpp_fallback_execs) # Test handling DPP expressions from a HashedRelation that rearranges columns @pytest.mark.parametrize('aqe_enabled', [ 'false', From 0bf85cb9e65928328e3a2e0ec3837825f9be5f2d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 27 Nov 2024 08:39:11 +0800 Subject: [PATCH 13/37] Update rapids JNI and private dependency to 25.02.0-SNAPSHOT (#11772) To fix: https://github.com/NVIDIA/spark-rapids/issues/11755\nWait for the pre-merge CI job to SUCCEED Signed-off-by: nvauto <70000568+nvauto@users.noreply.github.com> --- jenkins/databricks/init_cudf_udf.sh | 3 +-- pom.xml | 5 ++--- scala2.13/pom.xml | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index 94ca7473143..0898c230d48 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ b/jenkins/databricks/init_cudf_udf.sh @@ -20,8 +20,7 @@ set -ex -# TODO: https://github.com/NVIDIA/spark-rapids/issues/11755 -CUDF_VER=${CUDF_VER:-24.12} +CUDF_VER=${CUDF_VER:-25.02} CUDA_VER=${CUDA_VER:-11.8} # Need to explicitly add conda into PATH environment, to activate conda environment. 
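As a quick local sanity check of the version bump performed by the `pom.xml` changes below, one can query the same Maven properties that the repository's CI scripts read. This is only an illustrative sketch: it assumes a checkout of this branch with Maven available, and the expected output is inferred from this patch rather than captured from a real run. The property names (`spark-rapids-jni.version`, `spark-rapids-private.version`) mirror the ones used by the repository's own dependency scripts.

```bash
# Illustrative check (assumes a local checkout of this branch and Maven on PATH).
# Property names follow the repo's CI scripts; the expected value noted below is
# an assumption based on this patch, not captured output.
mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-jni.version -DforceStdout
mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout
# Both are expected to print 25.02.0-SNAPSHOT once this change is applied.
```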
diff --git a/pom.xml b/pom.xml index 7409b849968..c5adf511d97 100644 --- a/pom.xml +++ b/pom.xml @@ -828,9 +828,8 @@ spark${buildver} cuda11 ${cuda.version} - - 24.12.0-SNAPSHOT - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT + 25.02.0-SNAPSHOT 2.12 2.8.0 incremental diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 9c00390f6e5..8a078e6e0d0 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -828,9 +828,8 @@ spark${buildver} cuda11 ${cuda.version} - - 24.12.0-SNAPSHOT - 24.12.0-SNAPSHOT + 25.02.0-SNAPSHOT + 25.02.0-SNAPSHOT 2.13 2.8.0 incremental From 5b77ed736d61932147f29f0d7de2d5c7335a903c Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 27 Nov 2024 17:41:19 +0800 Subject: [PATCH 14/37] Update advanced configs introduced by private repo (#11785) Signed-off-by: Chong Gao Co-authored-by: Chong Gao --- docs/additional-functionality/advanced_configs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md index 07346a5b850..a4427d9495a 100644 --- a/docs/additional-functionality/advanced_configs.md +++ b/docs/additional-functionality/advanced_configs.md @@ -33,6 +33,7 @@ Name | Description | Default Value | Applicable at spark.rapids.filecache.blockPathRegexp|A regular expression to decide which paths will not be cached when the file cache is enabled. If a path is blocked by this regexp but is allowed by spark.rapids.filecache.allowPathRegexp, then the path is blocked.|None|Startup spark.rapids.filecache.checkStale|Controls whether the cached is checked for being out of date with respect to the input file. When enabled, the data that has been cached locally for a file will be invalidated if the file is updated after being cached. This feature is only necessary if an input file for a Spark application can be changed during the lifetime of the application. If an individual input file will not be overwritten during the Spark application then performance may be improved by setting this to false.|true|Startup spark.rapids.filecache.maxBytes|Controls the maximum amount of data that will be cached locally. If left unspecified, it will use half of the available disk space detected on startup for the configured Spark local disks.|None|Startup +spark.rapids.filecache.minimumFreeSpace.bytes|Specify the minimum amount of free space in the Spark local disks. When the amount of free space on the Spark local disks drops below this value, cache data will be removed automatically to free disk space. A zero or negative value will disable this feature. Note if multiple Spark applications running on the same node, or there are other applications running with heavy disk writing, the filecache may not drop caches in time and may cause full disk errors. Please increase this value for this case.|32212254720|Startup spark.rapids.filecache.useChecksums|Whether to write out and verify checksums for the cached local files.|false|Startup spark.rapids.gpu.resourceName|The name of the Spark resource that represents a GPU that you want the plugin to use if using custom resources with Spark.|gpu|Startup spark.rapids.memory.gpu.allocFraction|The fraction of available (free) GPU memory that should be allocated for pooled memory. 
This must be less than or equal to the maximum limit configured via spark.rapids.memory.gpu.maxAllocFraction, and greater than or equal to the minimum limit configured via spark.rapids.memory.gpu.minAllocFraction.|1.0|Startup From ca466e7afa3a98ceaaffffe6b4bdee6b60066b68 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 27 Nov 2024 10:13:06 -0600 Subject: [PATCH 15/37] Remove unnecessary toBeReturned field from serialized batch iterators (#11778) Signed-off-by: Jason Lowe --- .../rapids/GpuColumnarBatchSerializer.scala | 92 +++++++------------ 1 file changed, 34 insertions(+), 58 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala index 54252253d38..8fde39eecf8 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarBatchSerializer.scala @@ -48,15 +48,13 @@ trait BaseSerializedTableIterator extends Iterator[(Int, ColumnarBatch)] { class SerializedBatchIterator(dIn: DataInputStream) extends BaseSerializedTableIterator { private[this] var nextHeader: Option[SerializedTableHeader] = None - private[this] var toBeReturned: Option[ColumnarBatch] = None private[this] var streamClosed: Boolean = false // Don't install the callback if in a unit test Option(TaskContext.get()).foreach { tc => onTaskCompletion(tc) { - toBeReturned.foreach(_.close()) - toBeReturned = None dIn.close() + streamClosed = true } } @@ -80,23 +78,20 @@ class SerializedBatchIterator(dIn: DataInputStream) } } - private def tryReadNext(): Option[ColumnarBatch] = { - if (nextHeader.isEmpty) { - None - } else { - withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => - val header = nextHeader.get - if (header.getNumColumns > 0) { - // This buffer will later be concatenated into another host buffer before being - // sent to the GPU, so no need to use pinned memory for these buffers. - closeOnExcept( - HostMemoryBuffer.allocate(header.getDataLen, false)) { hostBuffer => - JCudfSerialization.readTableIntoBuffer(dIn, header, hostBuffer) - Some(SerializedTableColumn.from(header, hostBuffer)) - } - } else { - Some(SerializedTableColumn.from(header)) + private def readNextBatch(): ColumnarBatch = { + withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => + val header = nextHeader.get + nextHeader = None + if (header.getNumColumns > 0) { + // This buffer will later be concatenated into another host buffer before being + // sent to the GPU, so no need to use pinned memory for these buffers. 
+ closeOnExcept( + HostMemoryBuffer.allocate(header.getDataLen, false)) { hostBuffer => + JCudfSerialization.readTableIntoBuffer(dIn, header, hostBuffer) + SerializedTableColumn.from(header, hostBuffer) } + } else { + SerializedTableColumn.from(header) } } } @@ -107,17 +102,10 @@ class SerializedBatchIterator(dIn: DataInputStream) } override def next(): (Int, ColumnarBatch) = { - if (toBeReturned.isEmpty) { - peekNextBatchSize() - toBeReturned = tryReadNext() - if (nextHeader.isEmpty || toBeReturned.isEmpty) { - throw new NoSuchElementException("Walked off of the end...") - } + if (!hasNext) { + throw new NoSuchElementException("Walked off of the end...") } - val ret = toBeReturned.get - toBeReturned = None - nextHeader = None - (0, ret) + (0, readNextBatch()) } } @@ -498,15 +486,13 @@ object KudoSerializedTableColumn { class KudoSerializedBatchIterator(dIn: DataInputStream) extends BaseSerializedTableIterator { private[this] var nextHeader: Option[KudoTableHeader] = None - private[this] var toBeReturned: Option[ColumnarBatch] = None private[this] var streamClosed: Boolean = false // Don't install the callback if in a unit test Option(TaskContext.get()).foreach { tc => onTaskCompletion(tc) { - toBeReturned.foreach(_.close()) - toBeReturned = None dIn.close() + streamClosed = true } } @@ -530,23 +516,20 @@ class KudoSerializedBatchIterator(dIn: DataInputStream) } } - private def tryReadNext(): Option[ColumnarBatch] = { - if (nextHeader.isEmpty) { - None - } else { - withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => - val header = nextHeader.get - if (header.getNumColumns > 0) { - // This buffer will later be concatenated into another host buffer before being - // sent to the GPU, so no need to use pinned memory for these buffers. - closeOnExcept(HostMemoryBuffer.allocate(header.getTotalDataLen, false)) { hostBuffer => - hostBuffer.copyFromStream(0, dIn, header.getTotalDataLen) - val kudoTable = new KudoTable(header, hostBuffer) - Some(KudoSerializedTableColumn.from(kudoTable)) - } - } else { - Some(KudoSerializedTableColumn.from(new KudoTable(header, null))) + private def readNextBatch(): ColumnarBatch = { + withResource(new NvtxRange("Read Batch", NvtxColor.YELLOW)) { _ => + val header = nextHeader.get + nextHeader = None + if (header.getNumColumns > 0) { + // This buffer will later be concatenated into another host buffer before being + // sent to the GPU, so no need to use pinned memory for these buffers. 
+ closeOnExcept(HostMemoryBuffer.allocate(header.getTotalDataLen, false)) { hostBuffer => + hostBuffer.copyFromStream(0, dIn, header.getTotalDataLen) + val kudoTable = new KudoTable(header, hostBuffer) + KudoSerializedTableColumn.from(kudoTable) } + } else { + KudoSerializedTableColumn.from(new KudoTable(header, null)) } } } @@ -557,16 +540,9 @@ class KudoSerializedBatchIterator(dIn: DataInputStream) } override def next(): (Int, ColumnarBatch) = { - if (toBeReturned.isEmpty) { - peekNextBatchSize() - toBeReturned = tryReadNext() - if (nextHeader.isEmpty || toBeReturned.isEmpty) { - throw new NoSuchElementException("Walked off of the end...") - } + if (!hasNext) { + throw new NoSuchElementException("Walked off of the end...") } - val ret = toBeReturned.get - toBeReturned = None - nextHeader = None - (0, ret) + (0, readNextBatch()) } } From 4b1a401fd6e516cf0855bbb164d05ef981d5df27 Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Thu, 28 Nov 2024 11:42:37 +0800 Subject: [PATCH 16/37] Support running CI_PART2 integration tests with JARs built by CI_PART1 (#11788) The CI_PART1 job uploads the built Spark Rapids tar file to Databricks DBFS storage. The CI_PART2 job retrieves the built tar file from DBFS storage and runs integration tests against it. Signed-off-by: timl --- jenkins/databricks/build.sh | 12 ++++++++++++ jenkins/databricks/test.sh | 25 ++++++++++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh index f6ff6e913b6..baec99bb015 100755 --- a/jenkins/databricks/build.sh +++ b/jenkins/databricks/build.sh @@ -178,5 +178,17 @@ if [[ "$WITH_DEFAULT_UPSTREAM_SHIM" != "0" ]]; then -Dincluded_buildvers=$UPSTREAM_BUILDVER,$BUILDVER fi +# "Delete the unused object files to reduce the size of the Spark Rapids built tar." +rm -rf dist/target/jni-deps/ +find dist/target/parallel-world/ -mindepth 1 -maxdepth 1 ! -name META-INF -exec rm -rf {} + + cd /home/ubuntu tar -zcf spark-rapids-built.tgz spark-rapids + +# Back up spark rapids built jars for the CI_PART2 job to run integration tests +TEST_MODE=${TEST_MODE:-'DEFAULT'} +PLUGIN_BUILT_TGZ=${PLUGIN_BUILT_TGZ:-"$1"} +if [[ "$TEST_MODE" == "CI_PART1" && -n "$PLUGIN_BUILT_TGZ" ]]; then + mkdir -p $(dirname $PLUGIN_BUILT_TGZ) + cp spark-rapids-built.tgz $PLUGIN_BUILT_TGZ +fi diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index 38728161d12..abe09b226b4 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -38,6 +38,22 @@ set -ex +## 'foo=abc,bar=123,...' to 'export foo=abc bar=123 ...' 
+[[ -n "$EXTRA_ENVS" ]] && export ${EXTRA_ENVS//','/' '} +# TEST_MODE +# - DEFAULT: all tests except cudf_udf tests +# - DELTA_LAKE_ONLY: delta_lake tests only +# - MULTITHREADED_SHUFFLE: shuffle tests only +# - PYARROW_ONLY: pyarrow tests only +# - CI_PART1 or CI_PART2 : part1 or part2 of the tests run in parallel from CI +TEST_MODE=${TEST_MODE:-'DEFAULT'} + +# CI_PART2 untars the spark-rapids tgz built by C1_PART1 instead of rebuilding it +PLUGIN_BUILT_TGZ=${PLUGIN_BUILT_TGZ:-"$1"} +if [[ "$TEST_MODE" == "CI_PART2" && -z "$LOCAL_JAR_PATH" && -f "$PLUGIN_BUILT_TGZ" ]]; then + tar -zxf $PLUGIN_BUILT_TGZ +fi + SOURCE_PATH="/home/ubuntu/spark-rapids" [[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH @@ -54,15 +70,6 @@ WITH_DEFAULT_UPSTREAM_SHIM=${WITH_DEFAULT_UPSTREAM_SHIM:-1} IS_SPARK_321_OR_LATER=0 [[ "$(printf '%s\n' "3.2.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.2.1" ]] && IS_SPARK_321_OR_LATER=1 - -# TEST_MODE -# - DEFAULT: all tests except cudf_udf tests -# - DELTA_LAKE_ONLY: delta_lake tests only -# - MULTITHREADED_SHUFFLE: shuffle tests only -# - PYARROW_ONLY: pyarrow tests only -# - CI_PART1 or CI_PART2 : part1 or part2 of the tests run in parallel from CI -TEST_MODE=${TEST_MODE:-'DEFAULT'} - # Classloader config is here to work around classloader issues with # --packages in distributed setups, should be fixed by # https://github.com/NVIDIA/spark-rapids/pull/5646 From aa2da410511d8a737e207257769ec662a79174fe Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Fri, 29 Nov 2024 23:26:13 +0800 Subject: [PATCH 17/37] fix issue 11790 (#11792) Signed-off-by: Hongbin Ma (Mahone) --- .../spark/rapids/GpuAggregateExec.scala | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala index 60f6dd68509..4ba20547e77 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala @@ -219,9 +219,6 @@ object AggregateUtils extends Logging { ): Boolean = { var repartitionHappened = false - if (hashSeed > 200) { - throw new IllegalStateException("Too many times of repartition, may hit a bug?") - } def repartitionAndClose(batch: SpillableColumnarBatch): Unit = { @@ -280,15 +277,23 @@ object AggregateUtils extends Logging { val newBuckets = batchesByBucket.flatMap(bucket => { if (needRepartitionAgain(bucket)) { - val nextLayerBuckets = - ArrayBuffer.fill(hashBucketNum)(new AutoClosableArrayBuffer[SpillableColumnarBatch]()) - // Recursively merge and repartition the over sized bucket - repartitionHappened = - iterateAndRepartition( - new CloseableBufferedIterator(bucket.iterator), metrics, targetMergeBatchSize, - helper, hashKeys, hashBucketNum, hashSeed + 7, - nextLayerBuckets) || repartitionHappened - nextLayerBuckets + if (hashSeed + 7 > 200) { + log.warn("Too many times of repartition, may hit a bug? 
Size for each batch in " + + "current bucket: " + bucket.map(_.sizeInBytes).mkString(", ") + " rows: " + + bucket.map(_.numRows()).mkString(", ") + " targetMergeBatchSize: " + + targetMergeBatchSize) + ArrayBuffer.apply(bucket) + } else { + val nextLayerBuckets = + ArrayBuffer.fill(hashBucketNum)(new AutoClosableArrayBuffer[SpillableColumnarBatch]()) + // Recursively merge and repartition the over sized bucket + repartitionHappened = + iterateAndRepartition( + new CloseableBufferedIterator(bucket.iterator), metrics, targetMergeBatchSize, + helper, hashKeys, hashBucketNum, hashSeed + 7, + nextLayerBuckets) || repartitionHappened + nextLayerBuckets + } } else { ArrayBuffer.apply(bucket) } @@ -1075,8 +1080,8 @@ class GpuMergeAggregateIterator( closeOnExcept(new ArrayBuffer[AutoClosableArrayBuffer[SpillableColumnarBatch]]) { toAggregateBuckets => var currentSize = 0L - while (batchesByBucket.nonEmpty && - batchesByBucket.last.size() + currentSize < targetMergeBatchSize) { + while (batchesByBucket.nonEmpty && (toAggregateBuckets.isEmpty || + batchesByBucket.last.size() + currentSize < targetMergeBatchSize)) { val bucket = batchesByBucket.remove(batchesByBucket.size - 1) currentSize += bucket.map(_.sizeInBytes).sum toAggregateBuckets += bucket From bd14dbff6d5270c7374dc9e2ce6d00ff7e902420 Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Mon, 2 Dec 2024 09:56:22 +0800 Subject: [PATCH 18/37] Incorporate checksum of internal dependencies in the GH cache key [skip ci] (#11791) * replace date with jni&private timestamp for cache key Signed-off-by: YanxuanLiu * use date if quering timestamp failed Signed-off-by: YanxuanLiu * add bash script to get timestamp Signed-off-by: YanxuanLiu * replace timestamp with sha1 Signed-off-by: YanxuanLiu --------- Signed-off-by: YanxuanLiu --- .github/workflows/mvn-verify-check.yml | 6 ++-- .../mvn-verify-check/get-deps-sha1.sh | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) create mode 100755 .github/workflows/mvn-verify-check/get-deps-sha1.sh diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml index 0aca7bc3655..b58799c6110 100644 --- a/.github/workflows/mvn-verify-check.yml +++ b/.github/workflows/mvn-verify-check.yml @@ -53,7 +53,8 @@ jobs: id: generateCacheKey run: | set -x - cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')" + depsSHA1=$(. .github/workflows/mvn-verify-check/get-deps-sha1.sh 2.12) + cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}" echo "dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT - name: Cache local Maven repository id: cache @@ -165,7 +166,8 @@ jobs: id: generateCacheKey run: | set -x - cacheKey="${{ runner.os }}-maven-scala213-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')" + depsSHA1=$(. 
.github/workflows/mvn-verify-check/get-deps-sha1.sh 2.13) + cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}" echo "scala213dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT - name: Cache local Maven repository id: cache diff --git a/.github/workflows/mvn-verify-check/get-deps-sha1.sh b/.github/workflows/mvn-verify-check/get-deps-sha1.sh new file mode 100755 index 00000000000..aa7129bd3ef --- /dev/null +++ b/.github/workflows/mvn-verify-check/get-deps-sha1.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +scala_ver=${1:-"2.12"} +base_URL="https://oss.sonatype.org/service/local/artifact/maven/resolve" +project_jni="spark-rapids-jni" +project_private="rapids-4-spark-private_${scala_ver}" + +jni_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-jni.version -DforceStdout) +private_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout) + +jni_sha1=$(curl -s -H "Accept: application/json" \ + "${base_URL}?r=snapshots&g=com.nvidia&a=${project_jni}&v=${jni_ver}&c=&e=jar&wt=json" \ + | jq .data.sha1) || $(date +'%Y-%m-%d') +private_sha1=$(curl -s -H "Accept: application/json" \ + "${base_URL}?r=snapshots&g=com.nvidia&a=${project_private}&v=${private_ver}&c=&e=jar&wt=json" \ + | jq .data.sha1) || $(date +'%Y-%m-%d') + +sha1md5=$(echo -n "${jni_sha1}_${private_sha1}" | md5sum | awk '{print $1}') + +echo $sha1md5 From cb31afb07847ff96b16d70ceec54ee1426fe5e64 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 2 Dec 2024 18:19:17 -0600 Subject: [PATCH 19/37] Fall back to CPU for non-UTC months_between (#11802) Signed-off-by: Robert (Bobby) Evans --- integration_tests/src/main/python/date_time_test.py | 12 ++++++------ .../spark/sql/rapids/datetimeExpressions.scala | 10 +++++++++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 5a98e06fadc..1a7024dac85 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -139,34 +139,34 @@ def test_datediff(data_gen): hms_fallback = ['ProjectExec'] if not is_supported_time_zone() else [] -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) def test_months_between(): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, false)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) def test_months_between_first_day(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", false)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) def test_months_between_last_day(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, 
timestamp_gen).selectExpr('months_between(a, timestamp"2023-12-31", false)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) @approximate_float() def test_months_between_round(): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, true)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) @approximate_float() def test_months_between_first_day_round(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", true)')) -@allow_non_gpu(*hms_fallback) +@allow_non_gpu(*non_utc_tz_allow) @approximate_float() def test_months_between_last_day_round(): assert_gpu_and_cpu_are_equal_collect( diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 8ed4c50ac3b..0f382a7b6e6 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -1217,7 +1217,8 @@ class MonthsBetweenExprMeta(expr: MonthsBetween, rule: DataFromReplacementRule) extends ExprMeta[MonthsBetween](expr, conf, parent, rule) { - override def isTimeZoneSupported = true + // See https://github.com/NVIDIA/spark-rapids/issues/11800 + override def isTimeZoneSupported = false override def convertToGpu(): GpuExpression = { val gpuChildren = childExprs.map(_.convertToGpu()) @@ -1287,6 +1288,13 @@ object GpuMonthsBetween { private def calcSecondsInDay(converted: ColumnVector): ColumnVector = { // Find the number of seconds that are not counted for in a day + // Rounding down to the current day, only works if you are in a time zone with no + // transition rules. This is because if a transition happens in between the start + // of the day and the timestamp we will be off. As such this will need to change to + // support other time zones, and it will need to take the timezone into account when + // calculating this. + // https://github.com/NVIDIA/spark-rapids/issues/11800 + // find the micros over by finding the part that is not days val microsInDay = withResource(converted.dateTimeFloor(DateTimeRoundingFrequency.DAY)) { days => // But we cannot subtract timestamps directly. They are both micros From 738c8e38fc23c1634667443864b80f085f2737ac Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Tue, 3 Dec 2024 09:07:10 +0800 Subject: [PATCH 20/37] exclude previous operator's time out of firstBatchHeuristic (#11794) Signed-off-by: Hongbin Ma (Mahone) --- .../main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala index 4ba20547e77..d5bbe15209d 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuAggregateExec.scala @@ -2091,9 +2091,9 @@ class DynamicGpuPartialAggregateIterator( helper: AggHelper): (Iterator[ColumnarBatch], Boolean) = { // we need to decide if we are going to sort the data or not, so the very // first thing we need to do is get a batch and make a choice. 
+ val cb = cbIter.next() withResource(new NvtxWithMetrics("dynamic sort heuristic", NvtxColor.BLUE, metrics.opTime, metrics.heuristicTime)) { _ => - val cb = cbIter.next() lazy val estimatedGrowthAfterAgg: Double = closeOnExcept(cb) { cb => val numRows = cb.numRows() val cardinality = estimateCardinality(cb) From 7927ae9b3e6f565c9e7ba45c5353dbccbdd6d483 Mon Sep 17 00:00:00 2001 From: YanxuanLiu <104543031+YanxuanLiu@users.noreply.github.com> Date: Thu, 5 Dec 2024 09:13:34 +0800 Subject: [PATCH 21/37] enable license header check & add header to files (#11786) Signed-off-by: YanxuanLiu --- .github/workflows/license-header-check.yml | 58 ++++++++++++++++++++++ docs/dev/idea-code-style-settings.xml | 16 ++++++ python/rapids/daemon.py | 1 + python/rapids/daemon_databricks.py | 1 + 4 files changed, 76 insertions(+) create mode 100644 .github/workflows/license-header-check.yml diff --git a/.github/workflows/license-header-check.yml b/.github/workflows/license-header-check.yml new file mode 100644 index 00000000000..e7f62399436 --- /dev/null +++ b/.github/workflows/license-header-check.yml @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# A workflow to check copyright/license header +name: license header check + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + license-header-check: + runs-on: ubuntu-latest + if: "!contains(github.event.pull_request.title, '[bot]')" + steps: + - name: Get checkout depth + run: | + echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV + + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: ${{ env.PR_FETCH_DEPTH }} + + - name: license-header-check + uses: NVIDIA/spark-rapids-common/license-header-check@main + with: + included_file_patterns: | + *.yml, + *.yaml, + *.sh, + *.xml, + *.properties, + *.scala, + *.py, + build/*, + *.cpp, + *Dockerfile*, + *Jenkinsfile*, + *.ini, + *.java, + *.fbs + excluded_file_patterns: | + *target/*, + thirdparty/*, + sql-plugin/src/main/java/com/nvidia/spark/rapids/format/* + \ No newline at end of file diff --git a/docs/dev/idea-code-style-settings.xml b/docs/dev/idea-code-style-settings.xml index 165d30dde06..9f5c3c100dc 100644 --- a/docs/dev/idea-code-style-settings.xml +++ b/docs/dev/idea-code-style-settings.xml @@ -1,3 +1,19 @@ + +