From 342b67bff36b55c6305cc60f44f7ae77c0bff24b Mon Sep 17 00:00:00 2001
From: Peixin <pxli@nyu.edu>
Date: Thu, 16 Nov 2023 08:43:11 +0800
Subject: [PATCH 01/15] Initiate project version 24.02.0-SNAPSHOT (#9716)

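Bump the project version from 23.12.0-SNAPSHOT to 24.02.0-SNAPSHOT across
the build files, documentation, CI scripts, and sources.

The cuDF, spark-rapids-jni, and spark-rapids-private dependency versions
are left at 23.12 for now and marked with TODOs; updating them is tracked in
https://github.com/NVIDIA/spark-rapids/issues/9715.

Signed-off-by: Peixin Li <pxli@nyu.edu>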
---
 README.md                                            |  2 +-
 aggregator/pom.xml                                   |  4 ++--
 api_validation/pom.xml                               |  4 ++--
 datagen/README.md                                    |  6 +++---
 datagen/ScaleTest.md                                 |  2 +-
 datagen/pom.xml                                      |  4 ++--
 delta-lake/delta-20x/pom.xml                         |  4 ++--
 delta-lake/delta-21x/pom.xml                         |  4 ++--
 delta-lake/delta-22x/pom.xml                         |  4 ++--
 delta-lake/delta-23x/pom.xml                         |  4 ++--
 delta-lake/delta-24x/pom.xml                         |  4 ++--
 delta-lake/delta-spark321db/pom.xml                  |  4 ++--
 delta-lake/delta-spark330db/pom.xml                  |  4 ++--
 delta-lake/delta-spark332db/pom.xml                  |  4 ++--
 delta-lake/delta-spark341db/pom.xml                  |  4 ++--
 delta-lake/delta-stub/pom.xml                        |  4 ++--
 dist/pom.xml                                         |  4 ++--
 docs/configs.md                                      |  2 +-
 docs/dev/shims.md                                    | 12 ++++++------
 integration_tests/README.md                          |  6 +++---
 integration_tests/ScaleTest.md                       |  2 +-
 integration_tests/pom.xml                            |  4 ++--
 jdk-profiles/pom.xml                                 |  4 ++--
 jenkins/databricks/create.py                         |  2 +-
 jenkins/databricks/init_cudf_udf.sh                  |  2 +-
 jenkins/version-def.sh                               |  6 +++---
 pom.xml                                              |  3 ++-
 scala2.13/aggregator/pom.xml                         |  4 ++--
 scala2.13/api_validation/pom.xml                     |  4 ++--
 scala2.13/datagen/pom.xml                            |  4 ++--
 scala2.13/delta-lake/delta-20x/pom.xml               |  4 ++--
 scala2.13/delta-lake/delta-21x/pom.xml               |  4 ++--
 scala2.13/delta-lake/delta-22x/pom.xml               |  4 ++--
 scala2.13/delta-lake/delta-23x/pom.xml               |  4 ++--
 scala2.13/delta-lake/delta-24x/pom.xml               |  4 ++--
 scala2.13/delta-lake/delta-spark321db/pom.xml        |  4 ++--
 scala2.13/delta-lake/delta-spark330db/pom.xml        |  4 ++--
 scala2.13/delta-lake/delta-spark332db/pom.xml        |  4 ++--
 scala2.13/delta-lake/delta-spark341db/pom.xml        |  4 ++--
 scala2.13/delta-lake/delta-stub/pom.xml              |  4 ++--
 scala2.13/dist/pom.xml                               |  4 ++--
 scala2.13/integration_tests/pom.xml                  |  4 ++--
 scala2.13/jdk-profiles/pom.xml                       |  4 ++--
 scala2.13/pom.xml                                    |  3 ++-
 scala2.13/shim-deps/cloudera/pom.xml                 |  4 ++--
 scala2.13/shim-deps/databricks/pom.xml               |  4 ++--
 scala2.13/shim-deps/pom.xml                          |  4 ++--
 scala2.13/shuffle-plugin/pom.xml                     |  4 ++--
 scala2.13/sql-plugin-api/pom.xml                     |  4 ++--
 scala2.13/sql-plugin/pom.xml                         |  4 ++--
 scala2.13/tests/pom.xml                              |  4 ++--
 scala2.13/udf-compiler/pom.xml                       |  4 ++--
 shim-deps/cloudera/pom.xml                           |  4 ++--
 shim-deps/databricks/pom.xml                         |  4 ++--
 shim-deps/pom.xml                                    |  4 ++--
 shuffle-plugin/pom.xml                               |  4 ++--
 sql-plugin-api/pom.xml                               |  4 ++--
 .../scala/com/nvidia/spark/rapids/ShimLoader.scala   |  8 ++++----
 sql-plugin/pom.xml                                   |  4 ++--
 .../scala/com/nvidia/spark/rapids/RapidsConf.scala   |  2 +-
 tests/pom.xml                                        |  4 ++--
 udf-compiler/pom.xml                                 |  4 ++--
 62 files changed, 126 insertions(+), 124 deletions(-)

diff --git a/README.md b/README.md
index 46d846aa9c2..d669be64ca9 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ as a `provided` dependency.
 <dependency>
     <groupId>com.nvidia</groupId>
     <artifactId>rapids-4-spark_2.12</artifactId>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <scope>provided</scope>
 </dependency>
 ```
diff --git a/aggregator/pom.xml b/aggregator/pom.xml
index f2fc06a370f..e6b3486bf09 100644
--- a/aggregator/pom.xml
+++ b/aggregator/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../jdk-profiles/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-aggregator_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Aggregator</name>
     <description>Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>aggregator</rapids.module>
diff --git a/api_validation/pom.xml b/api_validation/pom.xml
index 34c2404c3c0..d85b6a68146 100644
--- a/api_validation/pom.xml
+++ b/api_validation/pom.xml
@@ -22,11 +22,11 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-api-validation_2.12</artifactId>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>api_validation</rapids.module>
diff --git a/datagen/README.md b/datagen/README.md
index f374e4da9f2..5fc3aa06de3 100644
--- a/datagen/README.md
+++ b/datagen/README.md
@@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0.
 
 After this the jar should be at
 `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar`
-for example a Spark 3.3.0 jar for the 23.12.0 release would be
-`target/datagen_2.12-23.12.0-spark330.jar`
+for example a Spark 3.3.0 jar for the 24.02.0 release would be
+`target/datagen_2.12-24.02.0-spark330.jar`
 
 To get a spark shell with this you can run
 ```shell
-spark-shell --jars target/datagen_2.12-23.12.0-spark330.jar
+spark-shell --jars target/datagen_2.12-24.02.0-spark330.jar
 ```
 
 After that you should be good to go.
diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md
index dc55ffd393a..fd2af1decbb 100644
--- a/datagen/ScaleTest.md
+++ b/datagen/ScaleTest.md
@@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \
 --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \
 --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class
 --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME
-./target/datagen_2.12-23.12.0-SNAPSHOT-spark332.jar \
+./target/datagen_2.12-24.02.0-SNAPSHOT-spark332.jar \
 1 \
 10 \
 parquet \
diff --git a/datagen/pom.xml b/datagen/pom.xml
index d22f874bb04..6a6129ac603 100644
--- a/datagen/pom.xml
+++ b/datagen/pom.xml
@@ -21,13 +21,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>datagen_2.12</artifactId>
     <name>Data Generator</name>
     <description>Tools for generating large amounts of data</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <properties>
         <rapids.module>datagen</rapids.module>
         <target.classifier/>
diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml
index 5cb0e2e2e4e..8a06a26a69c 100644
--- a/delta-lake/delta-20x/pom.xml
+++ b/delta-lake/delta-20x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-20x_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support</name>
     <description>Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-20x</rapids.module>
diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml
index 5b4e1225722..547cb52a9f8 100644
--- a/delta-lake/delta-21x/pom.xml
+++ b/delta-lake/delta-21x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-21x_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support</name>
     <description>Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-21x</rapids.module>
diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml
index 0b6d2175f2f..f52c3ab0f7c 100644
--- a/delta-lake/delta-22x/pom.xml
+++ b/delta-lake/delta-22x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-22x_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support</name>
     <description>Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-22x</rapids.module>
diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml
index 9b8cb489cb6..02372462348 100644
--- a/delta-lake/delta-23x/pom.xml
+++ b/delta-lake/delta-23x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-23x_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support</name>
     <description>Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-23x</rapids.module>
diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml
index 93f625397bf..b793ec7c393 100644
--- a/delta-lake/delta-24x/pom.xml
+++ b/delta-lake/delta-24x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-24x_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support</name>
     <description>Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-24x</rapids.module>
diff --git a/delta-lake/delta-spark321db/pom.xml b/delta-lake/delta-spark321db/pom.xml
index 95f9146f51a..1514904d03f 100644
--- a/delta-lake/delta-spark321db/pom.xml
+++ b/delta-lake/delta-spark321db/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-spark321db_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Databricks 10.4 Delta Lake Support</name>
     <description>Databricks 10.4 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-spark321db</rapids.module>
diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml
index c8ed34bd539..eef0b56b0ae 100644
--- a/delta-lake/delta-spark330db/pom.xml
+++ b/delta-lake/delta-spark330db/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-spark330db_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support</name>
     <description>Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-spark330db</rapids.module>
diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml
index 1d81d63aa94..34335400b43 100644
--- a/delta-lake/delta-spark332db/pom.xml
+++ b/delta-lake/delta-spark332db/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-spark332db_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support</name>
     <description>Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-spark332db</rapids.module>
diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml
index 64e920eb8f1..11291f9ad21 100644
--- a/delta-lake/delta-spark341db/pom.xml
+++ b/delta-lake/delta-spark341db/pom.xml
@@ -22,14 +22,14 @@
     <parent>
 	<groupId>com.nvidia</groupId>
 	<artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-spark341db_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support</name>
     <description>Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.compressed.artifact>false</rapids.compressed.artifact>
diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml
index c58eb185cfc..ac77668efda 100644
--- a/delta-lake/delta-stub/pom.xml
+++ b/delta-lake/delta-stub/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-stub_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake Stub</name>
     <description>Delta Lake stub for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-stub</rapids.module>
diff --git a/dist/pom.xml b/dist/pom.xml
index dd46404e33d..eb5f49876d2 100644
--- a/dist/pom.xml
+++ b/dist/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../jdk-profiles/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Distribution</name>
     <description>Creates the distribution package of the RAPIDS plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <dependencies>
         <dependency>
             <groupId>com.nvidia</groupId>
diff --git a/docs/configs.md b/docs/configs.md
index 5a467ea9fa0..58d4e28d79d 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports.
 On startup use: `--conf [conf key]=[conf value]`. For example:
 
 ```
-${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar \
+${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar \
 --conf spark.plugins=com.nvidia.spark.SQLPlugin \
 --conf spark.rapids.sql.concurrentGpuTasks=2
 ```
diff --git a/docs/dev/shims.md b/docs/dev/shims.md
index cca778382b8..e214d07862d 100644
--- a/docs/dev/shims.md
+++ b/docs/dev/shims.md
@@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi
 Spark 3.0.2's URLs:
 
 ```text
-jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/
-jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/
-jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark302/
+jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/
+jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark3xx-common/
+jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark302/
 ```
 
 Spark 3.2.0's URLs :
 
 ```text
-jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/
-jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/
-jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark320/
+jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/
+jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark3xx-common/
+jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark320/
 ```
 
 ### Late Inheritance in Public Classes
diff --git a/integration_tests/README.md b/integration_tests/README.md
index af203f44ad9..11687baa2d8 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -254,7 +254,7 @@ individually, so you don't risk running unit tests along with the integration te
 http://www.scalatest.org/user_guide/using_the_scalatest_shell
 
 ```shell
-spark-shell --jars rapids-4-spark-tests_2.12-23.12.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-23.12.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
+spark-shell --jars rapids-4-spark-tests_2.12-24.02.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-24.02.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
 ```
 
 First you import the `scalatest_shell` and tell the tests where they can find the test files you
@@ -277,7 +277,7 @@ If you just want to verify the SQL replacement is working you will need to add t
 assumes CUDA 11.0 is being used and the Spark distribution is built with Scala 2.12.
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" ./runtests.py
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar" ./runtests.py
 ```
 
 You don't have to enable the plugin for this to work, the test framework will do that for you.
@@ -376,7 +376,7 @@ To run cudf_udf tests, need following configuration changes:
 As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0:
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-23.12.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-24.02.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf
 ```
 
 ### Enabling fuzz tests
diff --git a/integration_tests/ScaleTest.md b/integration_tests/ScaleTest.md
index 774fb906cf6..3cf4b3a25d9 100644
--- a/integration_tests/ScaleTest.md
+++ b/integration_tests/ScaleTest.md
@@ -97,7 +97,7 @@ $SPARK_HOME/bin/spark-submit \
 --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \
 --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \
 --class com.nvidia.spark.rapids.tests.scaletest.ScaleTest \
-./target/rapids-4-spark-integration-tests_2.12-23.12.0-SNAPSHOT-spark332.jar \
+./target/rapids-4-spark-integration-tests_2.12-24.02.0-SNAPSHOT-spark332.jar \
 10 \
 100 \
 parquet \
diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml
index e5484e0fd49..21432f5161b 100644
--- a/integration_tests/pom.xml
+++ b/integration_tests/pom.xml
@@ -22,11 +22,11 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-integration-tests_2.12</artifactId>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <properties>
         <rapids.module>integration_tests</rapids.module>
         <target.classifier/>
diff --git a/jdk-profiles/pom.xml b/jdk-profiles/pom.xml
index d9488a8259d..e9d913eb6c6 100644
--- a/jdk-profiles/pom.xml
+++ b/jdk-profiles/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
     </parent>
     <groupId>com.nvidia</groupId>
     <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
     <packaging>pom</packaging>
     <description>Shim JDK Profiles</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <profiles>
         <profile>
             <id>jdk9plus</id>
diff --git a/jenkins/databricks/create.py b/jenkins/databricks/create.py
index b0305f92112..289a114d230 100644
--- a/jenkins/databricks/create.py
+++ b/jenkins/databricks/create.py
@@ -27,7 +27,7 @@ def main():
   workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com'
   token = ''
   sshkey = ''
-  cluster_name = 'CI-GPU-databricks-23.12.0-SNAPSHOT'
+  cluster_name = 'CI-GPU-databricks-24.02.0-SNAPSHOT'
   idletime = 240
   runtime = '7.0.x-gpu-ml-scala2.12'
   num_workers = 1
diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh
index 4af278e3a97..5816706b982 100755
--- a/jenkins/databricks/init_cudf_udf.sh
+++ b/jenkins/databricks/init_cudf_udf.sh
@@ -20,7 +20,7 @@
 
 set -ex
 
-CUDF_VER=${CUDF_VER:-23.12}
+CUDF_VER=${CUDF_VER:-23.12} # TODO: https://github.com/NVIDIA/spark-rapids/issues/9715
 CUDA_VER=${CUDA_VER:-11.8}
 
 # Need to explicitly add conda into PATH environment, to activate conda environment.
diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh
index ebf75617d99..845e9053e67 100755
--- a/jenkins/version-def.sh
+++ b/jenkins/version-def.sh
@@ -26,10 +26,10 @@ for VAR in $OVERWRITE_PARAMS; do
 done
 IFS=$PRE_IFS
 
-CUDF_VER=${CUDF_VER:-"23.12.0-SNAPSHOT"}
+CUDF_VER=${CUDF_VER:-"23.12.0-SNAPSHOT"} # TODO: https://github.com/NVIDIA/spark-rapids/issues/9715
 CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"}
-PROJECT_VER=${PROJECT_VER:-"23.12.0-SNAPSHOT"}
-PROJECT_TEST_VER=${PROJECT_TEST_VER:-"23.12.0-SNAPSHOT"}
+PROJECT_VER=${PROJECT_VER:-"24.02.0-SNAPSHOT"}
+PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.02.0-SNAPSHOT"}
 SPARK_VER=${SPARK_VER:-"3.1.1"}
 SPARK_VER_213=${SPARK_VER_213:-"3.3.0"}
 # Make a best attempt to set the default value for the shuffle shim.
diff --git a/pom.xml b/pom.xml
index afb519ffc03..2672e3a8609 100644
--- a/pom.xml
+++ b/pom.xml
@@ -23,7 +23,7 @@
     <artifactId>rapids-4-spark-parent_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Root Project</name>
     <description>The root project of the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <packaging>pom</packaging>
 
     <url>https://nvidia.github.io/spark-rapids/</url>
@@ -635,6 +635,7 @@
         <spark.version.classifier>spark${buildver}</spark.version.classifier>
         <cuda.version>cuda11</cuda.version>
         <jni.classifier>${cuda.version}</jni.classifier>
+        <!--TODO: https://github.com/NVIDIA/spark-rapids/issues/9715 -->
         <spark-rapids-jni.version>23.12.0-SNAPSHOT</spark-rapids-jni.version>
         <spark-rapids-private.version>23.12.0-SNAPSHOT</spark-rapids-private.version>
         <scala.binary.version>2.12</scala.binary.version>
diff --git a/scala2.13/aggregator/pom.xml b/scala2.13/aggregator/pom.xml
index 4b6aca7d716..d659070a5f0 100644
--- a/scala2.13/aggregator/pom.xml
+++ b/scala2.13/aggregator/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../jdk-profiles/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-aggregator_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Aggregator</name>
     <description>Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>aggregator</rapids.module>
diff --git a/scala2.13/api_validation/pom.xml b/scala2.13/api_validation/pom.xml
index 02dbbf7017f..f4f6afeb861 100644
--- a/scala2.13/api_validation/pom.xml
+++ b/scala2.13/api_validation/pom.xml
@@ -22,11 +22,11 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-api-validation_2.13</artifactId>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>api_validation</rapids.module>
diff --git a/scala2.13/datagen/pom.xml b/scala2.13/datagen/pom.xml
index 5982ab5eeff..468c8327f51 100644
--- a/scala2.13/datagen/pom.xml
+++ b/scala2.13/datagen/pom.xml
@@ -21,13 +21,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>datagen_2.13</artifactId>
     <name>Data Generator</name>
     <description>Tools for generating large amounts of data</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <properties>
         <rapids.module>datagen</rapids.module>
         <target.classifier/>
diff --git a/scala2.13/delta-lake/delta-20x/pom.xml b/scala2.13/delta-lake/delta-20x/pom.xml
index 688d0154734..d1634c0850a 100644
--- a/scala2.13/delta-lake/delta-20x/pom.xml
+++ b/scala2.13/delta-lake/delta-20x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-20x_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support</name>
     <description>Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-20x</rapids.module>
diff --git a/scala2.13/delta-lake/delta-21x/pom.xml b/scala2.13/delta-lake/delta-21x/pom.xml
index 8a5b5d0b8f4..da8c43b44e3 100644
--- a/scala2.13/delta-lake/delta-21x/pom.xml
+++ b/scala2.13/delta-lake/delta-21x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-21x_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support</name>
     <description>Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-21x</rapids.module>
diff --git a/scala2.13/delta-lake/delta-22x/pom.xml b/scala2.13/delta-lake/delta-22x/pom.xml
index 58d417bb1ed..4d9f6b5f0c5 100644
--- a/scala2.13/delta-lake/delta-22x/pom.xml
+++ b/scala2.13/delta-lake/delta-22x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-22x_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support</name>
     <description>Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-22x</rapids.module>
diff --git a/scala2.13/delta-lake/delta-23x/pom.xml b/scala2.13/delta-lake/delta-23x/pom.xml
index 6193d34ab44..209e07f55d5 100644
--- a/scala2.13/delta-lake/delta-23x/pom.xml
+++ b/scala2.13/delta-lake/delta-23x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-23x_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support</name>
     <description>Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-23x</rapids.module>
diff --git a/scala2.13/delta-lake/delta-24x/pom.xml b/scala2.13/delta-lake/delta-24x/pom.xml
index 6aa94f5a546..e0652b725ac 100644
--- a/scala2.13/delta-lake/delta-24x/pom.xml
+++ b/scala2.13/delta-lake/delta-24x/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-24x_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support</name>
     <description>Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-24x</rapids.module>
diff --git a/scala2.13/delta-lake/delta-spark321db/pom.xml b/scala2.13/delta-lake/delta-spark321db/pom.xml
index c0c0bbc0385..30734e3fd14 100644
--- a/scala2.13/delta-lake/delta-spark321db/pom.xml
+++ b/scala2.13/delta-lake/delta-spark321db/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-spark321db_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Databricks 10.4 Delta Lake Support</name>
     <description>Databricks 10.4 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-spark321db</rapids.module>
diff --git a/scala2.13/delta-lake/delta-spark330db/pom.xml b/scala2.13/delta-lake/delta-spark330db/pom.xml
index 9ba4fd9f742..df1ad622d85 100644
--- a/scala2.13/delta-lake/delta-spark330db/pom.xml
+++ b/scala2.13/delta-lake/delta-spark330db/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-spark330db_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support</name>
     <description>Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-spark330db</rapids.module>
diff --git a/scala2.13/delta-lake/delta-spark332db/pom.xml b/scala2.13/delta-lake/delta-spark332db/pom.xml
index 506e2d392c7..56401c437e2 100644
--- a/scala2.13/delta-lake/delta-spark332db/pom.xml
+++ b/scala2.13/delta-lake/delta-spark332db/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-spark332db_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support</name>
     <description>Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-spark332db</rapids.module>
diff --git a/scala2.13/delta-lake/delta-spark341db/pom.xml b/scala2.13/delta-lake/delta-spark341db/pom.xml
index e8d7d0dd644..6bc0a33a51a 100644
--- a/scala2.13/delta-lake/delta-spark341db/pom.xml
+++ b/scala2.13/delta-lake/delta-spark341db/pom.xml
@@ -22,14 +22,14 @@
     <parent>
 	<groupId>com.nvidia</groupId>
 	<artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-spark341db_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support</name>
     <description>Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.compressed.artifact>false</rapids.compressed.artifact>
diff --git a/scala2.13/delta-lake/delta-stub/pom.xml b/scala2.13/delta-lake/delta-stub/pom.xml
index 2a0549cbbab..f62d6f653b2 100644
--- a/scala2.13/delta-lake/delta-stub/pom.xml
+++ b/scala2.13/delta-lake/delta-stub/pom.xml
@@ -22,14 +22,14 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../jdk-profiles/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-delta-stub_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Delta Lake Stub</name>
     <description>Delta Lake stub for the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../delta-lake/delta-stub</rapids.module>
diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml
index a065880fcfb..24b376bc2b8 100644
--- a/scala2.13/dist/pom.xml
+++ b/scala2.13/dist/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../jdk-profiles/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Distribution</name>
     <description>Creates the distribution package of the RAPIDS plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <dependencies>
         <dependency>
             <groupId>com.nvidia</groupId>
diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml
index a896a2773a3..3157095dca2 100644
--- a/scala2.13/integration_tests/pom.xml
+++ b/scala2.13/integration_tests/pom.xml
@@ -22,11 +22,11 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-integration-tests_2.13</artifactId>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <properties>
         <rapids.module>integration_tests</rapids.module>
         <target.classifier/>
diff --git a/scala2.13/jdk-profiles/pom.xml b/scala2.13/jdk-profiles/pom.xml
index 5e730d89469..52d9bd68f87 100644
--- a/scala2.13/jdk-profiles/pom.xml
+++ b/scala2.13/jdk-profiles/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
     </parent>
     <groupId>com.nvidia</groupId>
     <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
     <packaging>pom</packaging>
     <description>Shim JDK Profiles</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <profiles>
         <profile>
             <id>jdk9plus</id>
diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml
index 629692d6e65..c6e3eaf199f 100644
--- a/scala2.13/pom.xml
+++ b/scala2.13/pom.xml
@@ -23,7 +23,7 @@
     <artifactId>rapids-4-spark-parent_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Root Project</name>
     <description>The root project of the RAPIDS Accelerator for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <packaging>pom</packaging>
 
     <url>https://nvidia.github.io/spark-rapids/</url>
@@ -635,6 +635,7 @@
         <spark.version.classifier>spark${buildver}</spark.version.classifier>
         <cuda.version>cuda11</cuda.version>
         <jni.classifier>${cuda.version}</jni.classifier>
+        <!--TODO: https://github.com/NVIDIA/spark-rapids/issues/9715 -->
         <spark-rapids-jni.version>23.12.0-SNAPSHOT</spark-rapids-jni.version>
         <spark-rapids-private.version>23.12.0-SNAPSHOT</spark-rapids-private.version>
         <scala.binary.version>2.13</scala.binary.version>
diff --git a/scala2.13/shim-deps/cloudera/pom.xml b/scala2.13/shim-deps/cloudera/pom.xml
index 80b46920047..86a300fe9fc 100644
--- a/scala2.13/shim-deps/cloudera/pom.xml
+++ b/scala2.13/shim-deps/cloudera/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-cdh-bom</artifactId>
     <packaging>pom</packaging>
     <description>CDH Shim Dependencies</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../shim-deps/cloudera</rapids.module>
diff --git a/scala2.13/shim-deps/databricks/pom.xml b/scala2.13/shim-deps/databricks/pom.xml
index d9fc8a42b83..6c2c9254fbd 100644
--- a/scala2.13/shim-deps/databricks/pom.xml
+++ b/scala2.13/shim-deps/databricks/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-db-bom</artifactId>
     <packaging>pom</packaging>
     <description>Databricks Shim Dependencies</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../shim-deps/databricks</rapids.module>
diff --git a/scala2.13/shim-deps/pom.xml b/scala2.13/shim-deps/pom.xml
index 163171da7e1..a6c324f3c07 100644
--- a/scala2.13/shim-deps/pom.xml
+++ b/scala2.13/shim-deps/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../jdk-profiles/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-shim-deps-parent_2.13</artifactId>
     <packaging>pom</packaging>
     <description>Shim Dependencies Profiles</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <profiles>
         <profile>
             <id>release321cdh</id>
diff --git a/scala2.13/shuffle-plugin/pom.xml b/scala2.13/shuffle-plugin/pom.xml
index 7432ac01a9c..47416b67a01 100644
--- a/scala2.13/shuffle-plugin/pom.xml
+++ b/scala2.13/shuffle-plugin/pom.xml
@@ -21,13 +21,13 @@
     <parent>
       <groupId>com.nvidia</groupId>
       <artifactId>rapids-4-spark-shim-deps-parent_2.13</artifactId>
-      <version>23.12.0-SNAPSHOT</version>
+      <version>24.02.0-SNAPSHOT</version>
       <relativePath>../shim-deps/pom.xml</relativePath>
   </parent>
     <artifactId>rapids-4-spark-shuffle_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Shuffle Plugin</name>
     <description>Accelerated shuffle plugin for the RAPIDS plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>shuffle-plugin</rapids.module>
diff --git a/scala2.13/sql-plugin-api/pom.xml b/scala2.13/sql-plugin-api/pom.xml
index d885f745cdb..b6a97b9e678 100644
--- a/scala2.13/sql-plugin-api/pom.xml
+++ b/scala2.13/sql-plugin-api/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-sql-plugin-api_2.13</artifactId>
     <description>Module for Non-Shimmable API</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <properties>
         <rapids.module>sql-plugin-api</rapids.module>
         <rapids.compressed.artifact>false</rapids.compressed.artifact>
diff --git a/scala2.13/sql-plugin/pom.xml b/scala2.13/sql-plugin/pom.xml
index 02090fb5e7e..3a812f32441 100644
--- a/scala2.13/sql-plugin/pom.xml
+++ b/scala2.13/sql-plugin/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-sql_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark SQL Plugin</name>
     <description>The RAPIDS SQL plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>sql-plugin</rapids.module>
diff --git a/scala2.13/tests/pom.xml b/scala2.13/tests/pom.xml
index 3d50da1fdbc..0223ff96752 100644
--- a/scala2.13/tests/pom.xml
+++ b/scala2.13/tests/pom.xml
@@ -21,13 +21,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-tests_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Tests</name>
     <description>RAPIDS plugin for Apache Spark integration tests</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>tests</rapids.module>
diff --git a/scala2.13/udf-compiler/pom.xml b/scala2.13/udf-compiler/pom.xml
index 4920c358d42..601750b8af5 100644
--- a/scala2.13/udf-compiler/pom.xml
+++ b/scala2.13/udf-compiler/pom.xml
@@ -21,13 +21,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.13</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-udf_2.13</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Scala UDF Plugin</name>
     <description>The RAPIDS Scala UDF plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>udf-compiler</rapids.module>
diff --git a/shim-deps/cloudera/pom.xml b/shim-deps/cloudera/pom.xml
index f7f10b0ab56..c72c86a4d31 100644
--- a/shim-deps/cloudera/pom.xml
+++ b/shim-deps/cloudera/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-cdh-bom</artifactId>
     <packaging>pom</packaging>
     <description>CDH Shim Dependencies</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../shim-deps/cloudera</rapids.module>
diff --git a/shim-deps/databricks/pom.xml b/shim-deps/databricks/pom.xml
index efcabc3e72b..1e14d255bcc 100644
--- a/shim-deps/databricks/pom.xml
+++ b/shim-deps/databricks/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../../pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-db-bom</artifactId>
     <packaging>pom</packaging>
     <description>Databricks Shim Dependencies</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>../shim-deps/databricks</rapids.module>
diff --git a/shim-deps/pom.xml b/shim-deps/pom.xml
index b0a8f5ac7b5..e0b0ed81e3a 100644
--- a/shim-deps/pom.xml
+++ b/shim-deps/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-jdk-profiles_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../jdk-profiles/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-shim-deps-parent_2.12</artifactId>
     <packaging>pom</packaging>
     <description>Shim Dependencies Profiles</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <profiles>
         <profile>
             <id>release321cdh</id>
diff --git a/shuffle-plugin/pom.xml b/shuffle-plugin/pom.xml
index fe7894874da..d95556b200d 100644
--- a/shuffle-plugin/pom.xml
+++ b/shuffle-plugin/pom.xml
@@ -21,13 +21,13 @@
     <parent>
       <groupId>com.nvidia</groupId>
       <artifactId>rapids-4-spark-shim-deps-parent_2.12</artifactId>
-      <version>23.12.0-SNAPSHOT</version>
+      <version>24.02.0-SNAPSHOT</version>
       <relativePath>../shim-deps/pom.xml</relativePath>
   </parent>
     <artifactId>rapids-4-spark-shuffle_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Shuffle Plugin</name>
     <description>Accelerated shuffle plugin for the RAPIDS plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>shuffle-plugin</rapids.module>
diff --git a/sql-plugin-api/pom.xml b/sql-plugin-api/pom.xml
index 1ccc411ca30..47f6a84a31d 100644
--- a/sql-plugin-api/pom.xml
+++ b/sql-plugin-api/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
 
     <artifactId>rapids-4-spark-sql-plugin-api_2.12</artifactId>
     <description>Module for Non-Shimmable API</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
     <properties>
         <rapids.module>sql-plugin-api</rapids.module>
         <rapids.compressed.artifact>false</rapids.compressed.artifact>
diff --git a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala
index 3723575810b..47a37278f9b 100644
--- a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala
+++ b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala
@@ -49,11 +49,11 @@ import org.apache.spark.util.MutableURLClassLoader
     Each shim can see a consistent parallel world without conflicts by referencing
     only one conflicting directory.
     E.g., Spark 3.2.0 Shim will use
-    jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/
-    jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark320/
+    jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark3xx-common/
+    jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark320/
     Spark 3.1.1 will use
-    jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/
-    jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark311/
+    jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark3xx-common/
+    jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark311/
     Using these Jar URL's allows referencing different bytecode produced from identical sources
     by incompatible Scala / Spark dependencies.
  */
diff --git a/sql-plugin/pom.xml b/sql-plugin/pom.xml
index 9d752b57f8d..b542a3e82fa 100644
--- a/sql-plugin/pom.xml
+++ b/sql-plugin/pom.xml
@@ -22,13 +22,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-sql_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark SQL Plugin</name>
     <description>The RAPIDS SQL plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>sql-plugin</rapids.module>
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index b0c2edadf95..5fd64bdcbb6 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -2091,7 +2091,7 @@ object RapidsConf {
         |On startup use: `--conf [conf key]=[conf value]`. For example:
         |
         |```
-        |${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar \
+        |${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar \
         |--conf spark.plugins=com.nvidia.spark.SQLPlugin \
         |--conf spark.rapids.sql.concurrentGpuTasks=2
         |```
diff --git a/tests/pom.xml b/tests/pom.xml
index 863c4231b71..de068419ed0 100644
--- a/tests/pom.xml
+++ b/tests/pom.xml
@@ -21,13 +21,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-tests_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Tests</name>
     <description>RAPIDS plugin for Apache Spark integration tests</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>tests</rapids.module>
diff --git a/udf-compiler/pom.xml b/udf-compiler/pom.xml
index adf26a680ca..f258e6a6a9f 100644
--- a/udf-compiler/pom.xml
+++ b/udf-compiler/pom.xml
@@ -21,13 +21,13 @@
     <parent>
         <groupId>com.nvidia</groupId>
         <artifactId>rapids-4-spark-shim-deps-parent_2.12</artifactId>
-        <version>23.12.0-SNAPSHOT</version>
+        <version>24.02.0-SNAPSHOT</version>
         <relativePath>../shim-deps/pom.xml</relativePath>
     </parent>
     <artifactId>rapids-4-spark-udf_2.12</artifactId>
     <name>RAPIDS Accelerator for Apache Spark Scala UDF Plugin</name>
     <description>The RAPIDS Scala UDF plugin for Apache Spark</description>
-    <version>23.12.0-SNAPSHOT</version>
+    <version>24.02.0-SNAPSHOT</version>
 
     <properties>
         <rapids.module>udf-compiler</rapids.module>

From 0cbd37aea08b04cabed38adaa3d47185c36d8c03 Mon Sep 17 00:00:00 2001
From: Peixin <pxli@nyu.edu>
Date: Wed, 22 Nov 2023 09:07:13 +0800
Subject: [PATCH 02/15] Update JNI and private dep version to 24.02.0-SNAPSHOT
 (#9812)

Signed-off-by: Peixin Li <pxli@nyu.edu>
---
 jenkins/databricks/init_cudf_udf.sh | 2 +-
 jenkins/version-def.sh              | 2 +-
 pom.xml                             | 5 ++---
 scala2.13/pom.xml                   | 5 ++---
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh
index 5816706b982..a0c7de590d7 100755
--- a/jenkins/databricks/init_cudf_udf.sh
+++ b/jenkins/databricks/init_cudf_udf.sh
@@ -20,7 +20,7 @@
 
 set -ex
 
-CUDF_VER=${CUDF_VER:-23.12} # TODO: https://github.com/NVIDIA/spark-rapids/issues/9715
+CUDF_VER=${CUDF_VER:-24.02}
 CUDA_VER=${CUDA_VER:-11.8}
 
 # Need to explicitly add conda into PATH environment, to activate conda environment.
diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh
index 845e9053e67..98894c0d548 100755
--- a/jenkins/version-def.sh
+++ b/jenkins/version-def.sh
@@ -26,7 +26,7 @@ for VAR in $OVERWRITE_PARAMS; do
 done
 IFS=$PRE_IFS
 
-CUDF_VER=${CUDF_VER:-"23.12.0-SNAPSHOT"} # TODO: https://github.com/NVIDIA/spark-rapids/issues/9715
+CUDF_VER=${CUDF_VER:-"24.02.0-SNAPSHOT"}
 CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"}
 PROJECT_VER=${PROJECT_VER:-"24.02.0-SNAPSHOT"}
 PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.02.0-SNAPSHOT"}
diff --git a/pom.xml b/pom.xml
index 21ed8afb262..6cee08f7f89 100644
--- a/pom.xml
+++ b/pom.xml
@@ -635,9 +635,8 @@
         <spark.version.classifier>spark${buildver}</spark.version.classifier>
         <cuda.version>cuda11</cuda.version>
         <jni.classifier>${cuda.version}</jni.classifier>
-        <!--TODO: https://github.com/NVIDIA/spark-rapids/issues/9715 -->
-        <spark-rapids-jni.version>23.12.0-SNAPSHOT</spark-rapids-jni.version>
-        <spark-rapids-private.version>23.12.0-SNAPSHOT</spark-rapids-private.version>
+        <spark-rapids-jni.version>24.02.0-SNAPSHOT</spark-rapids-jni.version>
+        <spark-rapids-private.version>24.02.0-SNAPSHOT</spark-rapids-private.version>
         <scala.binary.version>2.12</scala.binary.version>
         <alluxio.client.version>2.8.0</alluxio.client.version>
         <scala.recompileMode>incremental</scala.recompileMode>
diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml
index 80cac0a1e6c..6276b7721b8 100644
--- a/scala2.13/pom.xml
+++ b/scala2.13/pom.xml
@@ -635,9 +635,8 @@
         <spark.version.classifier>spark${buildver}</spark.version.classifier>
         <cuda.version>cuda11</cuda.version>
         <jni.classifier>${cuda.version}</jni.classifier>
-        <!--TODO: https://github.com/NVIDIA/spark-rapids/issues/9715 -->
-        <spark-rapids-jni.version>23.12.0-SNAPSHOT</spark-rapids-jni.version>
-        <spark-rapids-private.version>23.12.0-SNAPSHOT</spark-rapids-private.version>
+        <spark-rapids-jni.version>24.02.0-SNAPSHOT</spark-rapids-jni.version>
+        <spark-rapids-private.version>24.02.0-SNAPSHOT</spark-rapids-private.version>
         <scala.binary.version>2.13</scala.binary.version>
         <alluxio.client.version>2.8.0</alluxio.client.version>
         <scala.recompileMode>incremental</scala.recompileMode>

From fd05f70824e32174429e7ea11957d0cc90b7173b Mon Sep 17 00:00:00 2001
From: Peixin <pxli@nyu.edu>
Date: Tue, 28 Nov 2023 08:44:29 +0800
Subject: [PATCH 03/15] Skip redundant steps in nightly build [skip ci] (#9857)

* Skip redundant steps in nightly build script

Signed-off-by: Peixin Li <pxli@nyu.edu>

* set non-dist submodules to be deployed for now

Signed-off-by: Peixin Li <pxli@nyu.edu>

* disable history expansion for DEPLOY_SUBMODULES ENV

---------

Signed-off-by: Peixin Li <pxli@nyu.edu>
---
 jenkins/spark-nightly-build.sh | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh
index 969837ee397..7f391db6957 100755
--- a/jenkins/spark-nightly-build.sh
+++ b/jenkins/spark-nightly-build.sh
@@ -102,6 +102,8 @@ function distWithReducedPom {
 
 # option to skip unit tests. Used in our CI to separate test runs in parallel stages
 SKIP_TESTS=${SKIP_TESTS:-"false"}
+set +H # turn off history expansion
+DEPLOY_SUBMODULES=${DEPLOY_SUBMODULES:-"!${DIST_PL}"} # TODO: deploy only required submodules to save time
 for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do
     $MVN -U -B clean install $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \
         -Dcuda.version=$DEFAULT_CUDA_CLASSIFIER \
@@ -117,10 +119,12 @@ for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do
     fi
     distWithReducedPom "install"
     [[ $SKIP_DEPLOY != 'true' ]] && \
-        $MVN -B deploy -pl '!dist' $MVN_URM_MIRROR \
+        # this deploys selected submodules
+        $MVN -B deploy -pl $DEPLOY_SUBMODULES $MVN_URM_MIRROR \
             -Dmaven.repo.local=$M2DIR \
             -Dcuda.version=$DEFAULT_CUDA_CLASSIFIER \
             -DskipTests \
+            -Dmaven.scaladoc.skip -Dmaven.scalastyle.skip=true \
             -Dbuildver="${buildver}"
 done
 
@@ -161,10 +165,11 @@ distWithReducedPom "install"
 if [[ $SKIP_DEPLOY != 'true' ]]; then
     distWithReducedPom "deploy"
 
-    # this deploys submodules except dist that is unconditionally built with Spark 3.1.1
-    $MVN -B deploy -pl '!dist' \
+    # this deploys selected submodules that are unconditionally built with Spark 3.1.1
+    $MVN -B deploy -pl $DEPLOY_SUBMODULES \
         -Dbuildver=$SPARK_BASE_SHIM_VERSION \
-        -DskipTests=$SKIP_TESTS \
+        -DskipTests \
+        -Dmaven.scaladoc.skip -Dmaven.scalastyle.skip=true \
         $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \
         -Dcuda.version=$DEFAULT_CUDA_CLASSIFIER
 fi

From 6f96048be808715a89e12d61d9dae22e48a74f1d Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 28 Nov 2023 16:20:13 +0800
Subject: [PATCH 04/15] Detect multiple jars on the classpath when init plugin
 [databricks] (#9654)

* Detect multiple jars on the classpath when init plugin

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* clean up

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* Apply suggestions from code review

Co-authored-by: Gera Shegalov <gshegalov@nvidia.com>

* print version info and also check jni/cudf

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* add config for allowing multiple jars

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* keep jar path in error messages

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* address comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* Update sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala

Co-authored-by: Gera Shegalov <gshegalov@nvidia.com>

* Update sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala

Co-authored-by: Gera Shegalov <gshegalov@nvidia.com>

* address comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* Use unique properties for intermediate jars

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* clean up

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* address comment

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* Apply suggestions from code review

Co-authored-by: Gera Shegalov <gshegalov@nvidia.com>

* address comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* add the project.artifactId to build-info and check it

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* remove unnecessary copyright update

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* remove log

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* Add 2.13 support

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* use revision to check duplicate jars

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* fix 2.13 build

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* support both SAME_REVISION and NEVER mode

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* Avoid CI change and filter out test

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* check values for config

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* use enum

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* fix two nits

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* Do not print log if no multiple jar

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* ignore subdir when checking multiple jars

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* Update sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala

Co-authored-by: Gera Shegalov <gshegalov@nvidia.com>

* wip ut

* address comment

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

---------

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
Co-authored-by: Gera Shegalov <gshegalov@nvidia.com>
---
 .../com/nvidia/spark/rapids/Plugin.scala      | 67 +++++++++++++++++++
 .../com/nvidia/spark/rapids/RapidsConf.scala  | 27 ++++++++
 2 files changed, 94 insertions(+)

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
index 224a530bf99..2d069eca9ab 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
@@ -17,6 +17,7 @@
 package com.nvidia.spark.rapids
 
 import java.lang.reflect.InvocationTargetException
+import java.net.URL
 import java.time.ZoneId
 import java.util.Properties
 
@@ -25,6 +26,7 @@ import scala.sys.process._
 import scala.util.Try
 
 import ai.rapids.cudf.{Cuda, CudaException, CudaFatalException, CudfException, MemoryCleaner}
+import com.nvidia.spark.rapids.RapidsConf.AllowMultipleJars
 import com.nvidia.spark.rapids.filecache.{FileCache, FileCacheLocalityManager, FileCacheLocalityMsg}
 import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
 import org.apache.commons.lang3.exception.ExceptionUtils
@@ -112,6 +114,67 @@ object RapidsPluginUtils extends Logging {
     }
   }
 
+  // Warns or fails, depending on conf.allowMultipleJars, when more than one jar providing
+  // `propName` is visible to the shim classloader. `jarName` is only used in the message.
+  private def detectMultipleJar(propName: String, jarName: String, conf: RapidsConf): Unit = {
+    // Read eagerly and close the source so the stream opened by Source.fromURL is released.
+    def readLines(url: URL): Seq[String] = {
+      val source = scala.io.Source.fromURL(url)
+      try source.getLines().toList finally source.close()
+    }
+    val classloader = ShimLoader.getShimClassLoader()
+    val possibleRapidsJarURLs = classloader.getResources(propName).asScala.toSet.toSeq.filter {
+      url =>
+        val urlPath = url.toString
+        // Filter out submodule jars, e.g. rapids-4-spark-aggregator_2.12-23.12.0-spark341.jar,
+        // and files stored under subdirs of '!/', e.g.
+        // rapids-4-spark_2.12-23.12.0-cuda11.jar!/spark330/rapids4spark-version-info.properties
+        // We only want to find the main jar, e.g.
+        // rapids-4-spark_2.12-23.12.0-cuda11.jar!/rapids4spark-version-info.properties
+        !urlPath.contains("rapids-4-spark-") && urlPath.endsWith("!/" + propName)
+    }
+    val revisionRegex = "revision=(.*)".r
+    val revisionMap: Map[String, Seq[URL]] = possibleRapidsJarURLs.map { url =>
+      val revision =
+        readLines(url).collectFirst { case revisionRegex(r) => r }.getOrElse("UNKNOWN")
+      (revision, url)
+    }.groupBy(_._1).mapValues(_.map(_._2)).toMap
+    lazy val rapidsJarsVersMsg = revisionMap.map {
+      case (revision, urls) =>
+        s"revision: $revision" + urls.map { url =>
+          "\n\tjar URL: " + url.toString.split("!").head + "\n\t" +
+              readLines(url).mkString("\n\t")
+        }.mkString + "\n"
+    }.mkString
+    // scalastyle:off line.size.limit
+    lazy val msg = s"""Multiple $jarName jars found in the classpath:
+        |$rapidsJarsVersMsg
+        |Please make sure there is only one $jarName jar in the classpath.
+        |If it is impossible to fix the classpath you can suppress the error by setting ${RapidsConf.ALLOW_MULTIPLE_JARS.key} to SAME_REVISION or ALWAYS.
+        """.stripMargin
+    // scalastyle:on line.size.limit
+
+    conf.allowMultipleJars match {
+      case AllowMultipleJars.ALWAYS =>
+        if (revisionMap.size != 1 || revisionMap.values.exists(_.size != 1)) {
+          logWarning(msg)
+        }
+      case AllowMultipleJars.SAME_REVISION =>
+        require(revisionMap.size == 1, msg)
+        if (revisionMap.values.exists(_.size != 1)) {
+          logWarning(msg)
+        }
+      case AllowMultipleJars.NEVER =>
+        require(revisionMap.size == 1 && revisionMap.values.forall(_.size == 1), msg)
+    }
+  }
+
+  def detectMultipleJars(conf: RapidsConf): Unit = {
+    // Checks run in order (plugin, JNI, cudf); a failed check aborts the remaining ones.
+    Seq(PLUGIN_PROPS_FILENAME -> "rapids-4-spark", JNI_PROPS_FILENAME -> "spark-rapids-jni",
+      CUDF_PROPS_FILENAME -> "cudf").foreach { case (p, j) => detectMultipleJar(p, j, conf) }
+  }
+
   // This assumes Apache Spark logic, if CSPs are setting defaults differently, we may need
   // to handle.
   def estimateCoresOnExec(conf: SparkConf): Int = {
@@ -310,6 +373,7 @@ class RapidsDriverPlugin extends DriverPlugin with Logging {
     val sparkConf = pluginContext.conf
     RapidsPluginUtils.fixupConfigsOnDriver(sparkConf)
     val conf = new RapidsConf(sparkConf)
+    RapidsPluginUtils.detectMultipleJars(conf)
     RapidsPluginUtils.logPluginMode(conf)
     GpuCoreDumpHandler.driverInit(sc, conf)
 
@@ -364,6 +428,9 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging {
       val numCores = RapidsPluginUtils.estimateCoresOnExec(sparkConf)
       val conf = new RapidsConf(extraConf.asScala.toMap)
 
+      // Warn or fail, per spark.rapids.sql.allowMultipleJars, on duplicate plugin jars.
+      RapidsPluginUtils.detectMultipleJars(conf)
+
       // Compare if the cudf version mentioned in the classpath is equal to the version which
       // plugin expects. If there is a version mismatch, throw error. This check can be disabled
       // by setting this config spark.rapids.cudfVersionOverride=true
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index 0fbeec17b97..8ef6b11a632 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -1840,6 +1840,22 @@ object RapidsConf {
     .booleanConf
     .createWithDefault(false)
 
+  object AllowMultipleJars extends Enumeration {
+    val ALWAYS, SAME_REVISION, NEVER = Value
+  }
+
+  val ALLOW_MULTIPLE_JARS = conf("spark.rapids.sql.allowMultipleJars")
+    .internal()
+    .startupOnly()
+    .doc("Allow multiple rapids-4-spark, spark-rapids-jni, and cudf jars on the classpath. " +
+      "Spark will take the first one it finds, so the version may not be expected. Possible " +
+      "values are ALWAYS: allow all jars, SAME_REVISION: only allow jars with the same " +
+      "revision, NEVER: do not allow multiple jars at all.")
+    .stringConf
+    .transform(_.toUpperCase(java.util.Locale.ROOT))
+    .checkValues(AllowMultipleJars.values.map(_.toString))
+    .createWithDefault(AllowMultipleJars.SAME_REVISION.toString)
+
   val ALLOW_DISABLE_ENTIRE_PLAN = conf("spark.rapids.allowDisableEntirePlan")
     .internal()
     .doc("The plugin has the ability to detect possibe incompatibility with some specific " +
@@ -2641,6 +2657,17 @@ class RapidsConf(conf: Map[String, String]) extends Logging {
 
   lazy val cudfVersionOverride: Boolean = get(CUDF_VERSION_OVERRIDE)
 
+  // Typed view of ALLOW_MULTIPLE_JARS. The raw setting is upper-cased and validated by the
+  // entry's transform/checkValues, so the fallback below only signals an internal error.
+  lazy val allowMultipleJars: AllowMultipleJars.Value = {
+    val configured = get(ALLOW_MULTIPLE_JARS)
+    val matched = AllowMultipleJars.values.find(_.toString == configured)
+    matched.getOrElse {
+      throw new IllegalArgumentException(s"Internal Error $configured is not supported for " +
+          s"${ALLOW_MULTIPLE_JARS.key}")
+    }
+  }
+
   lazy val allowDisableEntirePlan: Boolean = get(ALLOW_DISABLE_ENTIRE_PLAN)
 
   lazy val useArrowCopyOptimization: Boolean = get(USE_ARROW_OPT)

From 33bd58959192d7d6438cf9521dfe201a8f1494bf Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Wed, 29 Nov 2023 13:56:01 -0600
Subject: [PATCH 05/15] Update for new retry state machine JNI APIs (#9656)

Signed-off-by: Robert (Bobby) Evans <bobby@apache.org>
---
 .../nvidia/spark/rapids/shuffle/ucx/UCX.scala |   4 +-
 .../shuffle/ucx/UCXShuffleTransport.scala     |  20 +-
 .../rapids/AbstractGpuJoinIterator.scala      |   2 +-
 .../spark/rapids/BatchWithPartitionData.scala |   6 +-
 .../spark/rapids/GpuCoalesceBatches.scala     |   4 +-
 .../spark/rapids/GpuDeviceManager.scala       |  20 +-
 .../spark/rapids/GpuMultiFileReader.scala     |   4 +-
 .../nvidia/spark/rapids/GpuParquetScan.scala  |  12 +-
 .../nvidia/spark/rapids/GpuSemaphore.scala    |   6 +-
 .../com/nvidia/spark/rapids/HostAlloc.scala   | 415 +++++-------------
 .../nvidia/spark/rapids/RapidsBuffer.scala    |  19 +-
 .../com/nvidia/spark/rapids/RapidsConf.scala  |   4 +-
 .../rapids/RapidsDeviceMemoryStore.scala      |  28 +-
 .../spark/rapids/RapidsHostMemoryStore.scala  |   2 +-
 .../RapidsShuffleHeartbeatManager.scala       |   4 +-
 .../spark/rapids/RmmRapidsRetryIterator.scala |  91 ++--
 .../spark/rapids/TaskRegistryTracker.scala    |  76 ++++
 .../spark/rapids/basicPhysicalOperators.scala |   4 +-
 .../rapids/shuffle/BufferReceiveState.scala   |   7 +-
 .../rapids/shuffle/RapidsShuffleClient.scala  |   5 +
 .../shuffle/RapidsShuffleIterator.scala       |  30 +-
 .../sql/rapids/RapidsCachingReader.scala      |   2 +-
 .../shuffle/RapidsShuffleIterator.scala       |  23 +-
 .../sql/rapids/RapidsCachingReader.scala      |   2 +-
 .../nvidia/spark/rapids/HostAllocSuite.scala  | 318 ++------------
 .../rapids/BatchWithPartitionDataSuite.scala  |   6 +-
 ...ternalRowToCudfRowIteratorRetrySuite.scala |   4 +-
 .../rapids/GpuCoalesceBatchesRetrySuite.scala |  14 +-
 .../spark/rapids/GpuSortRetrySuite.scala      |  46 +-
 .../spark/rapids/ProjectExprSuite.scala       |  12 +-
 .../spark/rapids/RmmSparkRetrySuiteBase.scala |   5 +-
 .../RowToColumnarIteratorRetrySuite.scala     |   4 +-
 .../spark/rapids/WindowRetrySuite.scala       |  16 +-
 .../nvidia/spark/rapids/WithRetrySuite.scala  |  26 +-
 .../shuffle/RapidsShuffleClientSuite.scala    |  51 +++
 .../shuffle/RapidsShuffleIteratorSuite.scala  |  15 +-
 .../rapids/GpuFileFormatDataWriterSuite.scala |  10 +-
 37 files changed, 540 insertions(+), 777 deletions(-)
 create mode 100644 sql-plugin/src/main/scala/com/nvidia/spark/rapids/TaskRegistryTracker.scala

diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
index bb22fb7ba9d..6a8336f2a4a 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
@@ -108,8 +108,8 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
         .setNameFormat("progress-thread-%d")
         .setDaemon(true)
         .build,
-      () => RmmSpark.associateCurrentThreadWithShuffle(),
-      () => RmmSpark.removeCurrentThreadAssociation()))
+      null,
+      () => RmmSpark.removeAllCurrentThreadAssociation()))
 
   // The pending queues are used to enqueue [[PendingReceive]] or [[PendingSend]], from executor
   // task threads and [[progressThread]] will hand them to the UcpWorker thread.
diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
index 9130d16b945..3a31ae709ec 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
@@ -250,8 +250,8 @@ class UCXShuffleTransport(shuffleServerId: BlockManagerId, rapidsConf: RapidsCon
         .setNameFormat("shuffle-transport-client-exec-%d")
         .setDaemon(true)
         .build,
-      () => RmmSpark.associateCurrentThreadWithShuffle(),
-      () => RmmSpark.removeCurrentThreadAssociation()),
+      null,
+      () => RmmSpark.removeAllCurrentThreadAssociation()),
     // if we can't hand off because we are too busy, block the caller (in UCX's case,
     // the progress thread)
     new CallerRunsAndLogs())
@@ -262,8 +262,8 @@ class UCXShuffleTransport(shuffleServerId: BlockManagerId, rapidsConf: RapidsCon
       .setNameFormat("shuffle-client-copy-thread-%d")
       .setDaemon(true)
       .build,
-      () => RmmSpark.associateCurrentThreadWithShuffle(),
-      () => RmmSpark.removeCurrentThreadAssociation()))
+      null,
+      () => RmmSpark.removeAllCurrentThreadAssociation()))
 
   override def makeClient(blockManagerId: BlockManagerId): RapidsShuffleClient = {
     val peerExecutorId = blockManagerId.executorId.toLong
@@ -286,8 +286,8 @@ class UCXShuffleTransport(shuffleServerId: BlockManagerId, rapidsConf: RapidsCon
       .setNameFormat(s"shuffle-server-conn-thread-${shuffleServerId.executorId}-%d")
       .setDaemon(true)
       .build,
-      () => RmmSpark.associateCurrentThreadWithShuffle(),
-      () => RmmSpark.removeCurrentThreadAssociation()))
+      null,
+      () => RmmSpark.removeAllCurrentThreadAssociation()))
 
   // This is used to queue up on the server all the [[BufferSendState]] as the server waits for
   // bounce buffers to become available (it is the equivalent of the transport's throttle, minus
@@ -297,8 +297,8 @@ class UCXShuffleTransport(shuffleServerId: BlockManagerId, rapidsConf: RapidsCon
       .setNameFormat(s"shuffle-server-bss-thread-%d")
       .setDaemon(true)
       .build,
-      () => RmmSpark.associateCurrentThreadWithShuffle(),
-      () => RmmSpark.removeCurrentThreadAssociation()))
+      null,
+      () => RmmSpark.removeAllCurrentThreadAssociation()))
 
   /**
    * Construct a server instance
@@ -359,8 +359,8 @@ class UCXShuffleTransport(shuffleServerId: BlockManagerId, rapidsConf: RapidsCon
         .setNameFormat(s"shuffle-transport-throttle-monitor")
         .setDaemon(true)
         .build,
-      () => RmmSpark.associateCurrentThreadWithShuffle(),
-      () => RmmSpark.removeCurrentThreadAssociation()))
+      null,
+      () => RmmSpark.removeAllCurrentThreadAssociation()))
 
   // helper class to hold transfer requests that have a bounce buffer
   // and should be ready to be handled by a `BufferReceiveState`
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AbstractGpuJoinIterator.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AbstractGpuJoinIterator.scala
index 77c3bbae7b3..cee705d8f8e 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AbstractGpuJoinIterator.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/AbstractGpuJoinIterator.scala
@@ -144,7 +144,7 @@ abstract class AbstractGpuJoinIterator(
         // This withRetry block will always return an iterator with one ColumnarBatch.
         // The gatherer tracks how many rows we have used already.  The withRestoreOnRetry
         // ensures that we restart at the same place in the gatherer.  In the case of a
-        // SplitAndRetryOOM, we retry with a smaller (halved) targetSize, so we are taking
+        // GpuSplitAndRetryOOM, we retry with a smaller (halved) targetSize, so we are taking
         // less from the gatherer, but because the gatherer tracks how much is used, the
         // next call to this function will start in the right place.
         gather.checkpoint()
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/BatchWithPartitionData.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/BatchWithPartitionData.scala
index 6d640683c07..f6429ddd709 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/BatchWithPartitionData.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/BatchWithPartitionData.scala
@@ -22,7 +22,7 @@ import ai.rapids.cudf.ColumnVector
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
 import com.nvidia.spark.rapids.RmmRapidsRetryIterator.withRetry
-import com.nvidia.spark.rapids.jni.SplitAndRetryOOM
+import com.nvidia.spark.rapids.jni.GpuSplitAndRetryOOM
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types.{StringType, StructType}
@@ -500,8 +500,8 @@ object BatchWithPartitionDataUtils {
       withResource(batchWithPartData) { _ =>
         // Split partition rows data into two halves
         val splitPartitionData = splitPartitionDataInHalf(batchWithPartData.partitionedRowsData)
-        if(splitPartitionData.length < 2) {
-          throw new SplitAndRetryOOM("GPU OutOfMemory: cannot split input with one row")
+        if (splitPartitionData.length < 2) {
+          throw new GpuSplitAndRetryOOM("GPU OutOfMemory: cannot split input with one row")
         }
         // Split the batch into two halves
         withResource(batchWithPartData.inputBatch.getColumnarBatch()) { cb =>
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala
index 23145e56153..e6dc216d7e6 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala
@@ -23,7 +23,7 @@ import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
 import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{withRetry, withRetryNoSplit}
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
-import com.nvidia.spark.rapids.jni.SplitAndRetryOOM
+import com.nvidia.spark.rapids.jni.GpuSplitAndRetryOOM
 import com.nvidia.spark.rapids.shims.{ShimExpression, ShimUnaryExecNode}
 
 import org.apache.spark.TaskContext
@@ -671,7 +671,7 @@ abstract class AbstractGpuCoalesceIterator(
         val it = batchesToCoalesce.batches
         val numBatches = it.length
         if (numBatches <= 1) {
-          throw new SplitAndRetryOOM(s"Cannot split a sequence of $numBatches batches")
+          throw new GpuSplitAndRetryOOM(s"Cannot split a sequence of $numBatches batches")
         }
         val res = it.splitAt(numBatches / 2)
         Seq(BatchesToCoalesce(res._1), BatchesToCoalesce(res._2))
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala
index 7a8a028a7f2..10761eef47b 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDeviceManager.scala
@@ -386,7 +386,7 @@ object GpuDeviceManager extends Logging {
 
   private def initializeOffHeapLimits(gpuId: Int, rapidsConf: Option[RapidsConf]): Unit = {
     val conf = rapidsConf.getOrElse(new RapidsConf(SparkEnv.get.conf))
-    val pinnedSize = if (conf.offHeapLimitEnabled) {
+    val (pinnedSize, nonPinnedLimit) = if (conf.offHeapLimitEnabled) {
       logWarning("OFF HEAP MEMORY LIMITS IS ENABLED. " +
           "THIS IS EXPERIMENTAL FOR NOW USE WITH CAUTION")
       val perTaskOverhead = conf.perTaskOverhead
@@ -425,9 +425,9 @@ object GpuDeviceManager extends Logging {
       } else {
         memoryLimit
       }
-      // TODO need to configure the limits when we have those APIs available, and log what those
-      //  limits are
-      if (confPinnedSize + totalOverhead <= finalMemoryLimit) {
+
+      // Now we need to know the pinned vs non-pinned limits
+      val pinnedLimit = if (confPinnedSize + totalOverhead <= finalMemoryLimit) {
         confPinnedSize
       } else {
         val ret = finalMemoryLimit - totalOverhead
@@ -437,13 +437,23 @@ object GpuDeviceManager extends Logging {
             s"dropping pinned memory to ${ret / 1024 / 1024.0} MiB")
         ret
       }
+      val nonPinnedLimit = finalMemoryLimit - totalOverhead - pinnedLimit
+      logWarning(s"Off Heap Host Memory configured to be " +
+          s"${pinnedLimit / 1024 / 1024.0} MiB pinned, " +
+          s"${nonPinnedLimit / 1024 / 1024.0} MiB non-pinned, and " +
+          s"${totalOverhead / 1024 / 1024.0} MiB of untracked overhead.")
+      (pinnedLimit, nonPinnedLimit)
     } else {
-      conf.pinnedPoolSize
+      (conf.pinnedPoolSize, -1L)
     }
     if (!PinnedMemoryPool.isInitialized && pinnedSize > 0) {
       logInfo(s"Initializing pinned memory pool (${pinnedSize / 1024 / 1024.0} MiB)")
       PinnedMemoryPool.initialize(pinnedSize, gpuId)
     }
+    if (nonPinnedLimit >= 0) {
+      // Host memory limits must be set after the pinned memory pool is initialized
+      HostAlloc.initialize(nonPinnedLimit)
+    }
   }
 
   /**
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala
index e4af2e4b6de..3aa3a2d48a7 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala
@@ -30,7 +30,7 @@ import ai.rapids.cudf.{HostMemoryBuffer, NvtxColor, NvtxRange, Table}
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.GpuMetric.{BUFFER_TIME, FILTER_TIME}
 import com.nvidia.spark.rapids.RapidsPluginImplicits.AutoCloseableProducingSeq
-import com.nvidia.spark.rapids.jni.SplitAndRetryOOM
+import com.nvidia.spark.rapids.jni.GpuSplitAndRetryOOM
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
@@ -1029,7 +1029,7 @@ abstract class MultiFileCoalescingPartitionReaderBase(
    * Set this to a splitter instance when chunked reading is supported
    */
   def chunkedSplit(buffer: HostMemoryBuffer): Seq[HostMemoryBuffer] = {
-    throw new SplitAndRetryOOM("Split is not currently supported")
+    throw new GpuSplitAndRetryOOM("Split is not currently supported")
   }
 
   /**
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala
index 207b6ddaa9b..fcc6c20a42c 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala
@@ -37,7 +37,7 @@ import com.nvidia.spark.rapids.ParquetPartitionReader.{CopyRange, LocalCopy}
 import com.nvidia.spark.rapids.RapidsConf.ParquetFooterReaderType
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
 import com.nvidia.spark.rapids.filecache.FileCache
-import com.nvidia.spark.rapids.jni.{DateTimeRebase, ParquetFooter, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{DateTimeRebase, GpuSplitAndRetryOOM, ParquetFooter}
 import com.nvidia.spark.rapids.shims.{ColumnDefaultValuesShims, GpuParquetCrypto, GpuTypeShims, ParquetLegacyNanoAsLongShims, ParquetSchemaClipShims, ParquetStringPredShims, ReaderUtils, ShimFilePartitionReaderFactory, SparkShimImpl}
 import org.apache.commons.io.IOUtils
 import org.apache.commons.io.output.{CountingOutputStream, NullOutputStream}
@@ -248,7 +248,7 @@ object GpuParquetScan {
 
   /**
    * Check that we can split the targetBatchSize and then return a split targetBatchSize. This
-   * is intended to be called from the SplitAndRetryOOM handler for all implementations of
+   * is intended to be called from the GpuSplitAndRetryOOM handler for all implementations of
    * the parquet reader
    * @param targetBatchSize the current target batch size.
    * @param useChunkedReader if chunked reading is enabled. This only works if chunked reading is
@@ -257,13 +257,13 @@ object GpuParquetScan {
    */
   def splitTargetBatchSize(targetBatchSize: Long, useChunkedReader: Boolean): Long = {
     if (!useChunkedReader) {
-      throw new SplitAndRetryOOM("GPU OutOfMemory: could not split inputs " +
-        "chunked parquet reader is configured off")
+      throw new GpuSplitAndRetryOOM("GPU OutOfMemory: could not split inputs " +
+          "chunked parquet reader is configured off")
     }
     val ret = targetBatchSize / 2
     if (targetBatchSize < minTargetBatchSizeMiB * 1024 * 1024) {
-      throw new SplitAndRetryOOM("GPU OutOfMemory: could not split input " +
-        s"target batch size to less than $minTargetBatchSizeMiB MiB")
+      throw new GpuSplitAndRetryOOM("GPU OutOfMemory: could not split input " +
+          s"target batch size to less than $minTargetBatchSizeMiB MiB")
     }
     ret
   }
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSemaphore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSemaphore.scala
index e3140e1f392..84baf7e9708 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSemaphore.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuSemaphore.scala
@@ -24,7 +24,6 @@ import scala.collection.mutable.ArrayBuffer
 
 import ai.rapids.cudf.{NvtxColor, NvtxRange}
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
-import com.nvidia.spark.rapids.jni.RmmSpark
 
 import org.apache.spark.TaskContext
 import org.apache.spark.internal.Logging
@@ -269,6 +268,8 @@ private final class GpuSemaphore() extends Logging {
   private val tasks = new ConcurrentHashMap[Long, SemaphoreTaskInfo]
 
   def acquireIfNecessary(context: TaskContext): Unit = {
+    // Make sure that the thread/task is registered before we try and block
+    TaskRegistryTracker.registerThreadForRetry()
     GpuTaskMetrics.get.semWaitTime {
       val taskAttemptId = context.taskAttemptId()
       val taskInfo = tasks.computeIfAbsent(taskAttemptId, _ => {
@@ -276,7 +277,6 @@ private final class GpuSemaphore() extends Logging {
         new SemaphoreTaskInfo()
       })
       taskInfo.blockUntilReady(semaphore)
-      RmmSpark.associateCurrentThreadWithTask(taskAttemptId)
       GpuDeviceManager.initializeFromTask()
     }
   }
@@ -286,7 +286,6 @@ private final class GpuSemaphore() extends Logging {
     try {
       val taskAttemptId = context.taskAttemptId()
       GpuTaskMetrics.get.updateRetry(taskAttemptId)
-      RmmSpark.removeCurrentThreadAssociation()
       val taskInfo = tasks.get(taskAttemptId)
       if (taskInfo != null) {
         taskInfo.releaseSemaphore(semaphore)
@@ -299,7 +298,6 @@ private final class GpuSemaphore() extends Logging {
   def completeTask(context: TaskContext): Unit = {
     val taskAttemptId = context.taskAttemptId()
     GpuTaskMetrics.get.updateRetry(taskAttemptId)
-    RmmSpark.taskDone(taskAttemptId)
     val refs = tasks.remove(taskAttemptId)
     if (refs == null) {
       throw new IllegalStateException(s"Completion of unknown task $taskAttemptId")
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostAlloc.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostAlloc.scala
index 587bffe7ebc..3da4a1191b1 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostAlloc.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/HostAlloc.scala
@@ -16,16 +16,13 @@
 
 package com.nvidia.spark.rapids
 
-import java.util.Comparator
+import ai.rapids.cudf.{HostMemoryBuffer, MemoryBuffer, PinnedMemoryPool}
+import com.nvidia.spark.rapids.jni.RmmSpark
 
-import ai.rapids.cudf.{ColumnView, HostMemoryBuffer, HostMemoryReservation, MemoryBuffer, PinnedMemoryPool}
-import com.nvidia.spark.rapids.HostAlloc.align
+import org.apache.spark.internal.Logging
 
-import org.apache.spark.TaskContext
-
-private class HostAlloc(nonPinnedLimit: Long) {
+private class HostAlloc(nonPinnedLimit: Long) extends Logging {
   private var currentNonPinnedAllocated: Long = 0L
-  private var currentNonPinnedReserved: Long = 0L
   private val pinnedLimit: Long = PinnedMemoryPool.getTotalPoolSizeBytes
   // For now we are going to assume that we are the only ones calling into the pinned pool
   // That is not really true, but should be okay.
@@ -33,52 +30,13 @@ private class HostAlloc(nonPinnedLimit: Long) {
   private val isUnlimited = nonPinnedLimit < 0
   private val isPinnedOnly = nonPinnedLimit == 0
 
-  private val compareBlocks = new Comparator[BlockedAllocation] {
-    override def compare(a: BlockedAllocation, b: BlockedAllocation): Int = {
-      java.lang.Long.compare(a.taskId, b.taskId)
-    }
-  }
-
-  /**
-   * Host memory allocations that are still pending.
-   */
-  private val pendingAllowedQueue = new HashedPriorityQueue[BlockedAllocation](100, compareBlocks)
-
-  /**
-   * An allocation that has not been completed yet. It is blocked waiting for more resources.
-   */
-  private class BlockedAllocation(val amount: Long, val taskId: Long) {
-    private var shouldWake = false
-
-    def isReady: Boolean = shouldWake
-
-    /**
-     * Wait until we should retry the allocation because it might succeed. It is not
-     * guaranteed though.
-     * It is required that the parent lock is held before this is called.
-     */
-    def waitUntilPossiblyReady(): Unit = {
-      while (!shouldWake) {
-        HostAlloc.this.wait(1000)
-      }
-    }
-
-    /**
-     * Wake up all threads that are blocked waiting for an allocation.
-     */
-    def wakeUpItMightBeWorthIt(): Unit = {
-      shouldWake = true
-      HostAlloc.this.notifyAll()
-    }
-  }
-
   /**
    * A callback class so we know when a non-pinned host buffer was released
    */
-  private class OnCloseCallback(amount: Long) extends MemoryBuffer.EventHandler {
+  private class OnCloseCallback(ptr: Long, amount: Long) extends MemoryBuffer.EventHandler {
     override def onClosed(refCount: Int): Unit = {
       if (refCount == 0) {
-        releaseNonPinned(amount)
+        releaseNonPinned(ptr, amount)
       }
     }
   }
@@ -86,152 +44,26 @@ private class HostAlloc(nonPinnedLimit: Long) {
   /**
    * A callback so we know when a pinned host buffer was released.
    */
-  private class OnPinnedCloseCallback(amount: Long) extends MemoryBuffer.EventHandler {
+  private class OnPinnedCloseCallback(ptr: Long, amount: Long) extends MemoryBuffer.EventHandler {
     override def onClosed(refCount: Int): Unit = {
       if (refCount == 0) {
-        releasePinned(amount)
-      }
-    }
-  }
-
-  /**
-   * A wrapper around a pinned memory reservation so we can add in callbacks as needed.
-   */
-  private class WrappedPinnedReservation(val wrap: HostMemoryReservation)
-    extends HostMemoryReservation {
-
-    private def addEventHandlerAndUpdateMetrics(b: HostMemoryBuffer): HostMemoryBuffer =
-      synchronized {
-        val amount = b.getLength
-        currentPinnedAllocated += amount
-        // I need callbacks for the pinned
-        HostAlloc.addEventHandler(b, new OnPinnedCloseCallback(amount))
-        b
-      }
-
-    override def allocate(amount: Long, preferPinned: Boolean): HostMemoryBuffer =
-      addEventHandlerAndUpdateMetrics(wrap.allocate(amount, preferPinned))
-
-    override def allocate(amount: Long): HostMemoryBuffer =
-      addEventHandlerAndUpdateMetrics(wrap.allocate(amount))
-
-    override def close(): Unit = wrap.close()
-  }
-
-  /**
-   * A non-pinned host memory reservation.
-   */
-  private class NonPinnedReservation(var reservedAmount: Long) extends HostMemoryReservation {
-    override def allocate(amount: Long, preferPinned: Boolean): HostMemoryBuffer = {
-      allocate(amount)
-    }
-
-    override def allocate(amount: Long): HostMemoryBuffer = synchronized {
-      if (amount > reservedAmount) {
-        throw new OutOfMemoryError("Could not allocate. Remaining memory reservation is " +
-          s"too small $amount out of $reservedAmount")
-      }
-      val buff = allocNonPinnedFromReserved(amount)
-      reservedAmount -= align(buff.getLength)
-      buff
-    }
-
-    override def close(): Unit = synchronized {
-      releaseNonPinnedReservation(reservedAmount)
-      reservedAmount = 0
-    }
-  }
-
-  /**
-   * A reservation for the special mode when there are no host memory limits.
-   */
-  private object UnlimitedReservation extends HostMemoryReservation {
-    override def allocate(amount: Long, preferPinned: Boolean): HostMemoryBuffer =
-      HostAlloc.alloc(amount, preferPinned)
-
-    override def allocate(amount: Long): HostMemoryBuffer =
-      HostAlloc.alloc(amount)
-
-    override def close(): Unit = {
-      // NOOP
-    }
-  }
-
-  /**
-   * Wake up any blocked allocation that are still pending up to the amount that has been freed.
-   * Note that this assume that there is no fragmentation that might prevent an allocation from
-   * succeeding.
-   * @param amountLeftToWakeInput the amount of memory that is available in bytes.
-   * @return true if anything was woken up, else false.
-   */
-  private def wakeUpAsNeeded(amountLeftToWakeInput: Long): Boolean = synchronized {
-    var amountLeftToWake = amountLeftToWakeInput
-    var ret = false
-    while (amountLeftToWake > 0 && !pendingAllowedQueue.isEmpty) {
-      val peek = pendingAllowedQueue.peek()
-      if (peek.amount <= amountLeftToWake) {
-        val head = pendingAllowedQueue.poll()
-        amountLeftToWake -= head.amount
-        head.wakeUpItMightBeWorthIt()
-        ret = true
-      } else {
-        return ret
+        releasePinned(ptr, amount)
       }
     }
-    ret
-  }
-
-  private def wakeUpPinned(): Boolean = synchronized {
-    val amountLeftToWake = pinnedLimit - currentPinnedAllocated
-    wakeUpAsNeeded(amountLeftToWake)
-  }
-
-  private def wakeUpNonPinned(): Boolean = synchronized {
-    val amountLeftToWake = nonPinnedLimit - (currentNonPinnedAllocated + currentNonPinnedReserved)
-    wakeUpAsNeeded(amountLeftToWake)
-  }
-
-  private def releasePinned(amount: Long): Unit = synchronized {
-    currentPinnedAllocated -= amount
-    if (wakeUpPinned()) {
-      wakeUpNonPinned()
-    }
-  }
-
-  private def releaseNonPinned(amount: Long): Unit = synchronized {
-    currentNonPinnedAllocated -= amount
-    if (wakeUpNonPinned()) {
-      wakeUpPinned()
-    }
   }
 
-  private def releaseNonPinnedReservation(reservedAmount: Long): Unit = synchronized {
-    currentNonPinnedReserved -= reservedAmount
-    if (wakeUpPinned()) {
-      wakeUpNonPinned()
+  private def releasePinned(ptr: Long, amount: Long): Unit = {
+    synchronized {
+      currentPinnedAllocated -= amount
     }
+    RmmSpark.cpuDeallocate(ptr, amount)
   }
 
-  private def tryReservePinned(amount: Long): Option[HostMemoryReservation] = {
-    val ret = Option(PinnedMemoryPool.tryReserve(amount))
-    ret.map { reservation =>
-      new WrappedPinnedReservation(reservation)
-    }
-  }
-
-  private def tryReserveNonPinned(amount: Long): Option[HostMemoryReservation] = {
-    if (isUnlimited) {
-      Some(UnlimitedReservation)
-    } else {
-      synchronized {
-        if ((currentNonPinnedAllocated + currentNonPinnedReserved + amount) <= nonPinnedLimit) {
-          currentNonPinnedReserved += amount
-          Some(new NonPinnedReservation(amount))
-        } else {
-          None
-        }
-      }
+  private def releaseNonPinned(ptr: Long, amount: Long): Unit = {
+    synchronized {
+      currentNonPinnedAllocated -= amount
     }
+    RmmSpark.cpuDeallocate(ptr, amount)
   }
 
   private def tryAllocPinned(amount: Long): Option[HostMemoryBuffer] = {
@@ -240,34 +72,20 @@ private class HostAlloc(nonPinnedLimit: Long) {
       synchronized {
         currentPinnedAllocated += amount
       }
-      HostAlloc.addEventHandler(b, new OnPinnedCloseCallback(amount))
+      HostAlloc.addEventHandler(b, new OnPinnedCloseCallback(b.getAddress, amount))
     }
     ret
   }
 
-  private def allocNonPinnedFromReserved(amount: Long): HostMemoryBuffer = {
+  private def tryAllocNonPinned(amount: Long): Option[HostMemoryBuffer] = {
     val ret = if (isUnlimited) {
-      HostMemoryBuffer.allocate(amount, false)
-    } else {
       synchronized {
-        currentNonPinnedReserved -= amount
         currentNonPinnedAllocated += amount
-        HostMemoryBuffer.allocate(amount, false)
       }
-    }
-    if (ret == null) {
-      throw new OutOfMemoryError(s"Internal Error: could not allocate non-pinned memory $amount")
-    }
-
-    HostAlloc.addEventHandler(ret, new OnCloseCallback(amount))
-  }
-
-  private def tryAllocNonPinned(amount: Long): Option[HostMemoryBuffer] = {
-    val ret = if (isUnlimited) {
       Some(HostMemoryBuffer.allocate(amount, false))
     } else {
       synchronized {
-        if ((currentNonPinnedAllocated + currentNonPinnedReserved + amount) <= nonPinnedLimit) {
+        if ((currentNonPinnedAllocated + amount) <= nonPinnedLimit) {
           currentNonPinnedAllocated += amount
           Some(HostMemoryBuffer.allocate(amount, false))
         } else {
@@ -276,117 +94,129 @@ private class HostAlloc(nonPinnedLimit: Long) {
       }
     }
     ret.foreach { b =>
-      HostAlloc.addEventHandler(b, new OnCloseCallback(amount))
+      HostAlloc.addEventHandler(b, new OnCloseCallback(b.getAddress, amount))
     }
     ret
   }
 
-  private def canNeverSucceed(amount: Long, preferPinned: Boolean): Boolean = {
+  private def canNeverSucceed(amount: Long, preferPinned: Boolean): Boolean = synchronized {
     val pinnedFailed = (isPinnedOnly || preferPinned) && (amount > pinnedLimit)
     val nonPinnedFailed = isPinnedOnly || (amount > nonPinnedLimit)
     !isUnlimited && pinnedFailed && nonPinnedFailed
   }
 
-  private def checkSize(amount: Long, preferPinned: Boolean): Unit = {
+  private def checkSize(amount: Long, preferPinned: Boolean): Unit = synchronized {
     if (canNeverSucceed(amount, preferPinned)) {
       throw new IllegalArgumentException(s"The amount requested $amount is larger than the " +
           s"maximum pool size ${math.max(pinnedLimit, nonPinnedLimit)}")
     }
   }
 
-  def tryAlloc(amount: Long, preferPinned: Boolean = true): Option[HostMemoryBuffer] = {
-    checkSize(amount, preferPinned)
-    val firstPass = if (preferPinned) {
-      tryAllocPinned(amount)
+  private def spillAndCheckRetry(allocSize: Long, retryCount: Long): Boolean = {
+    // check arguments for good measure
+    require(allocSize >= 0,
+      s"spillAndCheckRetry invoked with invalid allocSize $allocSize")
+
+    require(retryCount >= 0,
+      s"spillAndCheckRetry invoked with invalid retryCount $retryCount")
+
+    val store = RapidsBufferCatalog.getHostStorage
+    val storeSize = store.currentSize
+    val storeSpillableSize = store.currentSpillableSize
+    val totalSize: Long = synchronized {
+      currentPinnedAllocated + currentNonPinnedAllocated
+    }
+
+    val attemptMsg = if (retryCount > 0) {
+      s"Attempt $retryCount"
     } else {
-      tryAllocNonPinned(amount)
+      "First attempt"
     }
-    firstPass.orElse {
-      if (preferPinned) {
-        tryAllocNonPinned(amount)
-      } else {
-        tryAllocPinned(amount)
+
+    logInfo(s"Host allocation of $allocSize bytes failed, host store has " +
+        s"$storeSize total and $storeSpillableSize spillable bytes. $attemptMsg.")
+    if (storeSpillableSize == 0) {
+      logWarning(s"Host store exhausted, unable to allocate $allocSize bytes. " +
+          s"Total RMM allocated is $totalSize bytes.")
+      false
+    } else {
+      val targetSize = Math.max(storeSpillableSize - allocSize, 0)
+      logDebug(s"Targeting host store size of $targetSize bytes")
+      // We could not make it work so try and spill enough to make it work
+      val maybeAmountSpilled =
+        RapidsBufferCatalog.synchronousSpill(RapidsBufferCatalog.getHostStorage, allocSize)
+      maybeAmountSpilled.foreach { amountSpilled =>
+        logInfo(s"Spilled $amountSpilled bytes from the host store")
       }
+      true
     }
   }
 
-  def alloc(amount: Long, preferPinned: Boolean = true): HostMemoryBuffer = synchronized {
-    var ret: Option[HostMemoryBuffer] = None
-    var blocked: BlockedAllocation = null
-    do {
-      ret = tryAlloc(amount, preferPinned)
-      if (ret.isEmpty) {
-        blocked = new BlockedAllocation(amount, TaskContext.get().taskAttemptId())
-        pendingAllowedQueue.offer(blocked)
-        var amountSpilled: Option[Long] = None
-        // None for amountSpilled means we need to retry because of a race.
-        // forall returns true for None in this case.
-        while(!blocked.isReady && amountSpilled.forall(_ > 0)) {
-          amountSpilled = RapidsBufferCatalog.synchronousSpill(
-            RapidsBufferCatalog.getHostStorage, amount)
+  private def tryAllocInternal(amount: Long,
+      preferPinned: Boolean,
+      blocking: Boolean): (Option[HostMemoryBuffer], Boolean) = {
+    var retryCount = 0L
+    var ret = Option.empty[HostMemoryBuffer]
+    var shouldRetry = false
+    var shouldRetryInternal = true
+    val isRecursive = RmmSpark.preCpuAlloc(amount, blocking)
+    var allocAttemptFinishedWithoutException = false
+    try {
+      do {
+        val firstPass = if (preferPinned) {
+          tryAllocPinned(amount)
+        } else {
+          tryAllocNonPinned(amount)
         }
-        // Wait until we think we are ready to allocate something
-        blocked.waitUntilPossiblyReady()
-      }
-    } while(ret.isEmpty)
-    ret.get
-  }
-
-  /**
-   * Allocate a buffer at the highest priority possible. If the allocation cannot happen
-   * for whatever reason a None is returned instead of blocking
-   */
-  def allocHighPriority(amount: Long,
-      preferPinned: Boolean = true): Option[HostMemoryBuffer] = synchronized {
-    var ret: Option[HostMemoryBuffer] = None
-    if (!canNeverSucceed(amount, preferPinned)) {
-      ret = tryAlloc(amount, preferPinned)
-      if (ret.isEmpty) {
-        val blocked = new BlockedAllocation(amount, Long.MinValue)
-        pendingAllowedQueue.offer(blocked)
-        var amountSpilled: Option[Long] = None
-        // None for amountSpilled means we need to retry because of a race.
-        // forall returns true for None in this case.
-        while (!blocked.isReady && amountSpilled.forall(_ > 0)) {
-          amountSpilled = RapidsBufferCatalog.synchronousSpill(
-            RapidsBufferCatalog.getHostStorage, amount)
+        ret = firstPass.orElse {
+          if (preferPinned) {
+            tryAllocNonPinned(amount)
+          } else {
+            tryAllocPinned(amount)
+          }
         }
-
-        if (blocked.isReady) {
-          ret = tryAlloc(amount, preferPinned)
-        } else {
-          pendingAllowedQueue.remove(blocked)
+        if (ret.isEmpty) {
+          // We could not make it work so try and spill enough to make it work
+          shouldRetryInternal = spillAndCheckRetry(amount, retryCount)
+          if (shouldRetryInternal) {
+            retryCount += 1
+          }
         }
+      } while(ret.isEmpty && shouldRetryInternal && retryCount < 10)
+      allocAttemptFinishedWithoutException = true
+    } finally {
+      if (ret.isDefined) {
+        RmmSpark.postCpuAllocSuccess(ret.get.getAddress, amount, blocking, isRecursive)
+      } else {
+        // shouldRetry should indicate if spill did anything for us and we should try again.
+        shouldRetry = RmmSpark.postCpuAllocFailed(allocAttemptFinishedWithoutException,
+          blocking, isRecursive)
       }
     }
+    (ret, shouldRetry)
+  }
+
+  def tryAlloc(amount: Long, preferPinned: Boolean = true): Option[HostMemoryBuffer] = {
+    if (canNeverSucceed(amount, preferPinned)) {
+      return None
+    }
+    var shouldRetry = true
+    var ret = Option.empty[HostMemoryBuffer]
+    while (shouldRetry) {
+      val (r, sr) = tryAllocInternal(amount, preferPinned, blocking = false)
+      ret = r
+      shouldRetry = sr
+    }
     ret
   }
 
-  def reserve(amount: Long, preferPinned: Boolean): HostMemoryReservation = synchronized {
-    var ret: Option[HostMemoryReservation] = None
-    var blocked: BlockedAllocation = null
-    do {
-      checkSize(amount, preferPinned)
-      val firstPass = if (preferPinned) {
-        tryReservePinned(amount)
-      } else {
-        tryReserveNonPinned(amount)
-      }
-      ret = firstPass.orElse {
-        if (preferPinned) {
-          tryReserveNonPinned(amount)
-        } else {
-          tryReservePinned(amount)
-        }
-      }
-      if (ret.isEmpty) {
-        if (blocked == null) {
-          blocked = new BlockedAllocation(amount, TaskContext.get().taskAttemptId())
-        }
-        pendingAllowedQueue.offer(blocked)
-        blocked.waitUntilPossiblyReady()
-      }
-    } while (ret.isEmpty)
+  def alloc(amount: Long, preferPinned: Boolean = true): HostMemoryBuffer = {
+    checkSize(amount, preferPinned)
+    var ret = Option.empty[HostMemoryBuffer]
+    while (ret.isEmpty) {
+      val (r, _) = tryAllocInternal(amount, preferPinned, blocking = true)
+      ret = r
+    }
     ret.get
   }
 }
@@ -395,11 +225,6 @@ private class HostAlloc(nonPinnedLimit: Long) {
  * A new API for host memory allocation. This can be used to limit the amount of host memory.
  */
 object HostAlloc {
-  private val ALIGNMENT = ColumnView.hostPaddingSizeInBytes
-  private def align(amount: Long): Long = {
-    ((amount + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT
-  }
-
   private var singleton: HostAlloc = new HostAlloc(-1)
 
   private def getSingleton: HostAlloc = synchronized {
@@ -418,20 +243,6 @@ object HostAlloc {
     getSingleton.alloc(amount, preferPinned)
   }
 
-  /**
-   * Allocate a HostMemoryBuffer, but at the highest priority. This will not block for a free. It
-   * may spill data to make room for the allocation, but it will do it at the highest priority.
-   * If we cannot make it work, then a None will be returned an whoever tries to use this needs
-   * a backup plan.
-   */
-  def allocHighPriority(amount: Long, preferPinned: Boolean = true): Option[HostMemoryBuffer] = {
-    getSingleton.allocHighPriority(amount, preferPinned)
-  }
-
-  def reserve(amount: Long, preferPinned: Boolean = true): HostMemoryReservation = {
-    getSingleton.reserve(amount, preferPinned)
-  }
-
   def addEventHandler(buff: HostMemoryBuffer,
                       handler: MemoryBuffer.EventHandler): HostMemoryBuffer = {
     buff.synchronized {
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBuffer.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBuffer.scala
index 9e7a0eb7a47..657cbb33dd0 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBuffer.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBuffer.scala
@@ -130,11 +130,17 @@ class ChunkedPacker(
     tableMeta
   }
 
-  override def hasNext: Boolean = {
-    !closed && chunkedPack.hasNext
+  override def hasNext: Boolean = synchronized {
+    if (closed) {
+      throw new IllegalStateException(s"ChunkedPacker for $id is closed")
+    }
+    chunkedPack.hasNext
   }
 
-  def next(): MemoryBuffer = {
+  def next(): MemoryBuffer = synchronized {
+    if (closed) {
+      throw new IllegalStateException(s"ChunkedPacker for $id is closed")
+    }
     val bytesWritten = chunkedPack.next(bounceBuffer)
     // we increment the refcount because the caller has no idea where
     // this memory came from, so it should close it.
@@ -171,7 +177,7 @@ class RapidsBufferCopyIterator(buffer: RapidsBuffer)
     extends Iterator[MemoryBuffer] with AutoCloseable with Logging {
 
   private val chunkedPacker: Option[ChunkedPacker] = if (buffer.supportsChunkedPacker) {
-    Some(buffer.getChunkedPacker)
+    Some(buffer.makeChunkedPacker)
   } else {
     None
   }
@@ -285,7 +291,10 @@ trait RapidsBuffer extends AutoCloseable {
 
   val supportsChunkedPacker: Boolean = false
 
-  def getChunkedPacker: ChunkedPacker = {
+  /**
+   * Makes a new chunked packer. It is the responsibility of the caller to close this.
+   */
+  def makeChunkedPacker: ChunkedPacker = {
     throw new NotImplementedError("not implemented for this store")
   }
 
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index 2f0a1437e5e..c36da45dd78 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -1379,8 +1379,8 @@ object RapidsConf {
   // INTERNAL TEST AND DEBUG CONFIGS
 
   val TEST_RETRY_OOM_INJECTION_ENABLED = conf("spark.rapids.sql.test.injectRetryOOM")
-    .doc("Only to be used in tests. If enabled the retry iterator will inject a RetryOOM " +
-         "once per invocation.")
+    .doc("Only to be used in tests. If enabled the retry iterator will inject a GpuRetryOOM " +
+         "or CpuRetryOOM once per invocation.")
     .internal()
     .booleanConf
     .createWithDefault(false)
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDeviceMemoryStore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDeviceMemoryStore.scala
index f0dfd4a53a6..c56806bc965 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDeviceMemoryStore.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsDeviceMemoryStore.scala
@@ -234,14 +234,6 @@ class RapidsDeviceMemoryStore(
 
     override val supportsChunkedPacker: Boolean = true
 
-    private var initializedChunkedPacker: Boolean = false
-
-    lazy val chunkedPacker: ChunkedPacker = {
-      val packer = new ChunkedPacker(id, table, chunkedPackBounceBuffer)
-      initializedChunkedPacker = true
-      packer
-    }
-
     // This is the current size in batch form. It is to be used while this
     // table hasn't migrated to another store.
     private val unpackedSizeInBytes: Long = GpuColumnVector.getTotalDeviceMemoryUsed(table)
@@ -264,17 +256,20 @@ class RapidsDeviceMemoryStore(
       table.close()
     }
 
-    override def meta: TableMeta = {
-      chunkedPacker.getMeta
+    private lazy val (cachedMeta, cachedPackedSize) = {
+      withResource(makeChunkedPacker) { cp =>
+        (cp.getMeta, cp.getTotalContiguousSize)
+      }
     }
 
+    override def meta: TableMeta = cachedMeta
+
     override val memoryUsedBytes: Long = unpackedSizeInBytes
 
-    override def getPackedSizeBytes: Long = getChunkedPacker.getTotalContiguousSize
+    override def getPackedSizeBytes: Long = cachedPackedSize
 
-    override def getChunkedPacker: ChunkedPacker = {
-      chunkedPacker
-    }
+    override def makeChunkedPacker: ChunkedPacker =
+      new ChunkedPacker(id, table, chunkedPackBounceBuffer)
 
     /**
      * Mark a column as spillable
@@ -329,10 +324,6 @@ class RapidsDeviceMemoryStore(
       // lets remove our handler from the chain of handlers for each column
       removeOnCloseEventHandler()
       super.free()
-      if (initializedChunkedPacker) {
-        chunkedPacker.close()
-        initializedChunkedPacker = false
-      }
     }
 
     private def registerOnCloseEventHandler(): Unit = {
@@ -513,6 +504,7 @@ class RapidsDeviceMemoryStore(
       }
       written
     }
+
   }
   override def close(): Unit = {
     try {
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsHostMemoryStore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsHostMemoryStore.scala
index d0aa7c413d3..cdcdfea9715 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsHostMemoryStore.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsHostMemoryStore.scala
@@ -122,7 +122,7 @@ class RapidsHostMemoryStore(
       withResource(other.getCopyIterator) { otherBufferIterator =>
         val isChunked = otherBufferIterator.isChunked
         val totalCopySize = otherBufferIterator.getTotalCopySize
-        closeOnExcept(HostAlloc.allocHighPriority(totalCopySize)) { hb =>
+        closeOnExcept(HostAlloc.tryAlloc(totalCopySize)) { hb =>
           hb.map { hostBuffer =>
             val spillNs = GpuTaskMetrics.get.spillToHostTime {
               var hostOffset = 0L
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala
index 4655c64cbbb..dcd99baa480 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala
@@ -173,8 +173,8 @@ class RapidsShuffleHeartbeatEndpoint(pluginContext: PluginContext, conf: RapidsC
         .setNameFormat("rapids-shuffle-hb")
         .setDaemon(true)
         .build(),
-        () => RmmSpark.associateCurrentThreadWithShuffle(),
-        () => RmmSpark.removeCurrentThreadAssociation()))
+        null,
+        () => RmmSpark.removeAllCurrentThreadAssociation()))
 
   private class InitializeShuffleManager(ctx: PluginContext,
       shuffleManager: RapidsShuffleInternalManagerBase) extends Runnable {
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RmmRapidsRetryIterator.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RmmRapidsRetryIterator.scala
index 8be9f37fa55..78fe8c7b31d 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RmmRapidsRetryIterator.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RmmRapidsRetryIterator.scala
@@ -24,7 +24,7 @@ import com.nvidia.spark.Retryable
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
-import com.nvidia.spark.rapids.jni.{RetryOOM, RmmSpark, RmmSparkThreadState, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{CpuRetryOOM, CpuSplitAndRetryOOM, GpuRetryOOM, GpuSplitAndRetryOOM, RmmSpark, RmmSparkThreadState}
 
 import org.apache.spark.TaskContext
 import org.apache.spark.internal.Logging
@@ -186,33 +186,37 @@ object RmmRapidsRetryIterator extends Logging {
   }
 
   /**
-   * Returns a tuple of (shouldRetry, shouldSplit) depending the exception
+   * Returns a tuple of (shouldRetry, shouldSplit, isFromGpuOom) depending on the exception
    * passed
    */
-  private def isRetryOrSplitAndRetry(ex: Throwable): (Boolean, Boolean) = {
+  private def isRetryOrSplitAndRetry(ex: Throwable): (Boolean, Boolean, Boolean) = {
     ex match {
-      case _: RetryOOM => (true, false)
-      case _: SplitAndRetryOOM => (true, true)
-      case _ => (false, false)
+      case _: GpuRetryOOM => (true, false, true)
+      case _: CpuRetryOOM => (true, false, false)
+      case _: GpuSplitAndRetryOOM => (true, true, true)
+      case _: CpuSplitAndRetryOOM => (true, true, false)
+      case _ => (false, false, false)
     }
   }
 
   /**
-   * Returns a tuple of (causedByRetry, causedBySplit) depending the exception
+   * Returns a tuple of (causedByRetry, causedBySplit, isFromGpuOom) depending on the exception
    * passed
    */
-  private def causedByRetryOrSplit(ex: Throwable): (Boolean, Boolean) = {
+  private def causedByRetryOrSplit(ex: Throwable): (Boolean, Boolean, Boolean) = {
     var current = ex
     var causedByRetry = false
     var causedBySplit = false
+    var isFromGpuOom = false
     // check if there is a hidden retry or split OOM
     while (current != null && !causedByRetry) {
       current = current.getCause()
-      val (isRetry, isSplit) = isRetryOrSplitAndRetry(current)
+      val (isRetry, isSplit, isGpuOom) = isRetryOrSplitAndRetry(current)
       causedByRetry = isRetry
       causedBySplit = causedBySplit || isSplit
+      isFromGpuOom = isGpuOom
     }
-    (causedByRetry, causedBySplit)
+    (causedByRetry, causedBySplit, isFromGpuOom)
   }
 
   private def isColumnSizeOverflow(ex: Throwable): Boolean =
@@ -242,7 +246,7 @@ object RmmRapidsRetryIterator extends Logging {
     } catch {
       case ex: Throwable =>
         // Only restore on retry exceptions
-        val (topLevelIsRetry, _) = isRetryOrSplitAndRetry(ex)
+        val (topLevelIsRetry, _, _) = isRetryOrSplitAndRetry(ex)
         if (topLevelIsRetry || causedByRetryOrSplit(ex)._1 || isOrCausedByColumnSizeOverflow(ex)) {
           r.restore()
         }
@@ -269,7 +273,7 @@ object RmmRapidsRetryIterator extends Logging {
     } catch {
       case ex: Throwable =>
         // Only restore on retry exceptions
-        val (topLevelIsRetry, _) = isRetryOrSplitAndRetry(ex)
+        val (topLevelIsRetry, _, _) = isRetryOrSplitAndRetry(ex)
         if (topLevelIsRetry || causedByRetryOrSplit(ex)._1 || isOrCausedByColumnSizeOverflow(ex)) {
           r.foreach(_.restore())
         }
@@ -342,14 +346,17 @@ object RmmRapidsRetryIterator extends Logging {
     override def hasNext: Boolean
 
     /**
-     * Split is a function that is invoked by `RmmRapidsRetryIterator` when `SplitAndRetryOOM`
+     * Split is a function that is invoked by `RmmRapidsRetryIterator` when `GpuSplitAndRetryOOM`
+     * or `CpuSplitAndRetryOOM`
      * is thrown. This function is implemented by `Spliterator` classes to attempt to handle
      * this exception by reducing the size of attempts (the thing that `.next` is
      * using as an input), usually by splitting a batch in half by number of rows, or
      * splitting a collection of batches into smaller collections to be attempted separately,
      * likely reducing GPU memory that needs to be manifested while calling `.next`.
+     * @param isFromGpuOom true if the split happened because of a GPU OOM. Otherwise it was a
+     *                     CPU off-heap OOM.
      */
-    def split(): Unit
+    def split(isFromGpuOom: Boolean): Unit
 
     override def next(): K
 
@@ -367,8 +374,12 @@ object RmmRapidsRetryIterator extends Logging {
 
     override def hasNext: Boolean = !wasCalledSuccessfully
 
-    override def split(): Unit = {
-      throw new SplitAndRetryOOM("GPU OutOfMemory: could not split inputs and retry")
+    override def split(isFromGpuOom: Boolean): Unit = {
+      if (isFromGpuOom) {
+        throw new GpuSplitAndRetryOOM("GPU OutOfMemory: could not split inputs and retry")
+      } else {
+        throw new CpuSplitAndRetryOOM("CPU OutOfMemory: could not split inputs and retry")
+      }
     }
 
     override def next(): K = {
@@ -431,12 +442,16 @@ object RmmRapidsRetryIterator extends Logging {
 
     override def hasNext: Boolean = input.hasNext || attemptStack.nonEmpty
 
-    override def split(): Unit = {
+    override def split(isFromGpuOom: Boolean): Unit = {
       // If `split` OOMs, we are already the last thread standing
       // there is likely not much we can do, and for now we don't handle
       // this OOM
       if (splitPolicy == null) {
-        throw new SplitAndRetryOOM("GPU OutOfMemory: could not split inputs and retry")
+        if (isFromGpuOom) {
+          throw new GpuSplitAndRetryOOM("GPU OutOfMemory: could not split inputs and retry")
+        } else {
+          throw new CpuSplitAndRetryOOM("CPU OutOfMemory: could not split inputs and retry")
+        }
       }
       // splitPolicy must take ownership of the argument
       val splitted = splitPolicy(attemptStack.pop())
@@ -513,8 +528,10 @@ object RmmRapidsRetryIterator extends Logging {
    */
   class RmmRapidsRetryIterator[T, K](attemptIter: Spliterator[K])
       extends Iterator[K] {
+    // We want to be sure that retry will work in all cases
+    TaskRegistryTracker.registerThreadForRetry()
     // used to figure out if we should inject an OOM (only for tests)
-    private val config = new RapidsConf(SQLConf.get)
+    private val config = Option(SQLConf.get).map(new RapidsConf(_))
 
     // this is true if an OOM was injected (only for tests)
     private var injectedOOM = false
@@ -526,7 +543,8 @@ object RmmRapidsRetryIterator extends Logging {
     private def clearInjectedOOMIfNeeded(): Unit = {
       if (injectedOOM && !injectedOOMCleared) {
         val threadId = RmmSpark.getCurrentThreadId
-        // if for some reason we don't throw, or we throw something that isn't a RetryOOM
+        // if for some reason we don't throw, or we throw something that isn't a GpuRetryOOM
+        // or CpuRetryOOM
         // we want to remove the retry we registered before we leave the withRetry block.
         // If the thread is in an UNKNOWN state, then it is already cleared.
         if (RmmSpark.getStateOf(threadId) != RmmSparkThreadState.UNKNOWN) {
@@ -543,27 +561,33 @@ object RmmRapidsRetryIterator extends Logging {
       var firstAttempt: Boolean = true
       var result: Option[K] = None
       var doSplit = false
+      var isFromGpuOom = true
       while (result.isEmpty && attemptIter.hasNext) {
         if (!firstAttempt) {
           // call thread block API
           try {
             RmmSpark.blockThreadUntilReady()
           } catch {
-            case _: SplitAndRetryOOM => doSplit = true
+            case _: GpuSplitAndRetryOOM =>
+              doSplit = true
+              isFromGpuOom = true
+            case _: CpuSplitAndRetryOOM =>
+              doSplit = true
+              isFromGpuOom = false
           }
         }
         firstAttempt = false
         if (doSplit) {
-          attemptIter.split()
+          attemptIter.split(isFromGpuOom)
         }
         doSplit = false
         try {
           // call the user's function
-          if (config.testRetryOOMInjectionEnabled && !injectedOOM) {
+          if (config.exists(_.testRetryOOMInjectionEnabled) && !injectedOOM) {
             injectedOOM = true
             // ensure we have associated our thread with the running task, as
             // `forceRetryOOM` requires a prior association.
-            RmmSpark.associateCurrentThreadWithTask(TaskContext.get().taskAttemptId())
+            RmmSpark.currentThreadIsDedicatedToTask(TaskContext.get().taskAttemptId())
             RmmSpark.forceRetryOOM(RmmSpark.getCurrentThreadId)
           }
           result = Some(attemptIter.next())
@@ -571,15 +595,17 @@ object RmmRapidsRetryIterator extends Logging {
         } catch {
           case ex: Throwable =>
             // handle a retry as the top-level exception
-            val (topLevelIsRetry, topLevelIsSplit) = isRetryOrSplitAndRetry(ex)
+            val (topLevelIsRetry, topLevelIsSplit, isGpuOom) = isRetryOrSplitAndRetry(ex)
             doSplit = topLevelIsSplit
+            isFromGpuOom = isGpuOom
 
             // handle any retries that are wrapped in a different top-level exception
             var causedByRetry = false
             if (!topLevelIsRetry) {
-              val (cbRetry, cbSplit) = causedByRetryOrSplit(ex)
+              val (cbRetry, cbSplit, isGpuOom) = causedByRetryOrSplit(ex)
               causedByRetry = cbRetry
               doSplit = doSplit || cbSplit
+              isFromGpuOom = isGpuOom
             }
 
             clearInjectedOOMIfNeeded()
@@ -614,7 +640,8 @@ object RmmRapidsRetryIterator extends Logging {
   /**
    * Common split function from a single SpillableColumnarBatch to a sequence of them,
    * that tries to split the input into two chunks. If the input cannot be split in two,
-   * because we are down to 1 row, this function throws `SplitAndRetryOOM`.
+   * because we are down to 1 row, this function throws `GpuSplitAndRetryOOM` or
+   * `CpuSplitAndRetryOOM`.
    *
    * Note how this function closes the input `spillable` that is passed in.
    *
@@ -625,7 +652,7 @@ object RmmRapidsRetryIterator extends Logging {
       withResource(spillable) { _ =>
         val toSplitRows = spillable.numRows()
         if (toSplitRows <= 1) {
-          throw new SplitAndRetryOOM(
+          throw new GpuSplitAndRetryOOM(
             s"GPU OutOfMemory: a batch of $toSplitRows cannot be split!")
         }
         val (firstHalf, secondHalf) = withResource(spillable.getColumnarBatch()) { src =>
@@ -665,7 +692,7 @@ object RmmRapidsRetryIterator extends Logging {
       withResource(target) { _ =>
         val newTarget = target.targetSize / 2
         if (newTarget < target.minSize) {
-          throw new SplitAndRetryOOM(
+          throw new GpuSplitAndRetryOOM(
             s"GPU OutOfMemory: targetSize: ${target.targetSize} cannot be split further!" +
                 s" minimum: ${target.minSize}")
         }
@@ -677,9 +704,9 @@ object RmmRapidsRetryIterator extends Logging {
 /**
  * This is a wrapper that turns a target size into an autocloseable to allow it to be used
  * in withRetry blocks.  It is intended to be used to help with cases where the split calculation
- * happens inside the retry block, and depends on the target size.  On a SplitAndRetryOOM,
- * a split policy like `splitTargetSizeInHalf` can be used to retry the block with a smaller target
- * size.
+ * happens inside the retry block, and depends on the target size.  On a `GpuSplitAndRetryOOM` or
+ * `CpuSplitAndRetryOOM`, a split policy like `splitTargetSizeInHalf` can be used to retry the
+ * block with a smaller target size.
  */
 case class AutoCloseableTargetSize(targetSize: Long, minSize: Long) extends AutoCloseable {
   override def close(): Unit = ()
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TaskRegistryTracker.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TaskRegistryTracker.scala
new file mode 100644
index 00000000000..d768f727cec
--- /dev/null
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TaskRegistryTracker.scala
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids
+
+import scala.collection.mutable.ArrayBuffer
+
+import com.nvidia.spark.rapids.jni.RmmSpark
+import java.util
+
+import org.apache.spark.TaskContext
+
+/**
+ * This handles keeping track of task threads and registering them with RmmSpark
+ * as needed. This is here to provide an efficient and lazy way to make sure that
+ * we can use the Retry API behind the scenes without having to have callbacks
+ * whenever a task starts, or trying to inject code in all of the operators that
+ * would first start on the GPU.
+ */
+object TaskRegistryTracker {
+  private val taskToThread = new util.HashMap[Long, ArrayBuffer[Long]]()
+  private val registeredThreads = new util.HashSet[Long]()
+
+  /**
+   * Mark every tracked task as done, which empties the registry. This is used for tests.
+   */
+  def clearRegistry(): Unit = synchronized {
+    val copied = new java.util.HashSet(taskToThread.keySet())
+    copied.forEach { taskId =>
+      taskIsDone(taskId)
+    }
+  }
+
+  private def taskIsDone(taskId: Long): Unit = synchronized {
+    val threads = taskToThread.remove(taskId)
+    if (threads != null) {
+      threads.foreach(registeredThreads.remove)
+      RmmSpark.taskDone(taskId)
+    }
+  }
+
+  def registerThreadForRetry(): Unit = synchronized {
+    val tc = TaskContext.get()
+    if (tc != null) {
+      // If we don't have a TaskContext we are either in a test or in some other thread.
+      // If it is some other thread, then it is responsible to make sure things are
+      // registered properly itself. If it is a test, well you need to update your
+      // test code to make this work properly.
+      val threadId = RmmSpark.getCurrentThreadId
+      val taskId = tc.taskAttemptId()
+      if (registeredThreads.add(threadId)) {
+        RmmSpark.currentThreadIsDedicatedToTask(taskId)
+        if (!taskToThread.containsKey(taskId)) {
+          taskToThread.put(taskId, ArrayBuffer(threadId))
+          ScalableTaskCompletion.onTaskCompletion(tc) {
+            taskIsDone(taskId)
+          }
+        } else {
+          taskToThread.get(taskId) += threadId
+        }
+      }
+    }
+  }
+}
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/basicPhysicalOperators.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/basicPhysicalOperators.scala
index c7cb9e88b52..0852fb77ccc 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/basicPhysicalOperators.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/basicPhysicalOperators.scala
@@ -26,7 +26,7 @@ import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.GpuMetric._
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
 import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRestoreOnRetry, withRetry, withRetryNoSplit}
-import com.nvidia.spark.rapids.jni.SplitAndRetryOOM
+import com.nvidia.spark.rapids.jni.GpuSplitAndRetryOOM
 import com.nvidia.spark.rapids.shims._
 
 import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}
@@ -1097,7 +1097,7 @@ private[rapids] class GpuRangeIterator(
     (rowsNumber) => {
       withResource(rowsNumber) { _ =>
         if (rowsNumber.value < 10) {
-          throw new SplitAndRetryOOM(s"GPU OutOfMemory: the number of rows generated is" +
+          throw new GpuSplitAndRetryOOM(s"GPU OutOfMemory: the number of rows generated is" +
             s" too small to be split ${rowsNumber.value}!")
         }
         Seq(AutoCloseableLong(rowsNumber.value / 2))
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/BufferReceiveState.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/BufferReceiveState.scala
index c5fba7cfc6a..6b887fb1765 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/BufferReceiveState.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/BufferReceiveState.scala
@@ -23,6 +23,7 @@ import scala.collection.mutable.ArrayBuffer
 import ai.rapids.cudf.{BaseDeviceMemoryBuffer, Cuda, DeviceMemoryBuffer, NvtxColor, NvtxRange, Rmm}
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.format.TableMeta
+import com.nvidia.spark.rapids.jni.RmmSpark
 
 import org.apache.spark.internal.Logging
 
@@ -177,7 +178,7 @@ class BufferReceiveState(
       closeOnExcept(new ArrayBuffer[DeviceMemoryBuffer]()) { toClose =>
         val results = currentBlocks.flatMap { b =>
           val pendingTransferRequest = b.block.request
-
+          RmmSpark.shuffleThreadWorkingOnTasks(pendingTransferRequest.handler.getTaskIds)
           val fullSize = pendingTransferRequest.tableMeta.bufferMeta().size()
 
           var contigBuffer: DeviceMemoryBuffer = null
@@ -233,6 +234,10 @@ class BufferReceiveState(
         // unless all that data has truly moved to our final buffer in our stream
         stream.sync()
 
+        results.foreach { result =>
+          RmmSpark.poolThreadFinishedForTasks(result.handler.getTaskIds)
+        }
+
         // cpu is in sync, we can recycle the bounce buffer
         if (!toFinalize.isEmpty) {
           val firstCb = toFinalize.pop()
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleClient.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleClient.scala
index 0e6e27bef40..b73f9820bad 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleClient.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleClient.scala
@@ -56,6 +56,11 @@ trait RapidsShuffleFetchHandler {
    * @param errorMessage - a string containing an error message
    */
   def transferError(errorMessage: String, throwable: Throwable = null): Unit
+
+  /**
+   * Called to get the task attempt ids that this shuffle handler is for.
+   */
+  def getTaskIds: Array[Long]
 }
 
 /**
diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIterator.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIterator.scala
index a0baa928d42..185310e35ed 100644
--- a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIterator.scala
+++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIterator.scala
@@ -77,6 +77,7 @@ class RapidsShuffleIterator(
     blocksByAddress: Array[(BlockManagerId, Seq[(BlockId, Long, Int)])],
     metricsUpdater: ShuffleMetricsUpdater,
     sparkTypes: Array[DataType],
+    taskAttemptId: Long,
     catalog: ShuffleReceivedBufferCatalog = GpuShuffleEnv.getReceivedCatalog,
     timeoutSeconds: Long = GpuShuffleEnv.shuffleFetchTimeoutSeconds)
   extends Iterator[ColumnarBatch]
@@ -224,7 +225,11 @@ class RapidsShuffleIterator(
           private[this] var clientExpectedBatches = 0L
           private[this] var clientResolvedBatches = 0L
 
-          def start(expectedBatches: Int): Unit = resolvedBatches.synchronized {
+          private[this] val taskIds = Array[Long](taskAttemptId)
+
+          override def getTaskIds: Array[Long] = taskIds
+
+          override def start(expectedBatches: Int): Unit = resolvedBatches.synchronized {
             if (expectedBatches == 0) {
               throw new IllegalStateException(
                 s"Received an invalid response from shuffle server: " +
@@ -234,7 +239,7 @@ class RapidsShuffleIterator(
             batchesInFlight = batchesInFlight + expectedBatches
             totalBatchesExpected = totalBatchesExpected + expectedBatches
             clientExpectedBatches = expectedBatches
-            logDebug(s"Task: $taskAttemptId Client $blockManagerId " +
+            logDebug(s"Task: $taskAttemptIdStr Client $blockManagerId " +
                 s"Expecting $expectedBatches batches, $batchesInFlight batches currently in " +
                 s"flight, total expected by this client: $clientExpectedBatches, total " +
                 s"resolved by this client: $clientResolvedBatches")
@@ -243,7 +248,7 @@ class RapidsShuffleIterator(
           def clientDone: Boolean = clientExpectedBatches > 0 &&
             clientExpectedBatches == clientResolvedBatches
 
-          def batchReceived(handle: RapidsBufferHandle): Boolean = {
+          override def batchReceived(handle: RapidsBufferHandle): Boolean = {
             resolvedBatches.synchronized {
               if (taskComplete) {
                 false
@@ -259,11 +264,11 @@ class RapidsShuffleIterator(
                   resolvedBatches.offer(BufferReceived(handle))
 
                   if (clientDone) {
-                    logDebug(s"Task: $taskAttemptId Client $blockManagerId is " +
+                    logDebug(s"Task: $taskAttemptIdStr Client $blockManagerId is " +
                         s"done fetching batches. Total batches expected $clientExpectedBatches, " +
                         s"total batches resolved $clientResolvedBatches.")
                   } else {
-                    logDebug(s"Task: $taskAttemptId Client $blockManagerId is " +
+                    logDebug(s"Task: $taskAttemptIdStr Client $blockManagerId is " +
                         s"NOT done fetching batches. Total batches expected " +
                         s"$clientExpectedBatches, total batches resolved $clientResolvedBatches.")
                   }
@@ -305,7 +310,7 @@ class RapidsShuffleIterator(
   private[this] def receiveBufferCleaner(): Unit = resolvedBatches.synchronized {
     taskComplete = true
     if (hasNext) {
-      logWarning(s"Iterator for task ${taskAttemptId} closing, " +
+      logWarning(s"Iterator for task ${taskAttemptIdStr} closing, " +
           s"but it is not done. Closing ${resolvedBatches.size()} resolved batches!!")
       resolvedBatches.forEach {
         case BufferReceived(handle) =>
@@ -323,10 +328,8 @@ class RapidsShuffleIterator(
     }
   }
 
-  // Used to print log messages, defaulting to a value for unit tests
-  private[this] lazy val taskAttemptId: String =
-    taskContext.map(_.taskAttemptId().toString)
-        .getOrElse("testTaskAttempt")
+  // Used to print log messages
+  private[this] lazy val taskAttemptIdStr: String = taskAttemptId.toString
 
   private[this] val taskContext: Option[TaskContext] = Option(TaskContext.get())
 
@@ -356,8 +359,7 @@ class RapidsShuffleIterator(
     // fetches and so it could produce device memory. Note this is not allowing for some external
     // thread to schedule the fetches for us, it may be something we consider in the future, given
     // memory pressure.
-    // No good way to get a metric in here for semaphore time.
-    taskContext.foreach(GpuSemaphore.acquireIfNecessary(_))
+    taskContext.foreach(GpuSemaphore.acquireIfNecessary)
 
     if (!started) {
       // kick off if we haven't already
@@ -367,11 +369,11 @@ class RapidsShuffleIterator(
 
     val blockedStart = System.currentTimeMillis()
     var result: Option[ShuffleClientResult] = None
-    RmmSpark.threadCouldBlockOnShuffle()
+    RmmSpark.waitingOnPool()
     try {
       result = pollForResult(timeoutSeconds)
     } finally {
-      RmmSpark.threadDoneWithShuffle()
+      RmmSpark.doneWaitingOnPool()
     }
     val blockedTime = System.currentTimeMillis() - blockedStart
 
diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/RapidsCachingReader.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/RapidsCachingReader.scala
index 0b118532a28..f9e9119aae8 100644
--- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/RapidsCachingReader.scala
+++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/RapidsCachingReader.scala
@@ -176,7 +176,7 @@ class RapidsCachingReader[K, C](
 
         val cbArrayFromUcx: Iterator[(K, C)] = if (blocksForRapidsTransport.nonEmpty) {
           val rapidsShuffleIterator = new RapidsShuffleIterator(localId, rapidsConf, transport.get,
-            blocksForRapidsTransport.toArray, metricsUpdater, sparkTypes)
+            blocksForRapidsTransport.toArray, metricsUpdater, sparkTypes, context.taskAttemptId())
           rapidsShuffleIterator.map(cb => {
             (0, cb)
           }).asInstanceOf[Iterator[(K, C)]]
diff --git a/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIterator.scala b/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIterator.scala
index 4b5d58354ce..39613e9e99c 100644
--- a/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIterator.scala
+++ b/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIterator.scala
@@ -64,6 +64,7 @@ class RapidsShuffleIterator(
     blocksByAddress: Array[(BlockManagerId, collection.Seq[(BlockId, Long, Int)])],
     metricsUpdater: ShuffleMetricsUpdater,
     sparkTypes: Array[DataType],
+    taskAttemptId: Long,
     catalog: ShuffleReceivedBufferCatalog = GpuShuffleEnv.getReceivedCatalog,
     timeoutSeconds: Long = GpuShuffleEnv.shuffleFetchTimeoutSeconds)
   extends Iterator[ColumnarBatch]
@@ -211,6 +212,10 @@ class RapidsShuffleIterator(
           private[this] var clientExpectedBatches = 0L
           private[this] var clientResolvedBatches = 0L
 
+          private[this] val taskIds = Array[Long](taskAttemptId)
+
+          override def getTaskIds: Array[Long] = taskIds
+
           def start(expectedBatches: Int): Unit = resolvedBatches.synchronized {
             if (expectedBatches == 0) {
               throw new IllegalStateException(
@@ -221,7 +226,7 @@ class RapidsShuffleIterator(
             batchesInFlight = batchesInFlight + expectedBatches
             totalBatchesExpected = totalBatchesExpected + expectedBatches
             clientExpectedBatches = expectedBatches
-            logDebug(s"Task: $taskAttemptId Client $blockManagerId " +
+            logDebug(s"Task: $taskAttemptIdStr Client $blockManagerId " +
                 s"Expecting $expectedBatches batches, $batchesInFlight batches currently in " +
                 s"flight, total expected by this client: $clientExpectedBatches, total " +
                 s"resolved by this client: $clientResolvedBatches")
@@ -246,11 +251,11 @@ class RapidsShuffleIterator(
                   resolvedBatches.offer(BufferReceived(handle))
 
                   if (clientDone) {
-                    logDebug(s"Task: $taskAttemptId Client $blockManagerId is " +
+                    logDebug(s"Task: $taskAttemptIdStr Client $blockManagerId is " +
                         s"done fetching batches. Total batches expected $clientExpectedBatches, " +
                         s"total batches resolved $clientResolvedBatches.")
                   } else {
-                    logDebug(s"Task: $taskAttemptId Client $blockManagerId is " +
+                    logDebug(s"Task: $taskAttemptIdStr Client $blockManagerId is " +
                         s"NOT done fetching batches. Total batches expected " +
                         s"$clientExpectedBatches, total batches resolved $clientResolvedBatches.")
                   }
@@ -292,7 +297,7 @@ class RapidsShuffleIterator(
   private[this] def receiveBufferCleaner(): Unit = resolvedBatches.synchronized {
     taskComplete = true
     if (hasNext) {
-      logWarning(s"Iterator for task ${taskAttemptId} closing, " +
+      logWarning(s"Iterator for task ${taskAttemptIdStr} closing, " +
           s"but it is not done. Closing ${resolvedBatches.size()} resolved batches!!")
       resolvedBatches.forEach {
         case BufferReceived(handle) =>
@@ -310,10 +315,8 @@ class RapidsShuffleIterator(
     }
   }
 
-  // Used to print log messages, defaulting to a value for unit tests
-  private[this] lazy val taskAttemptId: String =
-    taskContext.map(_.taskAttemptId().toString)
-        .getOrElse("testTaskAttempt")
+  // Used to print log messages
+  private[this] lazy val taskAttemptIdStr: String = taskAttemptId.toString
 
   private[this] val taskContext: Option[TaskContext] = Option(TaskContext.get())
 
@@ -354,11 +357,11 @@ class RapidsShuffleIterator(
 
     val blockedStart = System.currentTimeMillis()
     var result: Option[ShuffleClientResult] = None
-    RmmSpark.threadCouldBlockOnShuffle()
+    RmmSpark.waitingOnPool()
     try {
       result = pollForResult(timeoutSeconds)
     } finally {
-      RmmSpark.threadDoneWithShuffle()
+      RmmSpark.doneWaitingOnPool()
     }
     val blockedTime = System.currentTimeMillis() - blockedStart
 
diff --git a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/RapidsCachingReader.scala b/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/RapidsCachingReader.scala
index 953e3a3ff0b..f0325b7d36d 100644
--- a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/RapidsCachingReader.scala
+++ b/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/RapidsCachingReader.scala
@@ -165,7 +165,7 @@ class RapidsCachingReader[K, C](
 
         val cbArrayFromUcx: Iterator[(K, C)] = if (blocksForRapidsTransport.nonEmpty) {
           val rapidsShuffleIterator = new RapidsShuffleIterator(localId, rapidsConf, transport.get,
-            blocksForRapidsTransport.toArray, metricsUpdater, sparkTypes)
+            blocksForRapidsTransport.toArray, metricsUpdater, sparkTypes, context.taskAttemptId())
           rapidsShuffleIterator.map(cb => {
             (0, cb)
           }).asInstanceOf[Iterator[(K, C)]]
diff --git a/sql-plugin/src/test/scala/com/nvidia/spark/rapids/HostAllocSuite.scala b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/HostAllocSuite.scala
index 5e73221d546..d6d6fa85073 100644
--- a/sql-plugin/src/test/scala/com/nvidia/spark/rapids/HostAllocSuite.scala
+++ b/sql-plugin/src/test/scala/com/nvidia/spark/rapids/HostAllocSuite.scala
@@ -18,8 +18,9 @@ package com.nvidia.spark.rapids
 
 import java.util.concurrent.{ExecutionException, Future, LinkedBlockingQueue, TimeoutException, TimeUnit}
 
-import ai.rapids.cudf.{HostMemoryBuffer, HostMemoryReservation, PinnedMemoryPool, Rmm, RmmAllocationMode}
+import ai.rapids.cudf.{HostMemoryBuffer, PinnedMemoryPool, Rmm, RmmAllocationMode}
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
+import com.nvidia.spark.rapids.jni.{RmmSpark, RmmSparkThreadState}
 import org.mockito.Mockito.when
 import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}
 import org.scalatest.concurrent.{Signaler, TimeLimits}
@@ -29,10 +30,13 @@ import org.scalatestplus.mockito.MockitoSugar.mock
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.rapids.execution.TrampolineUtil
 
 class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
     BeforeAndAfterAll with TimeLimits {
+  private val sqlConf = new SQLConf()
+  private val rc = new RapidsConf(sqlConf)
 
   def setMockContext(taskAttemptId: Long): Unit = {
     val context = mock[TaskContext]
@@ -114,16 +118,13 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
 
   class TaskThread(private val name: String, private val taskId: Long) extends Thread(name) {
     private val queue = new LinkedBlockingQueue[TaskThreadOp[_]]()
-    private var inDoIt: Boolean = false
+    private var nativeThreadId = -1L
 
     def initialize(): Unit = {
       setDaemon(true)
       start()
       val waitForStart = doIt(new TaskThreadOp[Void]() {
-        override def doIt(): Void = {
-          setMockContext(taskId)
-          null
-        }
+        override def doIt(): Void = null
 
         override def toString: String = s"INIT TASK $name TASK $taskId"
       })
@@ -138,29 +139,26 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
 
     def waitForBlockedOnAlloc(): Unit = {
       val start = System.nanoTime()
-      var (state, inDo) = synchronized {
-        (getState, inDoIt)
-      }
-      while (!isBlockedState(state) && inDo) {
+      var state = RmmSpark.getStateOf(nativeThreadId)
+      while (!isBlockedState(state)) {
         val end = System.nanoTime()
         if (TimeUnit.SECONDS.toNanos(1) <= (end - start)) {
           throw new TimeoutException(s"$name in $state after ${end - start} ns")
         }
         Thread.sleep(10)
         synchronized {
-          state = getState
-          inDo = inDoIt
+          state = RmmSpark.getStateOf(nativeThreadId)
         }
       }
     }
 
-    private def isBlockedState(state: Thread.State): Boolean = state match {
-      case Thread.State.BLOCKED | Thread.State.WAITING | Thread.State.TIMED_WAITING => true
+    private def isBlockedState(state: RmmSparkThreadState): Boolean = state match {
+      case RmmSparkThreadState.THREAD_BUFN | RmmSparkThreadState.THREAD_BLOCKED => true
       case _ => false
     }
 
     def isBlocked: Boolean = synchronized {
-      isBlockedState(getState) && inDoIt
+      isBlockedState(RmmSpark.getStateOf(nativeThreadId))
     }
 
     def doIt[T](op: TaskThreadOp[T]): Future[T] = {
@@ -172,22 +170,26 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
 
     override def run(): Unit = {
       try {
-        while (true) {
-          val op = queue.poll(1000, TimeUnit.MILLISECONDS)
-          if (op.isInstanceOf[TaskThread.TaskThreadDoneOp]) return
-          // null is returned from the queue on a timeout
-          if (op != null) {
-            synchronized {
-              inDoIt = true
-            }
-            try {
-              op.doIt()
-            } finally {
-              synchronized {
-                inDoIt = false
+        this.nativeThreadId = RmmSpark.getCurrentThreadId
+        setMockContext(taskId)
+        RmmSpark.currentThreadIsDedicatedToTask(taskId)
+        try {
+          // Without this the retry does not work properly
+          SQLConf.withExistingConf(sqlConf) {
+            var isDone = false
+            while (!isDone) {
+              val op = queue.poll()
+              if (op.isInstanceOf[TaskThread.TaskThreadDoneOp]) {
+                isDone = true
+              } else if (op != null) {
+                op.doIt()
+              } else {
+                Thread.`yield`()
               }
             }
           }
+        } finally {
+          RmmSpark.removeCurrentDedicatedThreadAssociation(taskId)
         }
       } catch {
         case t: Throwable =>
@@ -262,11 +264,13 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
     }
 
     private def doAlloc(): Void = {
-      val tmp = HostAlloc.alloc(size, preferPinned)
-      synchronized {
-        closeOnExcept(tmp) { _ =>
-          assert(b.isEmpty)
-          b = Option(tmp)
+      RmmRapidsRetryIterator.withRetryNoSplit {
+        val tmp = HostAlloc.alloc(size, preferPinned)
+        synchronized {
+          closeOnExcept(tmp) { _ =>
+            assert(b.isEmpty)
+            b = Option(tmp)
+          }
         }
       }
       null
@@ -292,66 +296,6 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
     }
   }
 
-  class ReserveOnAnotherThread(val thread: TaskThread,
-      val size: Long,
-      val preferPinned: Boolean = true) extends AutoCloseable {
-    var b: Option[HostMemoryReservation] = None
-    val fb: Future[Void] = thread.doIt(new TaskThreadOp[Void] {
-      override def doIt(): Void = {
-        doReservation()
-        null
-      }
-
-      override def toString: String = "RESERVE(" + size + ")"
-    })
-    var fc: Option[Future[Void]] = None
-
-    def waitForReservation(): HostMemoryReservation = {
-      fb.get(1000, TimeUnit.MILLISECONDS)
-      getReservation()
-    }
-
-    def getReservation(): HostMemoryReservation = synchronized {
-      b.getOrElse {
-        throw new IllegalStateException("No reservation was found")
-      }
-    }
-
-    def closeOnThread(): Unit = {
-      if (fc.isDefined) throw new IllegalStateException("free called multiple times")
-      fc = Option(thread.doIt(new TaskThreadOp[Void]() {
-        override def doIt(): Void = {
-          close()
-          null
-        }
-
-        override def toString: String = "CLOSE(" + size + ")"
-      }))
-    }
-
-    def waitForClose(): Unit = {
-      if (fc.isEmpty) closeOnThread()
-      fc.get.get(1000, TimeUnit.MILLISECONDS)
-    }
-
-    def closeAndWait(): Unit = {
-      waitForClose()
-    }
-
-    private def doReservation(): Void = {
-      val tmp = HostAlloc.reserve(size, preferPinned)
-      synchronized {
-        b = Option(tmp)
-      }
-      null
-    }
-
-    override def close(): Unit = synchronized {
-      b.foreach(_.close())
-      b = None
-    }
-  }
-
   object MyThreadSignaler extends Signaler {
     override def apply(testThread: Thread): Unit = {
       System.err.println("\n\n\t\tTEST THREAD APPEARS TO BE STUCK")
@@ -381,7 +325,6 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
     Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 512 * 1024 * 1024)
     PinnedMemoryPool.shutdown()
     HostAlloc.initialize(-1)
-    val rc = new RapidsConf(Map.empty[String, String])
     RapidsBufferCatalog.init(rc)
   }
 
@@ -416,10 +359,8 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
           assert(got2.isEmpty)
         }
       }
-
-      assertThrows[IllegalArgumentException] {
-        withResource(HostAlloc.tryAlloc(4 * 1024 + 1)) { _ =>
-        }
+      withResource(HostAlloc.tryAlloc(4 * 1024 + 1)) { buffer =>
+        assert(buffer.isEmpty)
       }
     }
   }
@@ -445,9 +386,8 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
         }
       }
 
-      assertThrows[IllegalArgumentException] {
-        withResource(HostAlloc.tryAlloc(4 * 1024 + 1)) { _ =>
-        }
+      withResource(HostAlloc.tryAlloc(4 * 1024 + 1)) { buffer =>
+        assert(buffer.isEmpty)
       }
     }
   }
@@ -478,9 +418,8 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
         }
       }
 
-      assertThrows[IllegalArgumentException] {
-        withResource(HostAlloc.tryAlloc(4 * 1024 + 1)) { _ =>
-        }
+      withResource(HostAlloc.tryAlloc(4 * 1024 + 1)) { buffer =>
+        assert(buffer.isEmpty)
       }
     }
   }
@@ -717,175 +656,4 @@ class HostAllocSuite extends AnyFunSuite with BeforeAndAfterEach with
       }
     }
   }
-
-  test("simple pinned reservation") {
-    PinnedMemoryPool.initialize(4 * 1024)
-    HostAlloc.initialize(0)
-
-    failAfter(Span(10, Seconds)) {
-      val thread1 = new TaskThread("thread1", 1)
-      thread1.initialize()
-      val thread2 = new TaskThread("thread2", 2)
-      thread2.initialize()
-
-      try {
-        withResource(new ReserveOnAnotherThread(thread1, 1024, preferPinned = false)) { a =>
-          val ra = a.waitForReservation()
-          // reservations should be non-blocking
-          withResource(ra.allocate(1024)) { _ =>
-            // The size matches what we expected
-          }
-          a.closeAndWait()
-        }
-
-        withResource(new ReserveOnAnotherThread(thread1, 4 * 1024)) { a =>
-          val ra = a.waitForReservation()
-          withResource(ra.allocate(1024)) { _ =>
-            withResource(ra.allocate(1024)) { _ =>
-              withResource(ra.allocate(1024)) { _ =>
-                withResource(ra.allocate(1024)) { _ =>
-                  assertThrows[OutOfMemoryError] {
-                    withResource(ra.allocate(1)) { _ =>
-
-                    }
-                  }
-                }
-              }
-            }
-          }
-
-          withResource(new ReserveOnAnotherThread(thread2, 1)) { a2 =>
-            // We ran out of memory, and this should have blocked
-            thread2.waitForBlockedOnAlloc()
-
-            a.closeOnThread()
-            val ra2 = a2.waitForReservation()
-            withResource(ra2.allocate(1)) { _ =>
-              // NOOP
-            }
-            a2.closeAndWait()
-          }
-        }
-      } finally {
-        thread1.done.get(1, TimeUnit.SECONDS)
-        thread2.done.get(1, TimeUnit.SECONDS)
-      }
-    }
-  }
-
-  test("simple non-pinned reservation") {
-    PinnedMemoryPool.initialize(0)
-    HostAlloc.initialize(4 * 1024)
-
-    failAfter(Span(10, Seconds)) {
-      val thread1 = new TaskThread("thread1", 1)
-      thread1.initialize()
-      val thread2 = new TaskThread("thread2", 2)
-      thread2.initialize()
-
-      try {
-        withResource(new ReserveOnAnotherThread(thread1, 1024, preferPinned = false)) { a =>
-          val ra = a.waitForReservation()
-          // reservations should be non-blocking
-          withResource(ra.allocate(1024)) { _ =>
-            // The size matches what we expected
-          }
-          a.closeAndWait()
-        }
-
-        withResource(new ReserveOnAnotherThread(thread1, 4 * 1024)) { a =>
-          val ra = a.waitForReservation()
-          withResource(ra.allocate(1024)) { _ =>
-            withResource(ra.allocate(1024)) { _ =>
-              withResource(ra.allocate(1024)) { _ =>
-                withResource(ra.allocate(1024)) { _ =>
-                  assertThrows[OutOfMemoryError] {
-                    withResource(ra.allocate(1)) { _ =>
-
-                    }
-                  }
-                }
-              }
-            }
-          }
-
-          withResource(new ReserveOnAnotherThread(thread2, 1)) { a2 =>
-            // We ran out of memory, and this should have blocked
-            thread2.waitForBlockedOnAlloc()
-
-            a.closeOnThread()
-            val ra2 = a2.waitForReservation()
-            withResource(ra2.allocate(1)) { _ =>
-              // NOOP
-            }
-            a2.closeAndWait()
-          }
-        }
-      } finally {
-        thread1.done.get(1, TimeUnit.SECONDS)
-        thread2.done.get(1, TimeUnit.SECONDS)
-      }
-    }
-  }
-
-
-  test("simple mixed reservation") {
-    PinnedMemoryPool.initialize(4 * 1024)
-    HostAlloc.initialize(4 * 1024)
-
-    failAfter(Span(10, Seconds)) {
-      val thread1 = new TaskThread("thread1", 1)
-      thread1.initialize()
-      val thread2 = new TaskThread("thread2", 2)
-      thread2.initialize()
-      val thread3 = new TaskThread("thread3", 3)
-      thread3.initialize()
-
-      try {
-        withResource(new ReserveOnAnotherThread(thread1, 1024, preferPinned = false)) { a =>
-          val ra = a.waitForReservation()
-          // reservations should be non-blocking
-          withResource(ra.allocate(1024)) { _ =>
-            // The size matches what we expected
-          }
-          a.closeAndWait()
-        }
-
-        withResource(new ReserveOnAnotherThread(thread1, 4 * 1024)) { a =>
-          val ra = a.waitForReservation()
-          withResource(ra.allocate(1024)) { _ =>
-            withResource(ra.allocate(1024)) { _ =>
-              withResource(ra.allocate(1024)) { _ =>
-                withResource(ra.allocate(1024)) { _ =>
-                }
-              }
-            }
-          }
-
-          withResource(new ReserveOnAnotherThread(thread2, 4096)) { a2 =>
-            val ra2 = a2.waitForReservation()
-            withResource(ra2.allocate(1)) { _ =>
-              // NOOP
-            }
-
-            withResource(new AllocOnAnotherThread(thread3, 1024)) { a3 =>
-              // We ran out of memory, and this should have blocked
-              thread2.waitForBlockedOnAlloc()
-
-              a.closeOnThread()
-
-              a3.waitForAlloc()
-              a3.assertAllocSize(1024)
-              a3.freeAndWait()
-            }
-            a2.closeAndWait()
-          }
-        }
-      } finally {
-        thread1.done.get(1, TimeUnit.SECONDS)
-        thread2.done.get(1, TimeUnit.SECONDS)
-        thread3.done.get(1, TimeUnit.SECONDS)
-      }
-    }
-  }
 }
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/BatchWithPartitionDataSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/BatchWithPartitionDataSuite.scala
index 6c9f59e8ece..24a252a4e68 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/BatchWithPartitionDataSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/BatchWithPartitionDataSuite.scala
@@ -18,7 +18,7 @@ package com.nvidia.spark.rapids
 
 import ai.rapids.cudf.ColumnVector
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
-import com.nvidia.spark.rapids.jni.{RmmSpark, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{GpuSplitAndRetryOOM, RmmSpark}
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
@@ -52,7 +52,7 @@ class BatchWithPartitionDataSuite extends RmmSparkRetrySuiteBase with SparkQuery
   }
 
   test("test adding partition values to batch with OOM split and retry - unhandled") {
-    // This test uses single-row partition values that should throw a SplitAndRetryOOM exception
+    // This test uses single-row partition values that should throw a GpuSplitAndRetryOOM exception
     // when a retry is forced.
     val maxGpuColumnSizeBytes = 1000L
     withGpuSparkSession(_ => {
@@ -62,7 +62,7 @@ class BatchWithPartitionDataSuite extends RmmSparkRetrySuiteBase with SparkQuery
           Array(1), partValues.take(1), partSchema, maxGpuColumnSizeBytes)
         RmmSpark.forceSplitAndRetryOOM(RmmSpark.getCurrentThreadId)
         withResource(resultBatchIter) { _ =>
-          assertThrows[SplitAndRetryOOM] {
+          assertThrows[GpuSplitAndRetryOOM] {
             resultBatchIter.next()
           }
         }
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/GeneratedInternalRowToCudfRowIteratorRetrySuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/GeneratedInternalRowToCudfRowIteratorRetrySuite.scala
index 2080d79b3ca..c74739edfd8 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/GeneratedInternalRowToCudfRowIteratorRetrySuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/GeneratedInternalRowToCudfRowIteratorRetrySuite.scala
@@ -18,7 +18,7 @@ package com.nvidia.spark.rapids
 
 import ai.rapids.cudf.Table
 import com.nvidia.spark.rapids.Arm.withResource
-import com.nvidia.spark.rapids.jni.{RmmSpark, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{GpuSplitAndRetryOOM, RmmSpark}
 import org.mockito.ArgumentMatchers.any
 import org.mockito.Mockito.{doAnswer, spy, times, verify}
 import org.mockito.invocation.InvocationOnMock
@@ -164,7 +164,7 @@ class GeneratedInternalRowToCudfRowIteratorRetrySuite
         ctriter, schema, TargetSize(1),
         NoopMetric, NoopMetric, NoopMetric, NoopMetric, NoopMetric)
       RmmSpark.forceSplitAndRetryOOM(RmmSpark.getCurrentThreadId)
-      assertThrows[SplitAndRetryOOM] {
+      assertThrows[GpuSplitAndRetryOOM] {
         myIter.next()
       }
       assertResult(0)(RapidsBufferCatalog.getDeviceStorage.currentSize)
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesRetrySuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesRetrySuite.scala
index 51390189fb1..7db437be6d6 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesRetrySuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesRetrySuite.scala
@@ -20,7 +20,7 @@ import scala.collection.mutable.ArrayBuffer
 
 import ai.rapids.cudf.Table
 import com.nvidia.spark.rapids.Arm.withResource
-import com.nvidia.spark.rapids.jni.{RmmSpark, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{GpuSplitAndRetryOOM, RmmSpark}
 import org.mockito.Mockito._
 import org.scalatestplus.mockito.MockitoSugar
 
@@ -133,7 +133,7 @@ class GpuCoalesceBatchesRetrySuite
     allBatches.foreach(_.close())
   }
 
-  test("coalesce gpu batches splits in half with SplitAndRetryOOM") {
+  test("coalesce gpu batches splits in half with GpuSplitAndRetryOOM") {
     val iters = getIters(injectSplitAndRetry = 1)
     iters.foreach { iter =>
       withResource(iter.next()) { coalesced =>
@@ -148,7 +148,7 @@ class GpuCoalesceBatchesRetrySuite
     }
   }
 
-  test("coalesce gpu batches splits in quarters with SplitAndRetryOOM") {
+  test("coalesce gpu batches splits in quarters with GpuSplitAndRetryOOM") {
     val iters = getIters(injectSplitAndRetry = 2)
     iters.foreach { iter =>
       withResource(iter.next()) { coalesced =>
@@ -170,7 +170,7 @@ class GpuCoalesceBatchesRetrySuite
   test("coalesce gpu batches fails with OOM if it cannot split enough") {
     val iters = getIters(mockInjectSplitAndRetry = true)
     iters.foreach { iter =>
-      assertThrows[SplitAndRetryOOM] {
+      assertThrows[GpuSplitAndRetryOOM] {
         iter.next() // throws
       }
       val batches = iter.asInstanceOf[CoalesceIteratorMocks].getBatches()
@@ -190,10 +190,10 @@ class GpuCoalesceBatchesRetrySuite
     }
   }
 
-  test("coalesce gpu batches throws if SplitAndRetryOOM with non-splittable goal") {
+  test("coalesce gpu batches throws if GpuSplitAndRetryOOM with non-splittable goal") {
     val iters = getIters(injectSplitAndRetry = 1, goal = RequireSingleBatch)
     iters.foreach { iter =>
-      assertThrows[SplitAndRetryOOM] {
+      assertThrows[GpuSplitAndRetryOOM] {
         iter.next()
       }
       val batches = iter.asInstanceOf[CoalesceIteratorMocks].getBatches()
@@ -209,7 +209,7 @@ class GpuCoalesceBatchesRetrySuite
     override def numRows(): Int = 0
     override def setSpillPriority(priority: Long): Unit = {}
     override def getColumnarBatch(): ColumnarBatch = {
-      throw new SplitAndRetryOOM()
+      throw new GpuSplitAndRetryOOM()
     }
     override def sizeInBytes: Long = 0
     override def dataTypes: Array[DataType] = Array.empty
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/GpuSortRetrySuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/GpuSortRetrySuite.scala
index 8e9285d670f..257f8549b0d 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/GpuSortRetrySuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/GpuSortRetrySuite.scala
@@ -18,7 +18,7 @@ package com.nvidia.spark.rapids
 
 import ai.rapids.cudf.ColumnVector
 import com.nvidia.spark.rapids.Arm.withResource
-import com.nvidia.spark.rapids.jni.{RetryOOM, RmmSpark, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{GpuRetryOOM, GpuSplitAndRetryOOM, RmmSpark}
 import org.scalatestplus.mockito.MockitoSugar
 
 import org.apache.spark.sql.catalyst.expressions.{Ascending, AttributeReference, ExprId, SortOrder}
@@ -53,12 +53,12 @@ class GpuSortRetrySuite extends RmmSparkRetrySuiteBase with MockitoSugar {
     }
   }
 
-  test("GPU out-of-core sort with retry when first-pass-sort RetryOOM") {
+  test("GPU out-of-core sort with retry when first-pass-sort GpuRetryOOM") {
     val outCoreIter = new GpuOutOfCoreSortIteratorThatThrows(
       Iterator(buildBatch),
       gpuSorter,
       targetSize = 1024,
-      firstPassSortExp = new RetryOOM())
+      firstPassSortExp = new GpuRetryOOM())
     withResource(outCoreIter) { _ =>
       withResource(outCoreIter.next()) { cb =>
         // only one batch
@@ -68,12 +68,12 @@ class GpuSortRetrySuite extends RmmSparkRetrySuiteBase with MockitoSugar {
     }
   }
 
-  test("GPU out-of-core sort with retry when first-pass-sort SplitAndRetryOOM") {
+  test("GPU out-of-core sort with retry when first-pass-sort GpuSplitAndRetryOOM") {
     val outCoreIter = new GpuOutOfCoreSortIteratorThatThrows(
       Iterator(buildBatch),
       gpuSorter,
       targetSize = 1024,
-      firstPassSortExp = new SplitAndRetryOOM())
+      firstPassSortExp = new GpuSplitAndRetryOOM())
     withResource(outCoreIter) { _ =>
       withResource(outCoreIter.next()) { cb =>
         // only one batch
@@ -83,12 +83,12 @@ class GpuSortRetrySuite extends RmmSparkRetrySuiteBase with MockitoSugar {
     }
   }
 
-  test("GPU out-of-core sort with retry when first-pass-split RetryOOM") {
+  test("GPU out-of-core sort with retry when first-pass-split GpuRetryOOM") {
     val outCoreIter = new GpuOutOfCoreSortIteratorThatThrows(
       Iterator(buildBatch),
       gpuSorter,
       targetSize = 1024,
-      firstPassSplitExp = new RetryOOM())
+      firstPassSplitExp = new GpuRetryOOM())
     withResource(outCoreIter) { _ =>
       withResource(outCoreIter.next()) { cb =>
         // only one batch
@@ -98,25 +98,25 @@ class GpuSortRetrySuite extends RmmSparkRetrySuiteBase with MockitoSugar {
     }
   }
 
-  test("GPU out-of-core sort throws when first-pass-split SplitAndRetryOOM") {
+  test("GPU out-of-core sort throws when first-pass-split GpuSplitAndRetryOOM") {
     val outCoreIter = new GpuOutOfCoreSortIteratorThatThrows(
       Iterator(buildBatch),
       gpuSorter,
       targetSize = 1024,
-      firstPassSplitExp = new SplitAndRetryOOM())
+      firstPassSplitExp = new GpuSplitAndRetryOOM())
     withResource(outCoreIter) { _ =>
-      assertThrows[SplitAndRetryOOM] {
+      assertThrows[GpuSplitAndRetryOOM] {
         outCoreIter.next()
       }
     }
   }
 
-  test("GPU out-of-core sort with retry when merge-sort-split RetryOOM") {
+  test("GPU out-of-core sort with retry when merge-sort-split GpuRetryOOM") {
     val outCoreIter = new GpuOutOfCoreSortIteratorThatThrows(
       Iterator(buildBatch, buildBatch),
       gpuSorter,
       targetSize = 400,
-      mergeSortExp = new RetryOOM())
+      mergeSortExp = new GpuRetryOOM())
     withResource(outCoreIter) { _ =>
       var numRows = 0
       while(outCoreIter.hasNext) {
@@ -128,25 +128,25 @@ class GpuSortRetrySuite extends RmmSparkRetrySuiteBase with MockitoSugar {
     }
   }
 
-  test("GPU out-of-core sort throws when merge-sort-split SplitAndRetryOOM") {
+  test("GPU out-of-core sort throws when merge-sort-split GpuSplitAndRetryOOM") {
     val outCoreIter = new GpuOutOfCoreSortIteratorThatThrows(
       Iterator(buildBatch, buildBatch),
       gpuSorter,
       targetSize = 400,
-      mergeSortExp = new SplitAndRetryOOM())
+      mergeSortExp = new GpuSplitAndRetryOOM())
     withResource(outCoreIter) { _ =>
-      assertThrows[SplitAndRetryOOM] {
+      assertThrows[GpuSplitAndRetryOOM] {
         outCoreIter.next()
       }
     }
   }
 
-  test("GPU out-of-core sort with retry when concat-output RetryOOM") {
+  test("GPU out-of-core sort with retry when concat-output GpuRetryOOM") {
     val outCoreIter = new GpuOutOfCoreSortIteratorThatThrows(
       Iterator(buildBatch, buildBatch),
       gpuSorter,
       targetSize = 400,
-      concatOutExp = new RetryOOM())
+      concatOutExp = new GpuRetryOOM())
     withResource(outCoreIter) { _ =>
       var numRows = 0
       while (outCoreIter.hasNext) {
@@ -158,14 +158,14 @@ class GpuSortRetrySuite extends RmmSparkRetrySuiteBase with MockitoSugar {
     }
   }
 
-  test("GPU out-of-core sort throws when concat-output SplitAndRetryOOM") {
+  test("GPU out-of-core sort throws when concat-output GpuSplitAndRetryOOM") {
     val outCoreIter = new GpuOutOfCoreSortIteratorThatThrows(
       Iterator(buildBatch, buildBatch),
       gpuSorter,
       targetSize = 400,
-      concatOutExp = new SplitAndRetryOOM())
+      concatOutExp = new GpuSplitAndRetryOOM())
     withResource(outCoreIter) { _ =>
-      assertThrows[SplitAndRetryOOM] {
+      assertThrows[GpuSplitAndRetryOOM] {
         outCoreIter.next()
       }
     }
@@ -206,7 +206,7 @@ class GpuSortRetrySuite extends RmmSparkRetrySuiteBase with MockitoSugar {
     }
   }
 
-  test("GPU each batch sort with RetryOOM") {
+  test("GPU each batch sort with GpuRetryOOM") {
     val eachBatchIter = new GpuSortEachBatchIterator(
       Iterator(buildBatch, buildBatch),
       gpuSorter,
@@ -228,14 +228,14 @@ class GpuSortRetrySuite extends RmmSparkRetrySuiteBase with MockitoSugar {
     }
   }
 
-  test("GPU each batch sort throws SplitAndRetryOOM") {
+  test("GPU each batch sort throws GpuSplitAndRetryOOM") {
     val inputIter = Iterator(buildBatch, buildBatch)
     val eachBatchIter = new GpuSortEachBatchIterator(
       inputIter,
       gpuSorter,
       singleBatch = false)
     RmmSpark.forceSplitAndRetryOOM(RmmSpark.getCurrentThreadId)
-    assertThrows[SplitAndRetryOOM] {
+    assertThrows[GpuSplitAndRetryOOM] {
       eachBatchIter.next()
     }
     inputIter.foreach(_.close())
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ProjectExprSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ProjectExprSuite.scala
index 45ecb8a70d8..971ab905f6e 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/ProjectExprSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/ProjectExprSuite.scala
@@ -65,7 +65,7 @@ class ProjectExprSuite extends SparkQueryCompareTestSuite {
   }
 
   test("basic retry") {
-    RmmSpark.associateCurrentThreadWithTask(0)
+    RmmSpark.currentThreadIsDedicatedToTask(0)
     try {
       val expr = GpuAlias(GpuAdd(
         GpuBoundReference(0, LongType, true)(NamedExpression.newExprId, "a"),
@@ -90,12 +90,12 @@ class ProjectExprSuite extends SparkQueryCompareTestSuite {
         }
       }
     } finally {
-      RmmSpark.removeThreadAssociation(0)
+      RmmSpark.removeCurrentDedicatedThreadAssociation(0)
     }
   }
 
   test("tiered retry") {
-    RmmSpark.associateCurrentThreadWithTask(0)
+    RmmSpark.currentThreadIsDedicatedToTask(0)
     try {
       val a = AttributeReference("a", LongType)()
       val b = AttributeReference("b", LongType)()
@@ -121,12 +121,12 @@ class ProjectExprSuite extends SparkQueryCompareTestSuite {
         }
       }
     } finally {
-      RmmSpark.removeThreadAssociation(0)
+      RmmSpark.removeCurrentDedicatedThreadAssociation(0)
     }
   }
 
   test("AST retry with split") {
-    RmmSpark.associateCurrentThreadWithTask(0)
+    RmmSpark.currentThreadIsDedicatedToTask(0)
     try {
       val a = AttributeReference("a", LongType)()
       val b = AttributeReference("b", LongType)()
@@ -166,7 +166,7 @@ class ProjectExprSuite extends SparkQueryCompareTestSuite {
         }
       }
     } finally {
-      RmmSpark.removeThreadAssociation(0)
+      RmmSpark.removeCurrentDedicatedThreadAssociation(0)
     }
   }
 
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RmmSparkRetrySuiteBase.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RmmSparkRetrySuiteBase.scala
index a8e0ad550ea..876c05f3631 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RmmSparkRetrySuiteBase.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RmmSparkRetrySuiteBase.scala
@@ -32,6 +32,7 @@ class RmmSparkRetrySuiteBase extends AnyFunSuite with BeforeAndAfterEach {
     super.beforeEach()
     SparkSession.getActiveSession.foreach(_.stop())
     SparkSession.clearActiveSession()
+    RmmSpark.clearEventHandler()
     if (!Rmm.isInitialized) {
       rmmWasInitialized = true
       Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, null, 512 * 1024 * 1024)
@@ -46,14 +47,14 @@ class RmmSparkRetrySuiteBase extends AnyFunSuite with BeforeAndAfterEach {
     RapidsBufferCatalog.setCatalog(catalog)
     val mockEventHandler = new BaseRmmEventHandler()
     RmmSpark.setEventHandler(mockEventHandler)
-    RmmSpark.associateThreadWithTask(RmmSpark.getCurrentThreadId, 1)
+    RmmSpark.currentThreadIsDedicatedToTask(1)
   }
 
   override def afterEach(): Unit = {
     super.afterEach()
     SparkSession.getActiveSession.foreach(_.stop())
     SparkSession.clearActiveSession()
-    RmmSpark.removeThreadAssociation(RmmSpark.getCurrentThreadId)
+    RmmSpark.removeAllCurrentThreadAssociation()
     RmmSpark.clearEventHandler()
     RapidsBufferCatalog.close()
     GpuSemaphore.shutdown()
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RowToColumnarIteratorRetrySuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RowToColumnarIteratorRetrySuite.scala
index d050a9944c7..5f7c84bf652 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RowToColumnarIteratorRetrySuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RowToColumnarIteratorRetrySuite.scala
@@ -16,7 +16,7 @@
 
 package com.nvidia.spark.rapids
 
-import com.nvidia.spark.rapids.jni.{RmmSpark, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{GpuSplitAndRetryOOM, RmmSpark}
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types._
@@ -39,7 +39,7 @@ class RowToColumnarIteratorRetrySuite extends RmmSparkRetrySuiteBase {
     val row2ColIter = new RowToColumnarIterator(
       rowIter, schema, RequireSingleBatch, new GpuRowToColumnConverter(schema))
     RmmSpark.forceSplitAndRetryOOM(RmmSpark.getCurrentThreadId)
-    assertThrows[SplitAndRetryOOM] {
+    assertThrows[GpuSplitAndRetryOOM] {
       row2ColIter.next()
     }
   }
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/WindowRetrySuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/WindowRetrySuite.scala
index 93892cfd7c1..e1f5b309d6e 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/WindowRetrySuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/WindowRetrySuite.scala
@@ -18,7 +18,7 @@ package com.nvidia.spark.rapids
 
 import ai.rapids.cudf._
 import com.nvidia.spark.rapids.Arm.withResource
-import com.nvidia.spark.rapids.jni.{RmmSpark, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{GpuSplitAndRetryOOM, RmmSpark}
 import org.mockito.Mockito._
 import org.scalatestplus.mockito.MockitoSugar
 
@@ -62,7 +62,7 @@ class WindowRetrySuite
     it
   }
 
-  test("row based window handles RetryOOM") {
+  test("row based window handles GpuRetryOOM") {
     val frame = GpuSpecifiedWindowFrame(
       RowFrame,
       GpuSpecialFrameBoundary(UnboundedPreceding),
@@ -83,7 +83,7 @@ class WindowRetrySuite
     }
   }
 
-  test("optimized-row based window handles RetryOOM") {
+  test("optimized-row based window handles GpuRetryOOM") {
     val frame = GpuSpecifiedWindowFrame(
       RowFrame,
       GpuSpecialFrameBoundary(UnboundedPreceding),
@@ -104,7 +104,7 @@ class WindowRetrySuite
     }
   }
 
-  test("ranged based window handles RetryOOM") {
+  test("ranged based window handles GpuRetryOOM") {
     val frame = GpuSpecifiedWindowFrame(
       RangeFrame,
       GpuLiteral.create(-1, IntegerType),
@@ -127,7 +127,7 @@ class WindowRetrySuite
     }
   }
 
-  test("SplitAndRetryOOM is not handled in doAggs") {
+  test("GpuSplitAndRetryOOM is not handled in doAggs") {
     val frame = GpuSpecifiedWindowFrame(
       RowFrame,
       GpuSpecialFrameBoundary(UnboundedPreceding),
@@ -135,14 +135,14 @@ class WindowRetrySuite
     val it = setupWindowIterator(frame)
     val inputBatch = it.onDeck.get
     RmmSpark.forceSplitAndRetryOOM(RmmSpark.getCurrentThreadId, 1)
-    assertThrows[SplitAndRetryOOM] {
+    assertThrows[GpuSplitAndRetryOOM] {
       it.next()
     }
     verify(inputBatch, times(1)).getColumnarBatch()
     verify(inputBatch, times(1)).close()
   }
 
-  test("row based group by window handles RetryOOM") {
+  test("row based group by window handles GpuRetryOOM") {
     val frame = GpuSpecifiedWindowFrame(
       RowFrame,
       GpuSpecialFrameBoundary(UnboundedPreceding),
@@ -172,7 +172,7 @@ class WindowRetrySuite
     }
   }
 
-  test("row-based group by running window handles SplitAndRetryOOM") {
+  test("row-based group by running window handles GpuSplitAndRetryOOM") {
     val runningFrame = GpuSpecifiedWindowFrame(RowFrame,
       GpuSpecialFrameBoundary(UnboundedPreceding), GpuSpecialFrameBoundary(CurrentRow))
     val boundOrderSpec = SortOrder(
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/WithRetrySuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/WithRetrySuite.scala
index c808a36af85..73618bd81ef 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/WithRetrySuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/WithRetrySuite.scala
@@ -20,7 +20,7 @@ import ai.rapids.cudf.{Rmm, RmmAllocationMode, RmmEventHandler, Table}
 import com.nvidia.spark.Retryable
 import com.nvidia.spark.rapids.Arm.withResource
 import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitTargetSizeInHalf, withRestoreOnRetry, withRetry, withRetryNoSplit}
-import com.nvidia.spark.rapids.jni.{RetryOOM, RmmSpark, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{GpuRetryOOM, GpuSplitAndRetryOOM, RmmSpark}
 import org.mockito.Mockito._
 import org.scalatest.BeforeAndAfterEach
 import org.scalatest.funsuite.AnyFunSuite
@@ -58,11 +58,11 @@ class WithRetrySuite
     RapidsBufferCatalog.setCatalog(catalog)
     val mockEventHandler = new BaseRmmEventHandler()
     RmmSpark.setEventHandler(mockEventHandler)
-    RmmSpark.associateThreadWithTask(RmmSpark.getCurrentThreadId, 1)
+    RmmSpark.currentThreadIsDedicatedToTask(1)
   }
 
   override def afterEach(): Unit = {
-    RmmSpark.removeThreadAssociation(RmmSpark.getCurrentThreadId)
+    RmmSpark.removeAllCurrentThreadAssociation()
     RmmSpark.clearEventHandler()
     RapidsBufferCatalog.close()
     if (rmmWasInitialized) {
@@ -117,7 +117,7 @@ class WithRetrySuite
         withRetry(myItems.iterator, mockSplitPolicy) { _ =>
           if (!didThrow) {
             didThrow = true
-            throw new SplitAndRetryOOM("in tests")
+            throw new GpuSplitAndRetryOOM("in tests")
           } else {
             throw new IllegalStateException("unhandled exception")
           }
@@ -136,10 +136,10 @@ class WithRetrySuite
 
   test("withRetry closes input on missing split policy") {
     val myItems = Seq(buildBatch, buildBatch)
-    assertThrows[SplitAndRetryOOM] {
+    assertThrows[GpuSplitAndRetryOOM] {
       try {
         withRetry(myItems.iterator, splitPolicy = null) { _ =>
-          throw new SplitAndRetryOOM("unhandled split-and-retry")
+          throw new GpuSplitAndRetryOOM("unhandled split-and-retry")
         }.toSeq
       } finally {
         verify(myItems.head, times(1)).close()
@@ -161,7 +161,7 @@ class WithRetrySuite
           myCheckpointable.value += increment
           if (!didThrow) {
             didThrow = true
-            throw new RetryOOM("in tests")
+            throw new GpuRetryOOM("in tests")
           }
         }
       }
@@ -183,7 +183,7 @@ class WithRetrySuite
             myCheckpointable.value += increment
             if (!didThrow) {
               val ex = new IllegalStateException()
-              ex.addSuppressed(new RetryOOM("causedby ex in tests"))
+              ex.addSuppressed(new GpuRetryOOM("causedby ex in tests"))
               throw ex
               didThrow = true
             }
@@ -208,7 +208,7 @@ class WithRetrySuite
           myCheckpointables.foreach(_.value += increment)
           if (!didThrow) {
             didThrow = true
-            throw new RetryOOM("in tests")
+            throw new GpuRetryOOM("in tests")
           }
         }
       }
@@ -233,7 +233,7 @@ class WithRetrySuite
             if (!didThrow) {
               didThrow = true
               val ex = new IllegalStateException()
-              ex.addSuppressed(new RetryOOM("causedby ex in tests"))
+              ex.addSuppressed(new GpuRetryOOM("causedby ex in tests"))
               throw ex
             }
           }
@@ -257,7 +257,7 @@ class WithRetrySuite
         lastSplitSize = attempt.targetSize
         if (doThrow > 0) {
           doThrow = doThrow - 1
-          throw new SplitAndRetryOOM("in tests")
+          throw new GpuSplitAndRetryOOM("in tests")
         }
       }.toSeq
     } finally {
@@ -274,12 +274,12 @@ class WithRetrySuite
     var lastSplitSize = 0L
     val myTarget = AutoCloseableTargetSize(initialValue, minValue)
     try {
-      assertThrows[SplitAndRetryOOM] {
+      assertThrows[GpuSplitAndRetryOOM] {
         withRetry(myTarget, splitTargetSizeInHalf) { attempt =>
           lastSplitSize = attempt.targetSize
           if (doThrow > 0) {
             doThrow = doThrow - 1
-            throw new SplitAndRetryOOM("in tests")
+            throw new GpuSplitAndRetryOOM("in tests")
           }
         }.toSeq
       }
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleClientSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleClientSuite.scala
index 22fee313b17..e9873b0bc5f 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleClientSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleClientSuite.scala
@@ -22,6 +22,7 @@ import ai.rapids.cudf.{DeviceMemoryBuffer, HostMemoryBuffer}
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.ShuffleMetadata
 import com.nvidia.spark.rapids.format.{BufferMeta, TableMeta}
+import com.nvidia.spark.rapids.jni.RmmSpark
 import org.mockito.{ArgumentCaptor, ArgumentMatchers}
 import org.mockito.ArgumentMatchers._
 import org.mockito.Mockito._
@@ -70,6 +71,9 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   test("successful metadata fetch") {
+    // This suite inherits from RmmSparkRetrySuiteBase, whose setup associates the current thread
+    // with a task. That setup is wrong for these tests, so remove the association first.
+    RmmSpark.removeAllCurrentThreadAssociation()
     when(mockTransaction.getStatus).thenReturn(TransactionStatus.Success)
     val shuffleRequests = RapidsShuffleTestHelper.getShuffleBlocks
     val contigBuffSize = 100000
@@ -77,6 +81,8 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
     val (tableMetas, response) =
       RapidsShuffleTestHelper.mockMetaResponse(mockTransaction, contigBuffSize, numBatches)
 
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
+
     // initialize metadata fetch
     client.doFetch(shuffleRequests.map(_._1).toSeq, mockHandler)
 
@@ -104,12 +110,17 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   test("successful degenerate metadata fetch") {
+    // This suite inherits from RmmSparkRetrySuiteBase, whose setup associates the current thread
+    // with a task. That setup is wrong for these tests, so remove the association first.
+    RmmSpark.removeAllCurrentThreadAssociation()
     when(mockTransaction.getStatus).thenReturn(TransactionStatus.Success)
     val shuffleRequests = RapidsShuffleTestHelper.getShuffleBlocks
     val numBatches = 3
 
     RapidsShuffleTestHelper.mockDegenerateMetaResponse(mockTransaction, numBatches)
 
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
+
     // initialize metadata fetch
     client.doFetch(shuffleRequests.map(_._1).toSeq, mockHandler)
 
@@ -128,6 +139,11 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   private def doTestErrorOrCancelledMetadataFetch(status: TransactionStatus.Value): Unit = {
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
+
     when(mockTransaction.getStatus).thenReturn(status)
     when(mockTransaction.getErrorMessage).thenReturn(Some("Error/cancel occurred"))
 
@@ -159,6 +175,10 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   test("exception in metadata fetch escalates to handler"){
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
     when(mockTransaction.getStatus).thenThrow(new RuntimeException("test exception"))
     val shuffleRequests = RapidsShuffleTestHelper.getShuffleBlocks
 
@@ -178,6 +198,10 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   test("successful buffer fetch") {
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
     when(mockTransaction.getStatus).thenReturn(TransactionStatus.Success)
 
     val numRows = 25001
@@ -250,6 +274,10 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   test("successful buffer fetch - but handler rejected it") {
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
     when(mockTransaction.getStatus).thenReturn(TransactionStatus.Success)
     when(mockHandler.batchReceived(any())).thenReturn(false) // reject incoming batches
 
@@ -294,6 +322,10 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   test("successful buffer fetch multi-buffer") {
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
     when(mockTransaction.getStatus).thenReturn(TransactionStatus.Success)
 
     val numRows = 500
@@ -346,6 +378,10 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   test("successful buffer fetch multi-buffer, larger than a single bounce buffer") {
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
     when(mockTransaction.getStatus).thenReturn(TransactionStatus.Success)
 
     val numRows = 500
@@ -399,6 +435,10 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   private def doTestErrorOrCancelledBufferFetch(status: TransactionStatus.Value): Unit = {
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
     when(mockTransaction.getStatus).thenReturn(status)
     when(mockTransaction.getErrorMessage).thenReturn(Some(s"Status is: ${status}"))
 
@@ -450,6 +490,7 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
     val mockTable = RapidsShuffleTestHelper.mockTableMeta(numRows)
     when(ptr.getLength).thenReturn(mockTable.bufferMeta().size())
     when(ptr.tableMeta).thenReturn(mockTable)
+    when(ptr.handler).thenReturn(mockHandler)
     val buff = HostMemoryBuffer.allocate(mockTable.bufferMeta().size())
     fillBuffer(buff)
     (ptr, buff, mockTable)
@@ -476,6 +517,10 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   def endToEndTest(buff: BounceBuffer,
                    expected: Seq[ReceivedBufferWindow],
                    ptrBuffs: Seq[(PendingTransferRequest, HostMemoryBuffer, TableMeta)]): Unit = {
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
+    when(mockHandler.getTaskIds).thenReturn(Array[Long](1))
     withResource(ptrBuffs.map(_._2)) { sources =>
       withResource(new BufferReceiveState(123, buff, ptrBuffs.map(_._1), () => {})) { br =>
         val blocks = sources.map(x => new MockBlock(x))
@@ -623,6 +668,9 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   test("on endpoint failure the iterator is notified if it is registered") {
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
     when(mockTransaction.getStatus).thenReturn(TransactionStatus.Success)
     val metaResp = ShuffleMetadata.buildMetaResponse(Seq.empty)
     when(mockTransaction.releaseMessage()).thenReturn(
@@ -637,6 +685,9 @@ class RapidsShuffleClientSuite extends RapidsShuffleTestHelper {
   }
 
   test("on endpoint failure the iterator is not notified if it is done (unregistered)") {
+    // This test inherits from RmmSparkRetrySuiteBase, but it sets up Rmm wrong for the current
+    // thread, so we have to fix it for all of the tests
+    RmmSpark.removeAllCurrentThreadAssociation()
     when(mockTransaction.getStatus).thenReturn(TransactionStatus.Success)
     val metaResp = ShuffleMetadata.buildMetaResponse(Seq.empty)
     when(mockTransaction.releaseMessage()).thenReturn(
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIteratorSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIteratorSuite.scala
index 4e0325f9048..70064682ed0 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIteratorSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/shuffle/RapidsShuffleIteratorSuite.scala
@@ -29,7 +29,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
   test("inability to get a client raises a fetch failure") {
     val taskId = 1
     try {
-      RmmSpark.associateCurrentThreadWithTask(taskId)
+      RmmSpark.currentThreadIsDedicatedToTask(taskId)
       val blocksByAddress = RapidsShuffleTestHelper.getBlocksByAddress
 
       val cl = new RapidsShuffleIterator(
@@ -39,6 +39,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
         blocksByAddress,
         testMetricsUpdater,
         Array.empty,
+        taskId,
         mockCatalog,
         123)
 
@@ -60,7 +61,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
   private def doTestErrorOrCancelledRaisesFetchFailure(status: TransactionStatus.Value): Unit = {
     val taskId = 1
     try {
-      RmmSpark.associateCurrentThreadWithTask(taskId)
+      RmmSpark.currentThreadIsDedicatedToTask(taskId)
       when(mockTransaction.getStatus).thenReturn(status)
 
       val blocksByAddress = RapidsShuffleTestHelper.getBlocksByAddress
@@ -72,6 +73,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
         blocksByAddress,
         testMetricsUpdater,
         Array.empty,
+        taskId,
         mockCatalog,
         123))
 
@@ -109,7 +111,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
   test("a transport exception raises a fetch failure with the cause exception") {
     val taskId = 1
     try {
-      RmmSpark.associateCurrentThreadWithTask(taskId)
+      RmmSpark.currentThreadIsDedicatedToTask(taskId)
       val blocksByAddress = RapidsShuffleTestHelper.getBlocksByAddress
 
       val cl = spy(new RapidsShuffleIterator(
@@ -119,6 +121,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
         blocksByAddress,
         testMetricsUpdater,
         Array.empty,
+        taskId,
         mockCatalog,
         123))
 
@@ -158,7 +161,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
   test("a timeout while waiting for batches raises a fetch failure") {
     val taskId = 1
     try {
-      RmmSpark.associateCurrentThreadWithTask(taskId)
+      RmmSpark.currentThreadIsDedicatedToTask(taskId)
       val blocksByAddress = RapidsShuffleTestHelper.getBlocksByAddress
 
       val cl = spy(new RapidsShuffleIterator(
@@ -168,6 +171,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
         blocksByAddress,
         testMetricsUpdater,
         Array.empty,
+        taskId,
         mockCatalog,
         123))
 
@@ -193,7 +197,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
   test("a new good batch is queued") {
     val taskId = 1
     try {
-      RmmSpark.associateCurrentThreadWithTask(taskId)
+      RmmSpark.currentThreadIsDedicatedToTask(taskId)
       val blocksByAddress = RapidsShuffleTestHelper.getBlocksByAddress
 
       val cl = new RapidsShuffleIterator(
@@ -203,6 +207,7 @@ class RapidsShuffleIteratorSuite extends RapidsShuffleTestHelper {
         blocksByAddress,
         testMetricsUpdater,
         Array.empty,
+        taskId,
         mockCatalog,
         123)
 
diff --git a/tests/src/test/scala/org/apache/spark/sql/rapids/GpuFileFormatDataWriterSuite.scala b/tests/src/test/scala/org/apache/spark/sql/rapids/GpuFileFormatDataWriterSuite.scala
index 37e71eeb1e4..5aaeae2c7b9 100644
--- a/tests/src/test/scala/org/apache/spark/sql/rapids/GpuFileFormatDataWriterSuite.scala
+++ b/tests/src/test/scala/org/apache/spark/sql/rapids/GpuFileFormatDataWriterSuite.scala
@@ -18,7 +18,7 @@ package org.apache.spark.sql.rapids
 import ai.rapids.cudf.TableWriter
 import com.nvidia.spark.rapids.{ColumnarOutputWriter, ColumnarOutputWriterFactory, GpuBoundReference, GpuColumnVector, RapidsBufferCatalog, RapidsDeviceMemoryStore, ScalableTaskCompletion}
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
-import com.nvidia.spark.rapids.jni.{RetryOOM, SplitAndRetryOOM}
+import com.nvidia.spark.rapids.jni.{GpuRetryOOM, GpuSplitAndRetryOOM}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.FSDataOutputStream
 import org.apache.hadoop.mapred.TaskAttemptContext
@@ -502,7 +502,7 @@ class GpuFileFormatDataWriterSuite extends AnyFunSuite with BeforeAndAfterEach {
         // throw once from bufferBatchAndClose to simulate an exception after we call the
         // stats tracker
         mockOutputWriter.throwOnNextBufferBatchAndClose(
-          new SplitAndRetryOOM("mocking a split and retry"))
+          new GpuSplitAndRetryOOM("mocking a split and retry"))
         val dynamicConcurrentWriter =
           prepareDynamicPartitionConcurrentWriter(maxWriters = 5, batchSize = 1)
 
@@ -510,7 +510,7 @@ class GpuFileFormatDataWriterSuite extends AnyFunSuite with BeforeAndAfterEach {
           dynamicConcurrentWriter.writeWithIterator(cbs.iterator)
           dynamicConcurrentWriter.commit()
         } else {
-          assertThrows[SplitAndRetryOOM] {
+          assertThrows[GpuSplitAndRetryOOM] {
             dynamicConcurrentWriter.writeWithIterator(cbs.iterator)
             dynamicConcurrentWriter.commit()
           }
@@ -557,11 +557,11 @@ class GpuFileFormatDataWriterSuite extends AnyFunSuite with BeforeAndAfterEach {
         when(mockJobDescription.statsTrackers)
             .thenReturn(Seq(jobTracker))
         when(statsTracker.newBatch(any(), any()))
-            .thenThrow(new RetryOOM("mocking a retry"))
+            .thenThrow(new GpuRetryOOM("mocking a retry"))
         val dynamicConcurrentWriter =
           prepareDynamicPartitionConcurrentWriter(maxWriters = 5, batchSize = 1)
 
-        assertThrows[RetryOOM] {
+        assertThrows[GpuRetryOOM] {
           dynamicConcurrentWriter.writeWithIterator(cbs.iterator)
           dynamicConcurrentWriter.commit()
         }

From 6d35e466f2ec40500ab78df1caf965be9ba311a2 Mon Sep 17 00:00:00 2001
From: Liangcai Li <firestarmanllc@gmail.com>
Date: Thu, 30 Nov 2023 09:12:20 +0800
Subject: [PATCH 06/15] Fix a hang for Pandas UDFs on DB 13.3[databricks]
 (#9833)

fix #9493
fix #9844

The Python runner uses two separate threads to write data to and read data from the Python processes.
On DB 13.3, however, it becomes single-threaded, which means reading and writing run on the same thread,
and the first read always happens before the first write. The original BatchQueue blocks the first read
until the first write is done, so on a single thread it waits forever.

Change made:

- Update the BatchQueue to support asking for a batch instead of waiting until one is inserted into the queue.
   This eliminates the ordering requirement between reading and writing.
- Introduce a new class named BatchProducer that works with the new BatchQueue to support peeking at
   the number of rows on demand when reading.
- Apply this new BatchQueue to relevant plans.
- Update the Python runners to support writing one batch at a time for the single-threaded model.
- Found an issue with PythonUDAF and RunningWindowFunctionExec that may be a bug specific to DB 13.3,
   and added a test (test_window_aggregate_udf_on_cpu) for it.
- Other small refactors
---------

Signed-off-by: Firestarman <firestarmanllc@gmail.com>
---
 .../src/main/python/spark_session.py          |   3 +
 integration_tests/src/main/python/udf_test.py |  17 +-
 .../execution/python/BatchGroupUtils.scala    |  70 +++++--
 .../python/GpuAggregateInPandasExec.scala     |  95 ++++-----
 .../python/GpuArrowEvalPythonExec.scala       | 183 +++++++++++++-----
 .../python/GpuArrowPythonRunner.scala         | 145 +++++++++-----
 .../python/GpuWindowInPandasExecBase.scala    |  33 ++--
 .../spark/rapids/shims/PythonUDFShim.scala    |   1 -
 .../shims/GpuCoGroupedArrowPythonRunner.scala |   2 +-
 .../python/shims/GpuPythonArrowShims.scala    |  29 +--
 .../rapids/shims/GpuWindowInPandasExec.scala  |  40 ++--
 .../shims/GpuGroupUDFArrowPythonRunner.scala  |   4 +-
 .../rapids/shims/Spark341PlusDBShims.scala    |  31 ++-
 .../shims/GpuCoGroupedArrowPythonRunner.scala |  63 +++---
 .../shims/GpuGroupUDFArrowPythonRunner.scala  |  47 ++---
 .../python/shims/GpuPythonArrowShims.scala    |  30 +--
 16 files changed, 491 insertions(+), 302 deletions(-)

diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py
index aa27503c8eb..50eaa7c49a9 100644
--- a/integration_tests/src/main/python/spark_session.py
+++ b/integration_tests/src/main/python/spark_session.py
@@ -158,6 +158,9 @@ def is_spark_330_or_later():
 def is_spark_340_or_later():
     return spark_version() >= "3.4.0"
 
+def is_spark_341():
+    return spark_version() == "3.4.1"
+
 def is_spark_350_or_later():
     return spark_version() >= "3.5.0"
 
diff --git a/integration_tests/src/main/python/udf_test.py b/integration_tests/src/main/python/udf_test.py
index 88281279162..9e3f5d05bcc 100644
--- a/integration_tests/src/main/python/udf_test.py
+++ b/integration_tests/src/main/python/udf_test.py
@@ -15,7 +15,7 @@
 import pytest
 
 from conftest import is_at_least_precommit_run, is_not_utc
-from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_340_or_later
+from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_341
 
 from pyspark.sql.pandas.utils import require_minimum_pyarrow_version, require_minimum_pandas_version
 
@@ -43,12 +43,6 @@
 import pyarrow
 from typing import Iterator, Tuple
 
-
-if is_databricks_runtime() and is_spark_340_or_later():
-    # Databricks 13.3 does not use separate reader/writer threads for Python UDFs
-    # which can lead to hangs. Skipping these tests until the Python UDF handling is updated.
-    pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493")
-
 arrow_udf_conf = {
     'spark.sql.execution.arrow.pyspark.enabled': 'true',
     'spark.rapids.sql.exec.WindowInPandasExec': 'true',
@@ -182,7 +176,10 @@ def group_size_udf(to_process: pd.Series) -> int:
 
 low_upper_win = Window.partitionBy('a').orderBy('b').rowsBetween(-3, 3)
 
-udf_windows = [no_part_win, unbounded_win, cur_follow_win, pre_cur_win, low_upper_win]
+running_win_param = pytest.param(pre_cur_win, marks=pytest.mark.xfail(
+    condition=is_databricks_runtime() and is_spark_341(),
+    reason='DB13.3 wrongly uses RunningWindowFunctionExec to evaluate a PythonUDAF and it will fail even on CPU'))
+udf_windows = [no_part_win, unbounded_win, cur_follow_win, running_win_param, low_upper_win]
 window_ids = ['No_Partition', 'Unbounded', 'Unbounded_Following', 'Unbounded_Preceding',
               'Lower_Upper']
 
@@ -338,8 +335,8 @@ def create_df(spark, data_gen, left_length, right_length):
 @ignore_order
 @pytest.mark.parametrize('data_gen', [ShortGen(nullable=False)], ids=idfn)
 def test_cogroup_apply_udf(data_gen):
-    def asof_join(l, r):
-        return pd.merge_asof(l, r, on='a', by='b')
+    def asof_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
+        return pd.merge_ordered(left, right)
 
     def do_it(spark):
         left, right = create_df(spark, data_gen, 500, 500)
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/BatchGroupUtils.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/BatchGroupUtils.scala
index b97415b31ba..132c46b9ba7 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/BatchGroupUtils.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/BatchGroupUtils.scala
@@ -22,12 +22,14 @@ import ai.rapids.cudf
 import com.nvidia.spark.rapids._
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
+import com.nvidia.spark.rapids.RmmRapidsRetryIterator.withRetryNoSplit
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
 import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.rapids.execution.GpuSubPartitionHashJoin
 import org.apache.spark.sql.rapids.execution.python.shims.GpuPythonArrowOutput
 import org.apache.spark.sql.rapids.shims.DataTypeUtilsShim
 import org.apache.spark.sql.vectorized.ColumnarBatch
@@ -398,34 +400,68 @@ class CombiningIterator(
     numOutputRows: GpuMetric,
     numOutputBatches: GpuMetric) extends Iterator[ColumnarBatch] {
 
-  // For `hasNext` we are waiting on the queue to have something inserted into it
-  // instead of waiting for a result to be ready from Python. The reason for this
-  // is to let us know the target number of rows in the batch that we want when reading.
-  // It is a bit hacked up but it works. In the future when we support spilling we should
-  // store the number of rows separate from the batch. That way we can get the target batch
-  // size out without needing to grab the GpuSemaphore which we cannot do if we might block
-  // on a read operation.
-  override def hasNext: Boolean = inputBatchQueue.hasNext || pythonOutputIter.hasNext
+  // This is only for the input.
+  private var pendingInput: Option[SpillableColumnarBatch] = None
+  Option(TaskContext.get()).foreach(onTaskCompletion(_)(pendingInput.foreach(_.close())))
+
+  // The Python output should line up row for row, so we only look at the Python output
+  // iterator. There is no need to check `pendingInput`, which will be consumed when draining
+  // the Python output.
+  override def hasNext: Boolean = pythonOutputIter.hasNext
 
   override def next(): ColumnarBatch = {
-    val numRows = inputBatchQueue.peekBatchSize
+    val numRows = inputBatchQueue.peekBatchNumRows()
     // Updates the expected batch size for next read
-    pythonArrowReader.setMinReadTargetBatchSize(numRows)
+    pythonArrowReader.setMinReadTargetNumRows(numRows)
     // Reads next batch from Python and combines it with the input batch by the left side.
     withResource(pythonOutputIter.next()) { cbFromPython =>
-      assert(cbFromPython.numRows() == numRows)
-      withResource(inputBatchQueue.remove()) { origBatch =>
+      // Here we may get a batch that has more rows than the current input batch.
+      assert(cbFromPython.numRows() >= numRows,
+        s"Expects >=$numRows rows but got ${cbFromPython.numRows()} from the Python worker")
+      withResource(concatInputBatch(cbFromPython.numRows())) { concated =>
         numOutputBatches += 1
         numOutputRows += numRows
-        combine(origBatch, cbFromPython)
+        GpuColumnVector.combineColumns(concated, cbFromPython)
       }
     }
   }
 
-  private def combine(lBatch: ColumnarBatch, rBatch: ColumnarBatch): ColumnarBatch = {
-    val lColumns = GpuColumnVector.extractColumns(lBatch).map(_.incRefCount())
-    val rColumns = GpuColumnVector.extractColumns(rBatch).map(_.incRefCount())
-    new ColumnarBatch(lColumns ++ rColumns, lBatch.numRows())
+  private def concatInputBatch(targetNumRows: Int): ColumnarBatch = {
+    withResource(mutable.ArrayBuffer[SpillableColumnarBatch]()) { buf =>
+      var curNumRows = pendingInput.map(_.numRows()).getOrElse(0)
+      pendingInput.foreach(buf.append(_))
+      pendingInput = None
+      while (curNumRows < targetNumRows) {
+        val scb = inputBatchQueue.remove()
+        if (scb != null) {
+          buf.append(scb)
+          curNumRows = curNumRows + scb.numRows()
+        }
+      }
+      assert(buf.nonEmpty, "The input queue is empty")
+
+      if (curNumRows > targetNumRows) {
+        // Need to split the last batch
+        val Array(first, second) = withRetryNoSplit(buf.remove(buf.size - 1)) { lastScb =>
+          val splitIdx = lastScb.numRows() - (curNumRows - targetNumRows)
+          withResource(lastScb.getColumnarBatch()) { lastCb =>
+            val batchTypes = GpuColumnVector.extractTypes(lastCb)
+            withResource(GpuColumnVector.from(lastCb)) { table =>
+              table.contiguousSplit(splitIdx).safeMap(
+                SpillableColumnarBatch(_, batchTypes, SpillPriorities.ACTIVE_ON_DECK_PRIORITY))
+            }
+          }
+        }
+        buf.append(first)
+        pendingInput = Some(second)
+      }
+
+      val ret = GpuSubPartitionHashJoin.concatSpillBatchesAndClose(buf.toSeq)
+      // "ret" should be non empty because we checked the buf is not empty ahead.
+      withResource(ret.get) { concatedScb =>
+        concatedScb.getColumnarBatch()
+      }
+    } // end of withResource(mutable.ArrayBuffer)
   }
 
 }
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuAggregateInPandasExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuAggregateInPandasExec.scala
index 9c02d231706..caf323ec053 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuAggregateInPandasExec.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuAggregateInPandasExec.scala
@@ -20,9 +20,8 @@ import scala.collection.mutable.ArrayBuffer
 
 import ai.rapids.cudf
 import com.nvidia.spark.rapids._
-import com.nvidia.spark.rapids.Arm.withResource
+import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
-import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
 import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
 import com.nvidia.spark.rapids.shims.ShimUnaryExecNode
 
@@ -141,9 +140,7 @@ case class GpuAggregateInPandasExec(
 
     // Start processing
     child.executeColumnar().mapPartitionsInternal { inputIter =>
-      val queue: BatchQueue = new BatchQueue()
       val context = TaskContext.get()
-      onTaskCompletion(queue.close())
 
       if (isPythonOnGpuEnabled) {
         GpuPythonHelper.injectGpuInfo(pyFuncs, isPythonOnGpuEnabled)
@@ -164,51 +161,56 @@ case class GpuAggregateInPandasExec(
       }
 
       // Second splits into separate group batches.
-      val miniAttrs = gpuGroupingExpressions ++ allInputs
-      val pyInputIter = BatchGroupedIterator(miniIter, miniAttrs.asInstanceOf[Seq[Attribute]],
-          groupingRefs.indices)
-        .map { groupedBatch =>
-          // Resolves the group key and the python input from a grouped batch. Then
-          //  - Caches the key to be combined with the Python output later. And
-          //  - Returns the python input to be sent to Python later.
-          withResource(groupedBatch) { grouped =>
-            // key batch.
-            // No `safeMap` because here does not increase the ref count.
-            // (`Seq.indices.map()` is NOT lazy, so it is safe to be used to slice the columns.)
-            val keyCudfColumns = groupingRefs.indices.map(
-              grouped.column(_).asInstanceOf[GpuColumnVector].getBase)
-            val keyBatch = if (keyCudfColumns.isEmpty) {
-              // No grouping columns, then the whole batch is a group. Returns the dedicated batch
-              // as the group key.
-              // This batch means there is only one empty row, just like the 'new UnsafeRow()'
-              // used in Spark. The row number setting to 1 is because Python returns only one row
-              // as the aggregate result for the whole batch, and 'CombiningIterator' requires the
-              // the same row number for both the key batch and the result batch to be combined.
-              new ColumnarBatch(Array(), 1)
-            } else {
-              // Uses `cudf.Table.gather` to pick the first row in each group as the group key.
-              // Doing this is because
-              //   - The Python worker produces only one row as the aggregate result,
-              //   - The key rows in a group are equal to each other.
-              //
-              // (Now this is done group by group, so the performance would not be good when
-              //  there are too many small groups.)
-              withResource(new cudf.Table(keyCudfColumns: _*)) { table =>
-                withResource(cudf.ColumnVector.fromInts(0)) { gatherMap =>
-                  withResource(table.gather(gatherMap)) { oneRowTable =>
-                    GpuColumnVector.from(oneRowTable, groupingRefs.map(_.dataType).toArray)
-                  }
-                }
+      val miniAttrs = (gpuGroupingExpressions ++ allInputs).asInstanceOf[Seq[Attribute]]
+      val keyConverter = (groupedBatch: ColumnarBatch) => {
+        // No `safeMap` because here does not increase the ref count.
+        // (`Seq.indices.map()` is NOT lazy, so it is safe to be used to slice the columns.)
+        val keyCudfColumns = groupingRefs.indices.map(
+          groupedBatch.column(_).asInstanceOf[GpuColumnVector].getBase)
+        if (keyCudfColumns.isEmpty) {
+          // No grouping columns, then the whole batch is a group. Returns the dedicated batch
+          // as the group key.
+          // This batch means there is only one empty row, just like the 'new UnsafeRow()'
+          // used in Spark. The row number setting to 1 is because Python returns only one row
+          // as the aggregate result for the whole batch, and 'CombiningIterator' requires the
+          // the same row number for both the key batch and the result batch to be combined.
+          new ColumnarBatch(Array(), 1)
+        } else {
+          // Uses `cudf.Table.gather` to pick the first row in each group as the group key.
+          // Doing this is because
+          //   - The Python worker produces only one row as the aggregate result,
+          //   - The key rows in a group are equal to each other.
+          //
+          // (Now this is done group by group, so the performance would not be good when
+          //  there are too many small groups.)
+          withResource(new cudf.Table(keyCudfColumns: _*)) { table =>
+            withResource(cudf.ColumnVector.fromInts(0)) { gatherMap =>
+              withResource(table.gather(gatherMap)) { oneRowTable =>
+                GpuColumnVector.from(oneRowTable, groupingRefs.map(_.dataType).toArray)
               }
             }
-            queue.add(keyBatch)
+          }
+        }
+      }
 
-            // Python input batch
-            val pyInputColumns = pyInputRefs.indices.safeMap { idx =>
-              grouped.column(idx + groupingRefs.size).asInstanceOf[GpuColumnVector].incRefCount()
-            }
-            new ColumnarBatch(pyInputColumns.toArray, groupedBatch.numRows())
+      val batchProducer = new BatchProducer(
+        BatchGroupedIterator(miniIter, miniAttrs, groupingRefs.indices))
+      val queue = new BatchQueue(batchProducer, Some(keyConverter))
+      val pyInputIter = batchProducer.asIterator.map { case (batch, isForPeek) =>
+        val inputBatch = closeOnExcept(batch) { _ =>
+          val pyInputColumns = pyInputRefs.indices.safeMap { idx =>
+            batch.column(idx + groupingRefs.size).asInstanceOf[GpuColumnVector].incRefCount()
           }
+          new ColumnarBatch(pyInputColumns.toArray, batch.numRows())
+        }
+        if (isForPeek) {
+          batch.close()
+        } else {
+          // When adding batch to the queue, queue will convert it to a key batch because this
+          // queue is constructed with the key converter.
+          queue.add(batch)
+        }
+        inputBatch
       }
 
       // Third, sends to Python to execute the aggregate and returns the result.
@@ -223,8 +225,7 @@ case class GpuAggregateInPandasExec(
           pythonRunnerConf,
           // The whole group data should be written in a single call, so here is unlimited
           Int.MaxValue,
-          DataTypeUtilsShim.fromAttributes(pyOutAttributes),
-          () => queue.finish())
+          DataTypeUtilsShim.fromAttributes(pyOutAttributes))
 
         val pyOutputIterator = pyRunner.compute(pyInputIter, context.partitionId(), context)
 
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowEvalPythonExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowEvalPythonExec.scala
index 45fec7c81d2..60b6b3929e1 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowEvalPythonExec.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowEvalPythonExec.scala
@@ -24,7 +24,7 @@ import scala.collection.mutable.ArrayBuffer
 
 import ai.rapids.cudf._
 import com.nvidia.spark.rapids._
-import com.nvidia.spark.rapids.Arm.withResource
+import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
 import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
@@ -171,62 +171,143 @@ class RebatchingRoundoffIterator(
 }
 
 /**
- * A simple queue that holds the pending batches that need to line up with
- * and combined with batches coming back from python
+ * Work with BatchQueue to support BatchQueue's peek operation by pulling
+ * in a batch from the input iterator on demand.
+ *
+ * It also supports accessing batches from the input by an iterator. Call
+ * "asIterator" to get the iterator. This iterator will return a tuple of
+ * ColumnarBatch and Boolean. And the boolean indicates whether the batch
+ * is pulled in for peek.
  */
-class BatchQueue extends AutoCloseable {
-  private val queue: mutable.Queue[SpillableColumnarBatch] =
-    mutable.Queue[SpillableColumnarBatch]()
-  private var isSet = false
-
-  def add(batch: ColumnarBatch): Unit = synchronized {
-    queue.enqueue(SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_ON_DECK_PRIORITY))
-    if (!isSet) {
-      // Wake up anyone waiting for the first batch.
-      isSet = true
-      notifyAll()
+class BatchProducer(input: Iterator[ColumnarBatch]) extends AutoCloseable { producer =>
+
+  Option(TaskContext.get()).foreach(onTaskCompletion(_)(close()))
+
+  // Cache for batches pulled in by the "produce" call for the peek operation.
+  // In fact, there is usually only one batch. A queue is used here because, in
+  // theory, "produce" can be called multiple times, so more than one batch can be
+  // pulled in.
+  private val pending = mutable.Queue[SpillableColumnarBatch]()
+
+  private[rapids] def produce(): ColumnarBatch = producer.synchronized {
+    if (input.hasNext) {
+      val cb = input.next()
+      // Need to duplicate this batch for "next"
+      pending.enqueue(SpillableColumnarBatch(GpuColumnVector.incRefCounts(cb),
+        SpillPriorities.ACTIVE_ON_DECK_PRIORITY))
+      cb
+    } else {
+      null
     }
   }
 
-  def finish(): Unit = synchronized {
-    if (!isSet) {
-      // Wake up anyone waiting for the first batch.
-      isSet = true
-      notifyAll()
+  def asIterator: Iterator[(ColumnarBatch, Boolean)] = {
+    new Iterator[(ColumnarBatch, Boolean)] {
+
+      override def hasNext: Boolean = producer.synchronized {
+        pending.nonEmpty || input.hasNext
+      }
+
+      override def next(): (ColumnarBatch, Boolean) = producer.synchronized {
+        if (!hasNext) {
+          throw new NoSuchElementException()
+        }
+        if (pending.nonEmpty) {
+          withResource(pending.dequeue()) { scb =>
+            (scb.getColumnarBatch(), true)
+          }
+        } else {
+          (input.next(), false)
+        }
+      }
+    }
+  }
+
+  override def close(): Unit = synchronized {
+    while(pending.nonEmpty) {
+      pending.dequeue().close()
     }
   }
+}
 
-  def remove(): ColumnarBatch = synchronized {
+/**
+ * A simple queue that holds the pending batches that need to line up with
+ * and be combined with batches coming back from python.
+ *
+ * It will ask for a batch from "batchProducer" when peeking the number of rows
+ * while the queue is empty.
+ * It also supports an optional converter to convert the input batch and save
+ * the converted batch. This is designed for the GpuAggregateInPandasExec to save
+ * the group key instead of the original input batch.
+ */
+class BatchQueue(
+    batchProducer: BatchProducer,
+    converter: Option[ColumnarBatch => ColumnarBatch] = None
+) extends AutoCloseable {
+
+  assert(batchProducer != null, "BatchQueue requires a BatchProducer")
+  Option(TaskContext.get()).foreach(onTaskCompletion(_)(close()))
+
+  private val queue = mutable.ArrayBuffer[SpillableColumnarBatch]()
+
+  private[this] def convertIfAny(batch: ColumnarBatch): ColumnarBatch = {
+    converter.map { convert =>
+      withResource(batch)(convert)
+    }.getOrElse(batch)
+  }
+
+  /** Add a batch to the queue, the input batch will be taken over, do not use it anymore */
+  def add(batch: ColumnarBatch): Unit = {
+    val cb = convertIfAny(batch)
+    this.synchronized {
+      queue.append(SpillableColumnarBatch(cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY))
+    }
+  }
+
+  /** Return and remove the first batch in the cache. */
+  def remove(): SpillableColumnarBatch = synchronized {
     if (queue.isEmpty) {
       null
     } else {
-      withResource(queue.dequeue()) { scp =>
-        scp.getColumnarBatch()
-      }
+      queue.remove(0)
     }
   }
 
-  def hasNext: Boolean = synchronized {
-    if (!isSet) {
-      wait()
+  /** Get the number of rows in the next batch, without actually getting the batch. */
+  def peekBatchNumRows(): Int = {
+    val isEmpty = this.synchronized {
+      queue.isEmpty
+    }
+    if (isEmpty) {
+      // Try to ask for the next batch instead of waiting for the python
+      // runner's writing to insert one, because the writing may happen
+      // after this peek in the single-threaded python runner, leading
+      // to a hang.
+      // Do not call it inside a lock to avoid any deadlock.
+      val nextBatch = batchProducer.produce()
+      if (nextBatch != null) {
+        val cb = convertIfAny(nextBatch)
+        this.synchronized {
+          // Since we release the lock for some time, it is possible some batches
+          // have been added into the queue. Then we need to make sure this batch
+          // is the first one.
+          queue.insert(0, SpillableColumnarBatch(cb, SpillPriorities.ACTIVE_ON_DECK_PRIORITY))
+        }
+      }
     }
-    queue.nonEmpty
-  }
 
-  /**
-   * Get the number of rows in the next batch, without actually getting the batch.
-   */
-  def peekBatchSize: Int = synchronized {
-    queue.head.numRows()
+    this.synchronized {
+      if (queue.nonEmpty) {
+        queue.head.numRows()
+      } else {
+        0 // Should not go here but just in case.
+      }
+    }
   }
 
   override def close(): Unit = synchronized {
-    if (!isSet) {
-      isSet = true
-      notifyAll()
-    }
-    while(queue.nonEmpty) {
-      queue.dequeue().close()
+    while (queue.nonEmpty) {
+      queue.remove(0).close()
     }
   }
 }
@@ -285,10 +366,7 @@ case class GpuArrowEvalPythonExec(
 
     val inputRDD = child.executeColumnar()
     inputRDD.mapPartitions { iter =>
-      val queue: BatchQueue = new BatchQueue()
       val context = TaskContext.get()
-      onTaskCompletion(context)(queue.close())
-
       val (pyFuncs, inputs) = udfs.map(collectFunctions).unzip
 
       // Not sure why we are doing this in every task.  It is not going to change, but it might
@@ -318,13 +396,21 @@ case class GpuArrowEvalPythonExec(
       }.toArray)
 
       val boundReferences = GpuBindReferences.bindReferences(allInputs.toSeq, childOutput)
-      val batchedIterator = new RebatchingRoundoffIterator(iter, inputSchema, targetBatchSize,
-        numInputRows, numInputBatches)
-      val pyInputIterator = batchedIterator.map { batch =>
+      val batchProducer = new BatchProducer(
+        new RebatchingRoundoffIterator(iter, inputSchema, targetBatchSize, numInputRows,
+          numInputBatches))
+      val queue = new BatchQueue(batchProducer)
+      val pyInputIterator = batchProducer.asIterator.map { case (batch, isForPeek) =>
         // We have to do the project before we add the batch because the batch might be closed
         // when it is added
-        val ret = GpuProjectExec.project(batch, boundReferences)
-        queue.add(batch)
+        val ret = closeOnExcept(batch)(GpuProjectExec.project(_, boundReferences))
+        if (isForPeek) {
+          batch.close()
+        } else {
+          // We only add the batch that is not for peek, because the batch for peek is already
+          // added by the reader when peeking the number of rows of the next batch.
+          queue.add(batch)
+        }
         ret
       }
 
@@ -342,8 +428,7 @@ case class GpuArrowEvalPythonExec(
           timeZone,
           runnerConf,
           targetBatchSize,
-          pythonOutputSchema,
-          () => queue.finish())
+          pythonOutputSchema)
 
         val outputIterator = pyRunner.compute(pyInputIterator, context.partitionId(), context)
         new CombiningIterator(queue, outputIterator, pyRunner, numOutputRows,
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala
index b323ac62843..7eb1803bf17 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowPythonRunner.scala
@@ -29,10 +29,11 @@ import org.apache.arrow.vector.ipc.ArrowStreamWriter
 
 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python._
+import org.apache.spark.internal.Logging
 import org.apache.spark.rapids.shims.api.python.ShimBasePythonRunner
 import org.apache.spark.sql.execution.python.PythonUDFRunner
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.rapids.execution.python.shims.{GpuArrowPythonRunner, GpuPythonArrowOutput}
+import org.apache.spark.sql.rapids.execution.python.shims.GpuPythonArrowOutput
 import org.apache.spark.sql.rapids.shims.ArrowUtilsShim
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.util.ArrowUtils
@@ -100,8 +101,7 @@ abstract class GpuArrowPythonRunnerBase(
     timeZoneId: String,
     conf: Map[String, String],
     batchSize: Long,
-    pythonOutSchema: StructType = null,
-    onDataWriteFinished: () => Unit = null)
+    pythonOutSchema: StructType = null)
   extends GpuPythonRunnerBase[ColumnarBatch](funcs, evalType, argOffsets)
     with GpuPythonArrowOutput {
 
@@ -119,10 +119,12 @@ abstract class GpuArrowPythonRunnerBase(
       env: SparkEnv,
       inputIterator: Iterator[ColumnarBatch],
       partitionIndex: Int,
-      context: TaskContext) {
+      context: TaskContext) extends Logging {
 
-    def writeCommand(dataOut: DataOutputStream): Unit = {
+    private[this] var tableWriter: TableWriter = _
+    private[this] lazy val isInputNonEmpty = inputIterator.nonEmpty
 
+    def writeCommand(dataOut: DataOutputStream): Unit = {
       // Write config for the worker as a number of key -> value pairs of strings
       dataOut.writeInt(conf.size)
       for ((k, v) <- conf) {
@@ -133,25 +135,67 @@ abstract class GpuArrowPythonRunnerBase(
       PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets)
     }
 
-    def writeInputToStream(dataOut: DataOutputStream): Boolean = {
-      if (inputIterator.nonEmpty) {
-        writeNonEmptyIteratorOnGpu(dataOut)
-      } else { // Partition is empty.
-        // In this case CPU will still send the schema to Python workers by calling
-        // the "start" API of the Java Arrow writer, but GPU will send out nothing,
-        // leading to the IPC error. And it is not easy to do as what Spark does on
-        // GPU, because the C++ Arrow writer used by GPU will only send out the schema
-        // iff there is some data. Besides, it does not expose a "start" API to do this.
-        // So here we leverage the Java Arrow writer to do similar things as Spark.
-        // It is OK because sending out schema has nothing to do with GPU.
+    /**
+     * Write all the batches into the stream at one time for the two-threaded PythonRunner.
+     * This will be called only once.
+     */
+    def writeIteratorToStream(dataOut: DataOutputStream): Unit = {
+      if (isInputNonEmpty) {
+        initTableWriter(dataOut)
+        logDebug("GpuPythonRunner starts to write all batches to the stream.")
+        Utils.tryWithSafeFinally {
+          while (inputIterator.hasNext) {
+            writeBatchToStreamAndClose(inputIterator.next())
+          }
+        } {
+          dataOut.flush()
+          close()
+        }
+      } else {
+        logDebug("GpuPythonRunner writes nothing to stream because the input is empty.")
         writeEmptyIteratorOnCpu(dataOut)
-        // Returning false because nothing was written
+        // The iterator can grab the semaphore even on an empty batch
+        GpuSemaphore.releaseIfNecessary(TaskContext.get())
+      }
+      logDebug("GpuPythonRunner writing is done.")
+    }
+
+    /**
+     * Write one batch each time for the single-threaded PythonRunner.
+     * This will be called repeatedly as long as it returns true.
+     * See https://issues.apache.org/jira/browse/SPARK-44705
+     */
+    def writeNextInputToStream(dataOut: DataOutputStream): Boolean = {
+      if (isInputNonEmpty) {
+        initTableWriter(dataOut)
+        try {
+          if (inputIterator.hasNext) {
+            logDebug("GpuPythonRunner[single-threaded] write a batch to the stream.")
+            writeBatchToStreamAndClose(inputIterator.next())
+            dataOut.flush()
+            true
+          } else { // all batches are written, close the writer
+            logDebug("GpuPythonRunner[single-threaded] writing is done.")
+            close()
+            false
+          }
+        } catch {
+          case t: Throwable =>
+            close()
+            throw t
+        }
+      } else {
+        logDebug("GpuPythonRunner[single-threaded] writes nothing to stream because" +
+          " the input is empty.")
+        writeEmptyIteratorOnCpu(dataOut)
+        // The iterator can grab the semaphore even on an empty batch
+        GpuSemaphore.releaseIfNecessary(TaskContext.get())
         false
       }
     }
 
-    private def writeNonEmptyIteratorOnGpu(dataOut: DataOutputStream): Boolean = {
-      val writer = {
+    private def initTableWriter(dataOut: DataOutputStream): Unit = {
+      if (tableWriter == null) {
         val builder = ArrowIPCWriterOptions.builder()
         builder.withMaxChunkSize(batchSize)
         builder.withCallback((table: Table) => {
@@ -159,40 +203,44 @@ abstract class GpuArrowPythonRunnerBase(
           GpuSemaphore.releaseIfNecessary(TaskContext.get())
         })
         // Flatten the names of nested struct columns, required by cudf arrow IPC writer.
-        GpuArrowPythonRunner.flattenNames(pythonInSchema).foreach { case (name, nullable) =>
+        GpuPythonRunnerUtils.flattenNames(pythonInSchema).foreach { case (name, nullable) =>
           if (nullable) {
             builder.withColumnNames(name)
           } else {
             builder.withNotNullableColumnNames(name)
           }
         }
-        Table.writeArrowIPCChunked(builder.build(), new BufferToStreamWriter(dataOut))
+        tableWriter =
+          Table.writeArrowIPCChunked(builder.build(), new BufferToStreamWriter(dataOut))
       }
+    }
 
-      var wrote = false
-      Utils.tryWithSafeFinally {
-        while (inputIterator.hasNext) {
-          wrote = false
-          val table = withResource(inputIterator.next()) { nextBatch =>
-            GpuColumnVector.from(nextBatch)
-          }
-          withResource(new NvtxRange("write python batch", NvtxColor.DARK_GREEN)) { _ =>
-            // The callback will handle closing table and releasing the semaphore
-            writer.write(table)
-            wrote = true
-          }
-        }
-        // The iterator can grab the semaphore even on an empty batch
-        GpuSemaphore.releaseIfNecessary(TaskContext.get())
-      } {
-        writer.close()
-        dataOut.flush()
-        if (onDataWriteFinished != null) onDataWriteFinished()
+    private def writeBatchToStreamAndClose(batch: ColumnarBatch): Unit = {
+      val table = withResource(batch) { nextBatch =>
+        GpuColumnVector.from(nextBatch)
+      }
+      withResource(new NvtxRange("write python batch", NvtxColor.DARK_GREEN)) { _ =>
+        // The callback will handle closing table and releasing the semaphore
+        tableWriter.write(table)
+      }
+    }
+
+    private def close(): Unit = {
+      if (tableWriter != null) {
+        tableWriter.close()
+        tableWriter = null
       }
-      wrote
     }
 
     private def writeEmptyIteratorOnCpu(dataOut: DataOutputStream): Unit = {
+      // For the case where the partition is empty.
+      // In this case CPU will still send the schema to Python workers by calling
+      // the "start" API of the Java Arrow writer, but GPU will send out nothing,
+      // leading to the IPC error. And it is not easy to do as what Spark does on
+      // GPU, because the C++ Arrow writer used by GPU will only send out the schema
+      // iff there is some data. Besides, it does not expose a "start" API to do this.
+      // So here we leverage the Java Arrow writer to do similar things as Spark.
+      // It is OK because sending out schema has nothing to do with GPU.
       // most code is copied from Spark
       val arrowSchema = ArrowUtilsShim.toArrowSchema(pythonInSchema, timeZoneId)
       val allocator = ArrowUtils.rootAllocator.newChildAllocator(
@@ -204,13 +252,22 @@ abstract class GpuArrowPythonRunnerBase(
         writer.start()
         // No data to write
         writer.end()
-        // The iterator can grab the semaphore even on an empty batch
-        GpuSemaphore.releaseIfNecessary(TaskContext.get())
       } {
         root.close()
         allocator.close()
-        if (onDataWriteFinished != null) onDataWriteFinished()
       }
     }
   }
 }
+
+object GpuPythonRunnerUtils {
+  def flattenNames(d: DataType, nullable: Boolean = true): Seq[(String, Boolean)] =
+    d match {
+      case s: StructType =>
+        s.flatMap(sf => Seq((sf.name, sf.nullable)) ++ flattenNames(sf.dataType, sf.nullable))
+      case m: MapType =>
+        flattenNames(m.keyType, nullable) ++ flattenNames(m.valueType, nullable)
+      case a: ArrayType => flattenNames(a.elementType, nullable)
+      case _ => Nil
+    }
+}
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuWindowInPandasExecBase.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuWindowInPandasExecBase.scala
index 66c18011a4e..12e2258aaaf 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuWindowInPandasExecBase.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuWindowInPandasExecBase.scala
@@ -22,7 +22,7 @@ import scala.collection.mutable.ArrayBuffer
 import ai.rapids.cudf
 import ai.rapids.cudf.{GroupByAggregation, NullPolicy, OrderByArg}
 import com.nvidia.spark.rapids._
-import com.nvidia.spark.rapids.Arm.withResource
+import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.RapidsPluginImplicits._
 import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
 import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
@@ -499,23 +499,29 @@ trait GpuWindowInPandasExecBase extends ShimUnaryExecNode with GpuPythonExecBase
     // 8) Start processing.
     child.executeColumnar().mapPartitions { inputIter =>
       val context = TaskContext.get()
-      val queue: BatchQueue = new BatchQueue()
-      onTaskCompletion(context)(queue.close())
 
       val boundDataRefs = GpuBindReferences.bindGpuReferences(dataInputs.toSeq, childOutput)
       // Re-batching the input data by GroupingIterator
       val boundPartitionRefs = GpuBindReferences.bindGpuReferences(gpuPartitionSpec, childOutput)
-      val groupedIterator = new GroupingIterator(inputIter, boundPartitionRefs,
-        numInputRows, numInputBatches)
-      val pyInputIterator = groupedIterator.map { batch =>
+      val batchProducer = new BatchProducer(
+        new GroupingIterator(inputIter, boundPartitionRefs, numInputRows, numInputBatches))
+      val queue = new BatchQueue(batchProducer)
+      val pyInputIterator = batchProducer.asIterator.map { case (batch, isForPeek) =>
         // We have to do the project before we add the batch because the batch might be closed
         // when it is added
-        val projectedBatch = GpuProjectExec.project(batch, boundDataRefs)
-        // Compute the window bounds and insert to the head of each row for one batch
-        val inputBatch = withResource(projectedBatch) { projectedCb =>
-          insertWindowBounds(projectedCb)
+        val inputBatch = closeOnExcept(batch) { _ =>
+          withResource(GpuProjectExec.project(batch, boundDataRefs)) { projectedCb =>
+            // Compute the window bounds and insert to the head of each row for one batch
+            insertWindowBounds(projectedCb)
+          }
+        }
+        if (isForPeek) {
+          batch.close()
+        } else {
+          // We only add the batch that is not for peek, because the batch for peek is already
+          // added by the reader when peeking the number of rows of the next batch.
+          queue.add(batch)
         }
-        queue.add(batch)
         inputBatch
       }
 
@@ -534,12 +540,11 @@ trait GpuWindowInPandasExecBase extends ShimUnaryExecNode with GpuPythonExecBase
           pythonRunnerConf,
           /* The whole group data should be written in a single call, so here is unlimited */
           Int.MaxValue,
-          pythonOutputSchema,
-          () => queue.finish())
+          pythonOutputSchema)
 
         val outputIterator = pyRunner.compute(pyInputIterator, context.partitionId(), context)
         new CombiningIterator(queue, outputIterator, pyRunner, numOutputRows,
-          numOutputBatches).map(projectResult(_))
+          numOutputBatches).map(projectResult)
       } else {
         // Empty partition, return the input iterator directly
         inputIter
diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala
index 1151ad55d8f..0dd0274cec8 100644
--- a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala
+++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/PythonUDFShim.scala
@@ -35,7 +35,6 @@
 {"spark": "333"}
 {"spark": "340"}
 {"spark": "341"}
-{"spark": "341db"}
 spark-rapids-shim-json-lines ***/
 package com.nvidia.spark.rapids.shims
 
diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala
index 7757a0c3582..d5e779011c0 100644
--- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala
+++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala
@@ -119,7 +119,7 @@ class GpuCoGroupedArrowPythonRunner(
             GpuSemaphore.releaseIfNecessary(TaskContext.get())
           })
           // Flatten the names of nested struct columns, required by cudf arrow IPC writer.
-          GpuArrowPythonRunner.flattenNames(batchSchema).foreach { case (name, nullable) =>
+          GpuPythonRunnerUtils.flattenNames(batchSchema).foreach { case (name, nullable) =>
             if (nullable) {
               builder.withColumnNames(name)
             } else {
diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
index 681cdd3b11c..e4685e48e06 100644
--- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
+++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
@@ -62,15 +62,15 @@ trait GpuPythonArrowOutput { _: GpuPythonRunnerBase[_] =>
 
   /**
    * Default to `Int.MaxValue` to try to read as many as possible.
-   * Change it by calling `setMinReadTargetBatchSize` before a reading.
+   * Change it by calling `setMinReadTargetNumRows` before a reading.
    */
-  private var minReadTargetBatchSize: Int = Int.MaxValue
+  private var minReadTargetNumRows: Int = Int.MaxValue
 
   /**
    * Update the expected batch size for next reading.
    */
-  private[python] final def setMinReadTargetBatchSize(size: Int): Unit = {
-    minReadTargetBatchSize = size
+  private[python] final def setMinReadTargetNumRows(numRows: Int): Unit = {
+    minReadTargetNumRows = numRows
   }
 
   /** Convert the table received from the Python side to a batch. */
@@ -128,7 +128,7 @@ trait GpuPythonArrowOutput { _: GpuPythonRunnerBase[_] =>
             // The GpuSemaphore is acquired in a callback
             val table =
               withResource(new NvtxRange("read python batch", NvtxColor.DARK_GREEN)) { _ =>
-                arrowReader.getNextIfAvailable(minReadTargetBatchSize)
+                arrowReader.getNextIfAvailable(minReadTargetNumRows)
               }
             if (table == null) {
               batchLoaded = false
@@ -177,10 +177,9 @@ class GpuArrowPythonRunner(
     timeZoneId: String,
     conf: Map[String, String],
     batchSize: Long,
-    pythonOutSchema: StructType = null,
-    onDataWriteFinished: () => Unit = null)
+    pythonOutSchema: StructType = null)
   extends GpuArrowPythonRunnerBase(funcs, evalType, argOffsets, pythonInSchema, timeZoneId,
-    conf, batchSize, pythonOutSchema, onDataWriteFinished) {
+    conf, batchSize, pythonOutSchema) {
 
   protected override def newWriterThread(
       env: SparkEnv,
@@ -197,20 +196,8 @@ class GpuArrowPythonRunner(
       }
 
       protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = {
-        workerImpl.writeInputToStream(dataOut)
+        workerImpl.writeIteratorToStream(dataOut)
       }
     }
   }
 }
-
-object GpuArrowPythonRunner {
-  def flattenNames(d: DataType, nullable: Boolean = true): Seq[(String, Boolean)] =
-    d match {
-      case s: StructType =>
-        s.flatMap(sf => Seq((sf.name, sf.nullable)) ++ flattenNames(sf.dataType, sf.nullable))
-      case m: MapType =>
-        flattenNames(m.keyType, nullable) ++ flattenNames(m.valueType, nullable)
-      case a: ArrayType => flattenNames(a.elementType, nullable)
-      case _ => Nil
-    }
-}
diff --git a/sql-plugin/src/main/spark321db/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala b/sql-plugin/src/main/spark321db/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala
index 86d34414991..3d0d3450320 100644
--- a/sql-plugin/src/main/spark321db/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala
+++ b/sql-plugin/src/main/spark321db/scala/com/nvidia/spark/rapids/shims/GpuWindowInPandasExec.scala
@@ -25,8 +25,7 @@ package com.nvidia.spark.rapids.shims
 import scala.collection.mutable.ArrayBuffer
 
 import com.nvidia.spark.rapids._
-import com.nvidia.spark.rapids.Arm.withResource
-import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
+import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
 
 import org.apache.spark.TaskContext
@@ -34,7 +33,7 @@ import org.apache.spark.api.python.PythonEvalType
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.rapids.execution.python.{BatchQueue, CombiningIterator, GpuPythonHelper, GpuPythonUDF, GpuWindowInPandasExecBase, GroupingIterator}
+import org.apache.spark.sql.rapids.execution.python.{BatchProducer, BatchQueue, CombiningIterator, GpuPythonHelper, GpuWindowInPandasExecBase, GroupingIterator}
 import org.apache.spark.sql.rapids.execution.python.shims.GpuArrowPythonRunner
 import org.apache.spark.sql.rapids.shims.{ArrowUtilsShim, DataTypeUtilsShim}
 import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
@@ -111,9 +110,7 @@ case class GpuWindowInPandasExec(
 
     // 2) Extract window functions, here should be Python (Pandas) UDFs
     val allWindowExpressions = expressionsWithFrameIndex.map(_._1)
-    val udfExpressions = allWindowExpressions.map {
-      case e: GpuWindowExpression => e.windowFunction.asInstanceOf[GpuPythonUDF]
-    }
+    val udfExpressions = PythonUDFShim.getUDFExpressions(allWindowExpressions)
     // We shouldn't be chaining anything here.
     // All chained python functions should only contain one function.
     val (pyFuncs, inputs) = udfExpressions.map(collectFunctions).unzip
@@ -196,23 +193,29 @@ case class GpuWindowInPandasExec(
     // 8) Start processing.
     child.executeColumnar().mapPartitions { inputIter =>
       val context = TaskContext.get()
-      val queue: BatchQueue = new BatchQueue()
-      onTaskCompletion(context)(queue.close())
 
       val boundDataRefs = GpuBindReferences.bindGpuReferences(dataInputs, childOutput)
       // Re-batching the input data by GroupingIterator
       val boundPartitionRefs = GpuBindReferences.bindGpuReferences(gpuPartitionSpec, childOutput)
-      val groupedIterator = new GroupingIterator(inputIter, boundPartitionRefs,
-        numInputRows, numInputBatches)
-      val pyInputIterator = groupedIterator.map { batch =>
+      val batchProducer = new BatchProducer(
+        new GroupingIterator(inputIter, boundPartitionRefs, numInputRows, numInputBatches))
+      val queue = new BatchQueue(batchProducer)
+      val pyInputIterator = batchProducer.asIterator.map { case (batch, isForPeek) =>
         // We have to do the project before we add the batch because the batch might be closed
         // when it is added
-        val projectedBatch = GpuProjectExec.project(batch, boundDataRefs)
-        // Compute the window bounds and insert to the head of each row for one batch
-        val inputBatch = withResource(projectedBatch) { projectedCb =>
-          insertWindowBounds(projectedCb)
+        val inputBatch = closeOnExcept(batch) { _ =>
+          withResource(GpuProjectExec.project(batch, boundDataRefs)) { projectedCb =>
+            // Compute the window bounds and insert to the head of each row for one batch
+            insertWindowBounds(projectedCb)
+          }
+        }
+        if (isForPeek) {
+          batch.close()
+        } else {
+          // We only add the batch that is not for peek, because the batch for peek is already
+          // added by the reader when peeking the number of rows of the next batch.
+          queue.add(batch)
         }
-        queue.add(batch)
         inputBatch
       }
 
@@ -231,12 +234,11 @@ case class GpuWindowInPandasExec(
           pythonRunnerConf,
           /* The whole group data should be written in a single call, so here is unlimited */
           Int.MaxValue,
-          pythonOutputSchema,
-          () => queue.finish())
+          pythonOutputSchema)
 
         val outputIterator = pyRunner.compute(pyInputIterator, context.partitionId(), context)
         new CombiningIterator(queue, outputIterator, pyRunner, numOutputRows,
-          numOutputBatches).map(projectResult(_))
+          numOutputBatches).map(projectResult)
       } else {
         // Empty partition, return the input iterator directly
         inputIter
diff --git a/sql-plugin/src/main/spark321db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala b/sql-plugin/src/main/spark321db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala
index e905e0687cd..7eb2bb74b0e 100644
--- a/sql-plugin/src/main/spark321db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala
+++ b/sql-plugin/src/main/spark321db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala
@@ -34,7 +34,7 @@ import com.nvidia.spark.rapids.Arm.withResource
 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python._
 import org.apache.spark.sql.execution.python.PythonUDFRunner
-import org.apache.spark.sql.rapids.execution.python.{BufferToStreamWriter, GpuPythonRunnerBase}
+import org.apache.spark.sql.rapids.execution.python.{BufferToStreamWriter, GpuPythonRunnerBase, GpuPythonRunnerUtils}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.Utils
@@ -94,7 +94,7 @@ class GpuGroupUDFArrowPythonRunner(
             GpuSemaphore.releaseIfNecessary(TaskContext.get())
           })
           // Flatten the names of nested struct columns, required by cudf Arrow IPC writer.
-          GpuArrowPythonRunner.flattenNames(pythonInSchema).foreach { case (name, nullable) =>
+          GpuPythonRunnerUtils.flattenNames(pythonInSchema).foreach { case (name, nullable) =>
               if (nullable) {
                 builder.withColumnNames(name)
               } else {
diff --git a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
index 6018f5e51b1..36ffc1db926 100644
--- a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
+++ b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/Spark341PlusDBShims.scala
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.plans.physical.SinglePartition
 import org.apache.spark.sql.execution.{CollectLimitExec, GlobalLimitExec, SparkPlan, TakeOrderedAndProjectExec}
 import org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS
 import org.apache.spark.sql.rapids.GpuV1WriteUtils.GpuEmpty2Null
+import org.apache.spark.sql.rapids.execution.python.GpuPythonUDAF
 import org.apache.spark.sql.types.StringType
 
 trait Spark341PlusDBShims extends Spark332PlusDBShims {
@@ -56,7 +57,35 @@ trait Spark341PlusDBShims extends Spark332PlusDBShims {
         (a, conf, p, r) => new UnaryExprMeta[Empty2Null](a, conf, p, r) {
           override def convertToGpu(child: Expression): GpuExpression = GpuEmpty2Null(child)
         }
-      )
+      ),
+      GpuOverrides.expr[PythonUDAF](
+        "UDF run in an external python process. Does not actually run on the GPU, but " +
+          "the transfer of data to/from it can be accelerated",
+        ExprChecks.fullAggAndProject(
+          // Different types of Pandas UDF support different sets of output type. Please refer to
+          //   https://github.com/apache/spark/blob/master/python/pyspark/sql/udf.py#L98
+          // for more details.
+          // It is impossible to specify the exact type signature for each Pandas UDF type in a
+          // single expression 'PythonUDF'.
+          // So use the 'unionOfPandasUdfOut' to cover all types for Spark. The type signature of
+          // the plugin is also a union of all the types of Pandas UDF.
+          (TypeSig.commonCudfTypes + TypeSig.ARRAY).nested() + TypeSig.STRUCT,
+          TypeSig.unionOfPandasUdfOut,
+          repeatingParamCheck = Some(RepeatingParamCheck(
+            "param",
+            (TypeSig.commonCudfTypes + TypeSig.ARRAY + TypeSig.STRUCT).nested(),
+            TypeSig.all))),
+        (a, conf, p, r) => new ExprMeta[PythonUDAF](a, conf, p, r) {
+          override def replaceMessage: String = "not block GPU acceleration"
+
+          override def noReplacementPossibleMessage(reasons: String): String =
+            s"blocks running on GPU because $reasons"
+
+          override def convertToGpu(): GpuExpression =
+            GpuPythonUDAF(a.name, a.func, a.dataType,
+              childExprs.map(_.convertToGpu()),
+              a.evalType, a.udfDeterministic, a.resultId)
+        })
     ).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap
     super.getExprs ++ shimExprs ++ DayTimeIntervalShims.exprs ++ RoundingShims.exprs
   }
diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala
index 9c245cf2636..a8aa799c484 100644
--- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala
+++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala
@@ -31,7 +31,6 @@ import org.apache.spark.sql.execution.python.PythonUDFRunner
 import org.apache.spark.sql.rapids.execution.python._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
-import org.apache.spark.util.Utils
 
 /**
  * Python UDF Runner for cogrouped UDFs, designed for `GpuFlatMapCoGroupsInPandasExec` only.
@@ -40,24 +39,24 @@ import org.apache.spark.util.Utils
  * and receive it back in JVM as batches of single DataFrame.
  */
 class GpuCoGroupedArrowPythonRunner(
-                                     funcs: Seq[ChainedPythonFunctions],
-                                     evalType: Int,
-                                     argOffsets: Array[Array[Int]],
-                                     leftSchema: StructType,
-                                     rightSchema: StructType,
-                                     timeZoneId: String,
-                                     conf: Map[String, String],
-                                     batchSize: Int,
-                                     pythonOutSchema: StructType)
+    funcs: Seq[ChainedPythonFunctions],
+    evalType: Int,
+    argOffsets: Array[Array[Int]],
+    leftSchema: StructType,
+    rightSchema: StructType,
+    timeZoneId: String,
+    conf: Map[String, String],
+    batchSize: Int,
+    pythonOutSchema: StructType)
   extends GpuPythonRunnerBase[(ColumnarBatch, ColumnarBatch)](funcs, evalType, argOffsets)
     with GpuPythonArrowOutput {
 
   protected override def newWriter(
-                                    env: SparkEnv,
-                                    worker: PythonWorker,
-                                    inputIterator: Iterator[(ColumnarBatch, ColumnarBatch)],
-                                    partitionIndex: Int,
-                                    context: TaskContext): Writer = {
+      env: SparkEnv,
+      worker: PythonWorker,
+      inputIterator: Iterator[(ColumnarBatch, ColumnarBatch)],
+      partitionIndex: Int,
+      context: TaskContext): Writer = {
     new Writer(env, worker, inputIterator, partitionIndex, context) {
 
       protected override def writeCommand(dataOut: DataOutputStream): Unit = {
@@ -75,24 +74,24 @@ class GpuCoGroupedArrowPythonRunner(
       override def writeNextInputToStream(dataOut: DataOutputStream): Boolean = {
         // For each we first send the number of dataframes in each group then send
         // first df, then send second df.  End of data is marked by sending 0.
-        var wrote = false
-        while (inputIterator.hasNext) {
-          wrote = false
+        if (inputIterator.hasNext) {
           dataOut.writeInt(2)
           val (leftGroupBatch, rightGroupBatch) = inputIterator.next()
           withResource(Seq(leftGroupBatch, rightGroupBatch)) { _ =>
-            wrote = writeGroupBatch(leftGroupBatch, leftSchema, dataOut)
-            wrote = writeGroupBatch(rightGroupBatch, rightSchema, dataOut)
+            writeGroupBatch(leftGroupBatch, leftSchema, dataOut)
+            writeGroupBatch(rightGroupBatch, rightSchema, dataOut)
           }
+          true
+        } else {
+          // The iterator can grab the semaphore even on an empty batch
+          GpuSemaphore.releaseIfNecessary(TaskContext.get())
+          dataOut.writeInt(0)
+          false
         }
-        // The iterator can grab the semaphore even on an empty batch
-        GpuSemaphore.releaseIfNecessary(TaskContext.get())
-        dataOut.writeInt(0)
-        wrote
       }
 
       private def writeGroupBatch(groupBatch: ColumnarBatch, batchSchema: StructType,
-                                  dataOut: DataOutputStream): Boolean = {
+          dataOut: DataOutputStream): Unit = {
         val writer = {
           val builder = ArrowIPCWriterOptions.builder()
           builder.withMaxChunkSize(batchSize)
@@ -101,7 +100,7 @@ class GpuCoGroupedArrowPythonRunner(
             GpuSemaphore.releaseIfNecessary(TaskContext.get())
           })
           // Flatten the names of nested struct columns, required by cudf arrow IPC writer.
-          GpuArrowPythonRunner.flattenNames(batchSchema).foreach { case (name, nullable) =>
+          GpuPythonRunnerUtils.flattenNames(batchSchema).foreach { case (name, nullable) =>
             if (nullable) {
               builder.withColumnNames(name)
             } else {
@@ -110,18 +109,20 @@ class GpuCoGroupedArrowPythonRunner(
           }
           Table.writeArrowIPCChunked(builder.build(), new BufferToStreamWriter(dataOut))
         }
-        var wrote = false
-        Utils.tryWithSafeFinally {
+        try {
           withResource(new NvtxRange("write python batch", NvtxColor.DARK_GREEN)) { _ =>
             // The callback will handle closing table and releasing the semaphore
             writer.write(GpuColumnVector.from(groupBatch))
-            wrote = true
           }
-        } {
+        } catch {
+          case t: Throwable =>
+            // release the semaphore in case of exception in the middle of writing a batch
+            GpuSemaphore.releaseIfNecessary(TaskContext.get())
+            throw t
+        } finally {
           writer.close()
           dataOut.flush()
         }
-        wrote
       } // end of writeGroup
     }
   } // end of newWriterThread
diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala
index c1aea19a194..a9c04808879 100644
--- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala
+++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala
@@ -34,7 +34,6 @@ import org.apache.spark.sql.execution.python.PythonUDFRunner
 import org.apache.spark.sql.rapids.execution.python._
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.vectorized.ColumnarBatch
-import org.apache.spark.util.Utils
 
 /**
  * Group Map UDF specific serializer for Databricks because they have a special GroupUDFSerializer.
@@ -82,25 +81,23 @@ class GpuGroupUDFArrowPythonRunner(
       }
 
       override def writeNextInputToStream(dataOut: DataOutputStream): Boolean = {
-        var wrote = false
         // write out number of columns
-        Utils.tryWithSafeFinally {
-          val builder = ArrowIPCWriterOptions.builder()
-          builder.withMaxChunkSize(batchSize)
-          builder.withCallback((table: Table) => {
-            table.close()
-            GpuSemaphore.releaseIfNecessary(TaskContext.get())
-          })
-          // Flatten the names of nested struct columns, required by cudf Arrow IPC writer.
-          GpuArrowPythonRunner.flattenNames(pythonInSchema).foreach { case (name, nullable) =>
+        try {
+          if (inputIterator.hasNext) {
+            val builder = ArrowIPCWriterOptions.builder()
+            builder.withMaxChunkSize(batchSize)
+            builder.withCallback((table: Table) => {
+              table.close()
+              GpuSemaphore.releaseIfNecessary(TaskContext.get())
+            })
+            // Flatten the names of nested struct columns, required by cudf Arrow IPC writer.
+            GpuPythonRunnerUtils.flattenNames(pythonInSchema).foreach { case (name, nullable) =>
               if (nullable) {
                 builder.withColumnNames(name)
               } else {
                 builder.withNotNullableColumnNames(name)
               }
-          }
-          while(inputIterator.hasNext) {
-            wrote = false
+            }
             val writer = {
               // write 1 out to indicate there is more to read
               dataOut.writeInt(1)
@@ -112,20 +109,24 @@ class GpuGroupUDFArrowPythonRunner(
             withResource(new NvtxRange("write python batch", NvtxColor.DARK_GREEN)) { _ =>
               // The callback will handle closing table and releasing the semaphore
               writer.write(table)
-              wrote = true
             }
             writer.close()
             dataOut.flush()
+            true
+          } else {
+            // The iterator can grab the semaphore even on an empty batch
+            GpuSemaphore.releaseIfNecessary(TaskContext.get())
+            // tell serializer we are done
+            dataOut.writeInt(0)
+            dataOut.flush()
+            false
           }
-          // indicate not to read more
-          // The iterator can grab the semaphore even on an empty batch
-          GpuSemaphore.releaseIfNecessary(TaskContext.get())
-        } {
-          // tell serializer we are done
-          dataOut.writeInt(0)
-          dataOut.flush()
+        } catch {
+          case t: Throwable =>
+            // release the semaphore in case of exception in the middle of writing a batch
+            GpuSemaphore.releaseIfNecessary(TaskContext.get())
+            throw t
         }
-        wrote
       }
     }
   }
diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
index 35fe8979d94..ab11083561a 100644
--- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
+++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuPythonArrowShims.scala
@@ -40,15 +40,15 @@ trait GpuPythonArrowOutput { _: GpuPythonRunnerBase[_] =>
 
   /**
    * Default to `Int.MaxValue` to try to read as many as possible.
-   * Change it by calling `setMinReadTargetBatchSize` before a reading.
+   * Change it by calling `setMinReadTargetNumRows` before a reading.
    */
-  private var minReadTargetBatchSize: Int = Int.MaxValue
+  private var minReadTargetNumRows: Int = Int.MaxValue
 
   /**
    * Update the expected batch size for next reading.
    */
-  private[python] final def setMinReadTargetBatchSize(size: Int): Unit = {
-    minReadTargetBatchSize = size
+  private[python] final def setMinReadTargetNumRows(numRows: Int): Unit = {
+    minReadTargetNumRows = numRows
   }
 
   /** Convert the table received from the Python side to a batch. */
@@ -106,7 +106,7 @@ trait GpuPythonArrowOutput { _: GpuPythonRunnerBase[_] =>
             // The GpuSemaphore is acquired in a callback
             val table =
               withResource(new NvtxRange("read python batch", NvtxColor.DARK_GREEN)) { _ =>
-                arrowReader.getNextIfAvailable(minReadTargetBatchSize)
+                arrowReader.getNextIfAvailable(minReadTargetNumRows)
               }
             if (table == null) {
               batchLoaded = false
@@ -155,10 +155,9 @@ class GpuArrowPythonRunner(
     timeZoneId: String,
     conf: Map[String, String],
     batchSize: Long,
-    pythonOutSchema: StructType = null,
-    onDataWriteFinished: () => Unit = null)
+    pythonOutSchema: StructType = null)
   extends GpuArrowPythonRunnerBase(funcs, evalType, argOffsets, pythonInSchema, timeZoneId,
-    conf, batchSize, pythonOutSchema, onDataWriteFinished) {
+    conf, batchSize, pythonOutSchema) {
 
   protected override def newWriter(
       env: SparkEnv,
@@ -175,21 +174,8 @@ class GpuArrowPythonRunner(
       }
 
       override def writeNextInputToStream(dataOut: DataOutputStream): Boolean = {
-        workerImpl.writeInputToStream(dataOut)
+        workerImpl.writeNextInputToStream(dataOut)
       }
     }
   }
 }
-
-
-object GpuArrowPythonRunner {
-  def flattenNames(d: DataType, nullable: Boolean = true): Seq[(String, Boolean)] =
-    d match {
-      case s: StructType =>
-        s.flatMap(sf => Seq((sf.name, sf.nullable)) ++ flattenNames(sf.dataType, sf.nullable))
-      case m: MapType =>
-        flattenNames(m.keyType, nullable) ++ flattenNames(m.valueType, nullable)
-      case a: ArrayType => flattenNames(a.elementType, nullable)
-      case _ => Nil
-    }
-}

From b92a0b446533392f820f2017d709160b9428c451 Mon Sep 17 00:00:00 2001
From: Peixin <pxli@nyu.edu>
Date: Thu, 30 Nov 2023 10:13:41 +0800
Subject: [PATCH 07/15] Update nightly build and deploy script for arm
 artifacts [skip ci] (#9888)

* Refactor deploy to support build and deploy arm64 artifacts

Signed-off-by: Peixin Li <pxli@nyu.edu>

* test only

* reset test code and update

* address comment

---------

Signed-off-by: Peixin Li <pxli@nyu.edu>
---
 jenkins/deploy.sh              |  9 +++++----
 jenkins/spark-nightly-build.sh | 29 ++++++++++++++++++++++-------
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/jenkins/deploy.sh b/jenkins/deploy.sh
index a85c8618a97..16428e121dc 100755
--- a/jenkins/deploy.sh
+++ b/jenkins/deploy.sh
@@ -30,6 +30,7 @@
 #   POM_FILE:       Project pom file to be deployed
 #   OUT_PATH:       The path where jar files are
 #   CUDA_CLASSIFIERS:    Comma separated classifiers, e.g., "cuda11,cuda12"
+#   CLASSIFIERS:    Comma separated classifiers, e.g., "cuda11,cuda12,cuda11-arm64,cuda12-arm64"
 ###
 
 set -ex
@@ -48,6 +49,7 @@ ART_GROUP_ID=$(mvnEval $DIST_PL project.groupId)
 ART_VER=$(mvnEval $DIST_PL project.version)
 DEFAULT_CUDA_CLASSIFIER=$(mvnEval $DIST_PL cuda.version)
 CUDA_CLASSIFIERS=${CUDA_CLASSIFIERS:-"$DEFAULT_CUDA_CLASSIFIER"}
+CLASSIFIERS=${CLASSIFIERS:-"$CUDA_CLASSIFIERS"} # default as CUDA_CLASSIFIERS for compatibility
 
 SQL_PL=${SQL_PL:-"sql-plugin"}
 POM_FILE=${POM_FILE:-"$DIST_PL/target/parallel-world/META-INF/maven/${ART_GROUP_ID}/${ART_ID}/pom.xml"}
@@ -57,9 +59,8 @@ SIGN_TOOL=${SIGN_TOOL:-"gpg"}
 FPATH="$OUT_PATH/$ART_ID-$ART_VER"
 DEPLOY_TYPES=''
 DEPLOY_FILES=''
-IFS=',' read -a CUDA_CLASSIFIERS_ARR <<< "$CUDA_CLASSIFIERS"
-DEPLOY_TYPES=$(echo $CUDA_CLASSIFIERS | sed -e 's;[^,]*;jar;g')
-DEPLOY_FILES=$(echo $CUDA_CLASSIFIERS | sed -e "s;\([^,]*\);${FPATH}-\1.jar;g")
+DEPLOY_TYPES=$(echo $CLASSIFIERS | sed -e 's;[^,]*;jar;g')
+DEPLOY_FILES=$(echo $CLASSIFIERS | sed -e "s;\([^,]*\);${FPATH}-\1.jar;g")
 
 # dist does not have javadoc and sources jars, use 'sql-plugin' instead
 source jenkins/version-def.sh >/dev/null 2&>1
@@ -103,4 +104,4 @@ $DEPLOY_CMD -DpomFile=$POM_FILE \
             -Djavadoc=$FPATH-javadoc.jar \
             -Dfiles=$DEPLOY_FILES \
             -Dtypes=$DEPLOY_TYPES \
-            -Dclassifiers=$CUDA_CLASSIFIERS
+            -Dclassifiers=$CLASSIFIERS
diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh
index 7f391db6957..5d331686659 100755
--- a/jenkins/spark-nightly-build.sh
+++ b/jenkins/spark-nightly-build.sh
@@ -42,7 +42,8 @@ ART_GROUP_ID=$(mvnEval project.groupId)
 ART_VER=$(mvnEval project.version)
 DEFAULT_CUDA_CLASSIFIER=${DEFAULT_CUDA_CLASSIFIER:-$(mvnEval cuda.version)} # default cuda version
 CUDA_CLASSIFIERS=${CUDA_CLASSIFIERS:-"$DEFAULT_CUDA_CLASSIFIER"} # e.g. cuda11,cuda12
-IFS=',' read -a CUDA_CLASSIFIERS_ARR <<< "$CUDA_CLASSIFIERS"
+CLASSIFIERS=${CLASSIFIERS:-"$CUDA_CLASSIFIERS"}  # default as CUDA_CLASSIFIERS for compatibility
+IFS=',' read -a CLASSIFIERS_ARR <<< "$CLASSIFIERS"
 TMP_PATH="/tmp/$(date '+%Y-%m-%d')-$$"
 
 DIST_FPATH="$DIST_PL/target/$ART_ID-$ART_VER-$DEFAULT_CUDA_CLASSIFIER"
@@ -72,7 +73,7 @@ function distWithReducedPom {
 
         deploy)
             mvnCmd="deploy:deploy-file"
-            if (( ${#CUDA_CLASSIFIERS_ARR[@]} > 1 )); then
+            if (( ${#CLASSIFIERS_ARR[@]} > 1 )); then
               # try move tmp artifacts back to target folder for simplifying separate release process
               mv ${TMP_PATH}/${ART_ID}-${ART_VER}-*.jar ${DIST_PL}/target/
             fi
@@ -102,6 +103,11 @@ function distWithReducedPom {
 
 # option to skip unit tests. Used in our CI to separate test runs in parallel stages
 SKIP_TESTS=${SKIP_TESTS:-"false"}
+if [[ "${SKIP_TESTS}" == "true" ]]; then
+  # when tests are skipped, speed up the build by letting Maven use one thread per core (-T1C)
+  MVN="${MVN} -T1C"
+fi
+
 set +H # turn off history expansion
 DEPLOY_SUBMODULES=${DEPLOY_SUBMODULES:-"!${DIST_PL}"} # TODO: deploy only required submodules to save time
 for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do
@@ -129,25 +135,34 @@ for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do
 done
 
 installDistArtifact() {
-  local cuda_classifier="$1"
+  local cuda_version="$1"
+  local opt="$2"
   $MVN -B clean install \
+      $opt \
       $DIST_PROFILE_OPT \
       -Dbuildver=$SPARK_BASE_SHIM_VERSION \
       $MVN_URM_MIRROR \
       -Dmaven.repo.local=$M2DIR \
-      -Dcuda.version=$cuda_classifier \
+      -Dcuda.version=$cuda_version \
       -DskipTests=$SKIP_TESTS
 }
 
 # build extra cuda classifiers
-if (( ${#CUDA_CLASSIFIERS_ARR[@]} > 1 )); then
+if (( ${#CLASSIFIERS_ARR[@]} > 1 )); then
   mkdir -p ${TMP_PATH}
-  for classifier in "${CUDA_CLASSIFIERS_ARR[@]}"; do
+  for classifier in "${CLASSIFIERS_ARR[@]}"; do
     if [ "${classifier}" == "${DEFAULT_CUDA_CLASSIFIER}" ]; then
       echo "skip default: ${DEFAULT_CUDA_CLASSIFIER} in build extra cuda classifiers step..."
       continue
     fi
-    installDistArtifact ${classifier}
+
+    opt=""
+    if [[ "${classifier}" == *"-arm64" ]]; then
+      opt="-Parm64"
+    fi
+    # pass cuda version and extra opt
+    installDistArtifact ${classifier%%-*} ${opt}
+
     # move artifacts to temp for deployment later
     artifactFile="${ART_ID}-${ART_VER}-${classifier}.jar"
     mv ${DIST_PL}/target/${artifactFile} ${TMP_PATH}/

From 7c653bf44b24da4e3a16dc8a124c6dd06bc0ece2 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 30 Nov 2023 11:57:43 +0800
Subject: [PATCH 08/15] Fix test_cast_string_ts_valid_format test (#9889)

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 integration_tests/src/main/python/cast_test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py
index 61dad6412e1..beb3111383b 100644
--- a/integration_tests/src/main/python/cast_test.py
+++ b/integration_tests/src/main/python/cast_test.py
@@ -146,10 +146,9 @@ def test_cast_string_date_non_ansi():
         lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())),
         conf={'spark.rapids.sql.hasExtendedYearValues': 'false'})
 
-@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9708')
 @pytest.mark.parametrize('data_gen', [StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}'),
                                       StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'),
-                                      StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9].[0-9]{0,6}Z?')],
+                                      StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')],
                         ids=idfn)
 @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_cast_string_ts_valid_format(data_gen):

From b3f1e4744bcee969505cf079a79cd1c308527a97 Mon Sep 17 00:00:00 2001
From: Raza Jafri <razajafri@users.noreply.github.com>
Date: Thu, 30 Nov 2023 06:25:21 +0100
Subject: [PATCH 09/15] Revert "Remove Databricks 13.3 from release 23.12
 [databricks] (#9890)" [databricks] (#9900)

* Revert "Remove Databricks 13.3 from release 23.12 [databricks] (#9890)"

This reverts commit c59b0a2c0712b42fb4c0d169608e1a967e7b9147.

* Signing off

Signed-off-by: Raza Jafri <rjafri@nvidia.com>

---------

Signed-off-by: Raza Jafri <rjafri@nvidia.com>
---
 jenkins/Jenkinsfile-blossom.premerge-databricks | 2 +-
 jenkins/databricks/build.sh                     | 6 ------
 pom.xml                                         | 3 ++-
 scala2.13/pom.xml                               | 3 ++-
 4 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/jenkins/Jenkinsfile-blossom.premerge-databricks b/jenkins/Jenkinsfile-blossom.premerge-databricks
index 86fff7f23be..2fd2df7a8b0 100644
--- a/jenkins/Jenkinsfile-blossom.premerge-databricks
+++ b/jenkins/Jenkinsfile-blossom.premerge-databricks
@@ -88,7 +88,7 @@ pipeline {
                         // 'name' and 'value' only supprt literal string in the declarative Jenkins
                         // Refer to Jenkins issue https://issues.jenkins.io/browse/JENKINS-62127
                         name 'DB_RUNTIME'
-                        values '10.4', '11.3', '12.2'
+                        values '10.4', '11.3', '12.2', '13.3'
                     }
                 }
                 stages {
diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh
index a68b272257b..8a0b25a0c95 100755
--- a/jenkins/databricks/build.sh
+++ b/jenkins/databricks/build.sh
@@ -144,12 +144,6 @@ if [[ "$WITH_BLOOP" == "1" ]]; then
     MVN_OPT="ch.epfl.scala:bloop-maven-plugin:bloopInstall $MVN_OPT"
 fi
 
-# Disabling build for 341db until 24.02
-if [[ "$BUILDVER" == "341db" ]]; then
-    echo "Databricks 341 is not supported as of release 23.12\n"
-    exit 1
-fi 
-
 # Build the RAPIDS plugin by running package command for databricks
 $MVN_CMD -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests $MVN_OPT
 
diff --git a/pom.xml b/pom.xml
index d215f1c102b..744e9dd2985 100644
--- a/pom.xml
+++ b/pom.xml
@@ -771,7 +771,8 @@
         <databricks.buildvers>
             321db,
             330db,
-            332db
+            332db,
+            341db
         </databricks.buildvers>
         <!--
           Build and run unit tests on one specific version for each sub-version (e.g. 311, 320, 330)
diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml
index 0206ced352b..7b09cf61707 100644
--- a/scala2.13/pom.xml
+++ b/scala2.13/pom.xml
@@ -771,7 +771,8 @@
         <databricks.buildvers>
             321db,
             330db,
-            332db
+            332db,
+            341db
         </databricks.buildvers>
         <!--
           Build and run unit tests on one specific version for each sub-version (e.g. 311, 320, 330)

From 55200f0d1c5b7aef631f8af3c791a40b24b05e76 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 30 Nov 2023 12:54:29 -0800
Subject: [PATCH 10/15] Change test

---
 integration_tests/src/main/python/cmp_test.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py
index a891b667016..cebd8c3e4ac 100644
--- a/integration_tests/src/main/python/cmp_test.py
+++ b/integration_tests/src/main/python/cmp_test.py
@@ -346,10 +346,14 @@ def test_in(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
             lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars)))
 
+# We avoid testing with NaN since inset in Spark has issue with NaN comparision.
+# See https://github.com/NVIDIA/spark-rapids/issues/9687.
+eq_gens_with_decimal_gen_no_nans = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \
+                                   [FloatGen(no_nans=True), DoubleGen(no_nans=True)]
+
 # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf
 # This is to test entries over that value.
-@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687')
-@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn)
+@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen_no_nans, ids=idfn)
 @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_in_set(data_gen):
     # nulls are not supported for in on the GPU yet

From eb68a994a5ad8c2a8f59e68cf54d178a51a547e5 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 30 Nov 2023 14:16:50 -0800
Subject: [PATCH 11/15] Test with NaN from Spark 3.2

Signed-off-by: Nghia Truong <nghiat@nvidia.com>
---
 integration_tests/src/main/python/cmp_test.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py
index cebd8c3e4ac..29e35fdff74 100644
--- a/integration_tests/src/main/python/cmp_test.py
+++ b/integration_tests/src/main/python/cmp_test.py
@@ -17,7 +17,7 @@
 from asserts import assert_gpu_and_cpu_are_equal_collect
 from conftest import is_not_utc
 from data_gen import *
-from spark_session import with_cpu_session, is_before_spark_330
+from spark_session import with_cpu_session, is_before_spark_320, is_before_spark_330
 from pyspark.sql.types import *
 from marks import datagen_overrides
 import pyspark.sql.functions as f
@@ -346,15 +346,16 @@ def test_in(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
             lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars)))
 
-# We avoid testing with NaN since inset in Spark has issue with NaN comparision.
+# We avoid testing inset with NaN in Spark < 3.2.0 since it has issue with NaN comparision.
 # See https://github.com/NVIDIA/spark-rapids/issues/9687.
-eq_gens_with_decimal_gen_no_nans = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \
-                                   [FloatGen(no_nans=True), DoubleGen(no_nans=True)]
+test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \
+                                   [FloatGen(no_nans=True), DoubleGen(no_nans=True)] \
+                      if is_before_spark_320() else eq_gens_with_decimal_gen
 
 # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf
 # This is to test entries over that value.
-@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen_no_nans, ids=idfn)
 @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
+@pytest.mark.parametrize('data_gen', test_inset_data_gen, ids=idfn)
 def test_in_set(data_gen):
     # nulls are not supported for in on the GPU yet
     num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1

From 703c6e8637b98713380671a700cba0a3721c4d04 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 30 Nov 2023 14:40:48 -0800
Subject: [PATCH 12/15] Add docs

Signed-off-by: Nghia Truong <nghiat@nvidia.com>
---
 docs/compatibility.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/compatibility.md b/docs/compatibility.md
index 9d411f56d50..4de5da8993c 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -83,6 +83,19 @@ after Spark 3.1.0.
 We do not disable operations that produce different results due to `-0.0` in the data because it is
 considered to be a rare occurrence.
 
+### `NaN` vs `NaN`
+
+Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, All `NaN` are 
+considered as one unique value while other times they can be treated as different. The outcome of 
+`NaN` comparision can differ in various operations and also changes over time. Due to such complexity,
+our plugin cannot guarantee to always match its output with Apache Spark if there are `NaN` values 
+in the input.
+
+For example, the `inset` operator can treat `NaN` as different values in Spark 3.0.2 and 3.1.2 but 
+considers them as the same in Spark 3.0.4, 3.1.3, and from Spark 3.2.0 (see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)). On the other hand, our plugin always compares all `NaN` as equal 
+value.
+
+
 ## Decimal Support
 
 Apache Spark supports decimal values with a precision up to 38. This equates to 128-bits.

From b5cbe55e71c3842a431da7076496188d7bd6165d Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 30 Nov 2023 14:48:17 -0800
Subject: [PATCH 13/15] Fix docs

Signed-off-by: Nghia Truong <nghiat@nvidia.com>
---
 docs/compatibility.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/compatibility.md b/docs/compatibility.md
index 4de5da8993c..05bb7ee00a2 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -85,15 +85,16 @@ considered to be a rare occurrence.
 
 ### `NaN` vs `NaN`
 
-Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, All `NaN` are 
-considered as one unique value while other times they can be treated as different. The outcome of 
+Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, All `NaN` are
+considered as one unique value while other times they can be treated as different. The outcome of
 `NaN` comparision can differ in various operations and also changes over time. Due to such complexity,
-our plugin cannot guarantee to always match its output with Apache Spark if there are `NaN` values 
+our plugin cannot guarantee to always match its output with Apache Spark if there are `NaN` values
 in the input.
 
-For example, the `inset` operator can treat `NaN` as different values in Spark 3.0.2 and 3.1.2 but 
-considers them as the same in Spark 3.0.4, 3.1.3, and from Spark 3.2.0 (see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)). On the other hand, our plugin always compares all `NaN` as equal 
-value.
+For example, the `inset` operator can treat `NaN` as different values in Spark 3.0.2 and 3.1.2 but
+considers them as the same in Spark 3.0.4, 3.1.3, and from Spark 3.2.0
+(see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)).
+On the other hand, our plugin always compares all `NaN` as equal value.
 
 
 ## Decimal Support

From cbd6112758567dd860ff375396df68e56c7c4a35 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 30 Nov 2023 14:49:21 -0800
Subject: [PATCH 14/15] Fix typo

Signed-off-by: Nghia Truong <nghiat@nvidia.com>
---
 integration_tests/src/main/python/cmp_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py
index 29e35fdff74..0217bf0530f 100644
--- a/integration_tests/src/main/python/cmp_test.py
+++ b/integration_tests/src/main/python/cmp_test.py
@@ -346,7 +346,7 @@ def test_in(data_gen):
     assert_gpu_and_cpu_are_equal_collect(
             lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars)))
 
-# We avoid testing inset with NaN in Spark < 3.2.0 since it has issue with NaN comparision.
+# We avoid testing inset with NaN in Spark < 3.2.0 since it has issues with NaN comparisons.
 # See https://github.com/NVIDIA/spark-rapids/issues/9687.
 test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \
                                    [FloatGen(no_nans=True), DoubleGen(no_nans=True)] \

From a628087d79bae8d8ab422ed875b792fe9c83ce2f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Fri, 1 Dec 2023 14:11:53 -0800
Subject: [PATCH 15/15] Change docs

Signed-off-by: Nghia Truong <nghiat@nvidia.com>
---
 docs/compatibility.md | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/docs/compatibility.md b/docs/compatibility.md
index 05bb7ee00a2..8043aa12d38 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -85,16 +85,13 @@ considered to be a rare occurrence.
 
 ### `NaN` vs `NaN`
 
-Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, All `NaN` are
+Apache Spark does not have a consistent way to handle `NaN` comparison. Sometimes, all `NaN` are
 considered as one unique value while other times they can be treated as different. The outcome of
-`NaN` comparision can differ in various operations and also changes over time. Due to such complexity,
-our plugin cannot guarantee to always match its output with Apache Spark if there are `NaN` values
-in the input.
-
-For example, the `inset` operator can treat `NaN` as different values in Spark 3.0.2 and 3.1.2 but
-considers them as the same in Spark 3.0.4, 3.1.3, and from Spark 3.2.0
-(see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)).
-On the other hand, our plugin always compares all `NaN` as equal value.
+`NaN` comparison can differ between operations and has also changed between Spark versions.
+Our plugin tries to match its output with Apache Spark except for the operations listed below:
+ - `IN` SQL expression: `NaN` can be treated as different values in Spark 3.1.2 but not from
+Spark 3.1.3 onward (see the [issue](https://issues.apache.org/jira/browse/SPARK-36792)).
+On the other hand, our plugin always treats all `NaN` values as equal for this operation.
 
 
 ## Decimal Support