diff --git a/README.md b/README.md index 46d846aa9c2..d669be64ca9 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency. com.nvidia rapids-4-spark_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index 4fa4827ac52..f3bdf9d256b 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT aggregator diff --git a/api_validation/pom.xml b/api_validation/pom.xml index 34c2404c3c0..d85b6a68146 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT api_validation diff --git a/datagen/README.md b/datagen/README.md index f374e4da9f2..5fc3aa06de3 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 23.12.0 release would be -`target/datagen_2.12-23.12.0-spark330.jar` +for example a Spark 3.3.0 jar for the 24.02.0 release would be +`target/datagen_2.12-24.02.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-23.12.0-spark330.jar +spark-shell --jars target/datagen_2.12-24.02.0-spark330.jar ``` After that you should be good to go. 
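As context for the dependency and jar-name changes above, here is a minimal sketch of attaching the renamed 24.02.0-SNAPSHOT plugin artifact to a PySpark session. It mirrors the `spark-shell` example that `docs/configs.md` updates later in this diff; the jar path and the `concurrentGpuTasks` value are illustrative assumptions, not requirements.

```python
# Minimal sketch: start a PySpark session with the 24.02.0-SNAPSHOT plugin jar.
# The jar location is an assumption; point it at wherever the artifact was built or
# downloaded. The spark.plugins and spark.rapids.* settings come from docs/configs.md.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("rapids-24.02-smoke-test")
         .config("spark.jars", "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar")
         .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
         .config("spark.rapids.sql.concurrentGpuTasks", "2")
         .getOrCreate())

# A trivial query to confirm the session starts with the plugin configured.
spark.range(10).selectExpr("sum(id)").show()
```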
diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index dc55ffd393a..fd2af1decbb 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-23.12.0-SNAPSHOT-spark332.jar \ +./target/datagen_2.12-24.02.0-SNAPSHOT-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index d22f874bb04..6a6129ac603 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.12 Data Generator Tools for generating large amounts of data - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT datagen diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml index 5cb0e2e2e4e..8a06a26a69c 100644 --- a/delta-lake/delta-20x/pom.xml +++ b/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml index 5b4e1225722..547cb52a9f8 100644 --- a/delta-lake/delta-21x/pom.xml +++ b/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml index 0b6d2175f2f..f52c3ab0f7c 100644 --- a/delta-lake/delta-22x/pom.xml +++ b/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml index 9b8cb489cb6..02372462348 100644 --- a/delta-lake/delta-23x/pom.xml +++ b/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml index 93f625397bf..b793ec7c393 100644 --- a/delta-lake/delta-24x/pom.xml +++ b/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 
23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-24x diff --git a/delta-lake/delta-spark321db/pom.xml b/delta-lake/delta-spark321db/pom.xml index 95f9146f51a..1514904d03f 100644 --- a/delta-lake/delta-spark321db/pom.xml +++ b/delta-lake/delta-spark321db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-spark321db_2.12 RAPIDS Accelerator for Apache Spark Databricks 10.4 Delta Lake Support Databricks 10.4 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-spark321db diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index c8ed34bd539..eef0b56b0ae 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-spark330db_2.12 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml index 1d81d63aa94..34335400b43 100644 --- a/delta-lake/delta-spark332db/pom.xml +++ b/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-spark332db_2.12 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml index 64e920eb8f1..11291f9ad21 100644 --- a/delta-lake/delta-spark341db/pom.xml +++ b/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-spark341db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT false diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml index c58eb185cfc..ac77668efda 100644 --- a/delta-lake/delta-stub/pom.xml +++ b/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.12 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/dist/pom.xml b/dist/pom.xml index 6fbc047ac47..01a60af8096 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.12 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT com.nvidia diff --git a/docs/compatibility.md b/docs/compatibility.md index 9d411f56d50..8043aa12d38 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -83,6 +83,17 @@ after Spark 3.1.0. 
We do not disable operations that produce different results due to `-0.0` in the data because it is considered to be a rare occurrence. +### `NaN` vs `NaN` + +Apache Spark does not handle `NaN` comparisons consistently. Sometimes all `NaN` values are +treated as a single value, while at other times they are treated as distinct values. The outcome of +a `NaN` comparison can differ between operations and has also changed between Spark versions. +Our plugin tries to match Apache Spark's output, except for the operations listed below: + - `IN` SQL expression: Spark 3.1.2 can treat `NaN` values as distinct, while Spark 3.1.3 and later +do not (see [SPARK-36792](https://issues.apache.org/jira/browse/SPARK-36792)). +Our plugin always treats all `NaN` values as equal for this operation. + + ## Decimal Support Apache Spark supports decimal values with a precision up to 38. This equates to 128-bits. diff --git a/docs/configs.md b/docs/configs.md index 5a467ea9fa0..58d4e28d79d 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports. On startup use: `--conf [conf key]=[conf value]`. For example: ``` -${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar \ +${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.concurrentGpuTasks=2 ``` diff --git a/docs/dev/shims.md b/docs/dev/shims.md index cca778382b8..e214d07862d 100644 --- a/docs/dev/shims.md +++ b/docs/dev/shims.md @@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi Spark 3.0.2's URLs: ```text -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/ -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark302/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark3xx-common/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark302/ ``` Spark 3.2.0's URLs : ```text -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/ -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark320/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark3xx-common/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark320/ ``` ### Late Inheritance in Public Classes diff --git a/integration_tests/README.md b/integration_tests/README.md index af203f44ad9..11687baa2d8 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -254,7 +254,7 @@ individually, so you don't risk running unit tests along with the integration te http://www.scalatest.org/user_guide/using_the_scalatest_shell ```shell -spark-shell --jars rapids-4-spark-tests_2.12-23.12.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-23.12.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar +spark-shell --jars rapids-4-spark-tests_2.12-24.02.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-24.02.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar ``` First you import the `scalatest_shell` and tell the tests where they can find the test files you @@ -277,7 +277,7 @@ If you just want to verify the SQL replacement is working you will need to add t assumes CUDA 11.0
is being used and the Spark distribution is built with Scala 2.12. ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" ./runtests.py +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar" ./runtests.py ``` You don't have to enable the plugin for this to work, the test framework will do that for you. @@ -376,7 +376,7 @@ To run cudf_udf tests, need following configuration changes: As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0: ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-23.12.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-24.02.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf ``` ### Enabling fuzz tests diff --git a/integration_tests/ScaleTest.md b/integration_tests/ScaleTest.md index 774fb906cf6..3cf4b3a25d9 100644 --- a/integration_tests/ScaleTest.md +++ b/integration_tests/ScaleTest.md @@ -97,7 +97,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ --class com.nvidia.spark.rapids.tests.scaletest.ScaleTest \ -./target/rapids-4-spark-integration-tests_2.12-23.12.0-SNAPSHOT-spark332.jar \ +./target/rapids-4-spark-integration-tests_2.12-24.02.0-SNAPSHOT-spark332.jar \ 10 \ 100 \ parquet \ diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index e5484e0fd49..21432f5161b 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT integration_tests diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index 61dad6412e1..beb3111383b 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -146,10 +146,9 @@ def test_cast_string_date_non_ansi(): lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) -@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9708') @pytest.mark.parametrize('data_gen', [StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}'), StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), - StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9].[0-9]{0,6}Z?')], + StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')], ids=idfn) @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_string_ts_valid_format(data_gen): diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index a891b667016..0217bf0530f 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -17,7 +17,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect from conftest import is_not_utc from data_gen import * -from spark_session import with_cpu_session, is_before_spark_330 +from spark_session import with_cpu_session, is_before_spark_320, is_before_spark_330 from pyspark.sql.types import * from marks import datagen_overrides import pyspark.sql.functions as f @@ -346,11 +346,16 @@ def test_in(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars))) +# We avoid testing inset with NaN in Spark < 3.2.0 since it has issue with NaN comparisons. +# See https://github.com/NVIDIA/spark-rapids/issues/9687. +test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \ + [FloatGen(no_nans=True), DoubleGen(no_nans=True)] \ + if is_before_spark_320() else eq_gens_with_decimal_gen + # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf # This is to test entries over that value. -@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687') -@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.parametrize('data_gen', test_inset_data_gen, ids=idfn) def test_in_set(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1 diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py index aa27503c8eb..50eaa7c49a9 100644 --- a/integration_tests/src/main/python/spark_session.py +++ b/integration_tests/src/main/python/spark_session.py @@ -158,6 +158,9 @@ def is_spark_330_or_later(): def is_spark_340_or_later(): return spark_version() >= "3.4.0" +def is_spark_341(): + return spark_version() == "3.4.1" + def is_spark_350_or_later(): return spark_version() >= "3.5.0" diff --git a/integration_tests/src/main/python/udf_test.py b/integration_tests/src/main/python/udf_test.py index 88281279162..9e3f5d05bcc 100644 --- a/integration_tests/src/main/python/udf_test.py +++ b/integration_tests/src/main/python/udf_test.py @@ -15,7 +15,7 @@ import pytest from conftest import is_at_least_precommit_run, is_not_utc -from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_340_or_later +from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_341 from pyspark.sql.pandas.utils import require_minimum_pyarrow_version, require_minimum_pandas_version @@ -43,12 +43,6 @@ import pyarrow from typing import Iterator, Tuple - -if is_databricks_runtime() and is_spark_340_or_later(): - # Databricks 13.3 does not use separate reader/writer threads for Python UDFs - # which can lead to hangs. Skipping these tests until the Python UDF handling is updated. 
- pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493") - arrow_udf_conf = { 'spark.sql.execution.arrow.pyspark.enabled': 'true', 'spark.rapids.sql.exec.WindowInPandasExec': 'true', @@ -182,7 +176,10 @@ def group_size_udf(to_process: pd.Series) -> int: low_upper_win = Window.partitionBy('a').orderBy('b').rowsBetween(-3, 3) -udf_windows = [no_part_win, unbounded_win, cur_follow_win, pre_cur_win, low_upper_win] +running_win_param = pytest.param(pre_cur_win, marks=pytest.mark.xfail( + condition=is_databricks_runtime() and is_spark_341(), + reason='DB13.3 wrongly uses RunningWindowFunctionExec to evaluate a PythonUDAF and it will fail even on CPU')) +udf_windows = [no_part_win, unbounded_win, cur_follow_win, running_win_param, low_upper_win] window_ids = ['No_Partition', 'Unbounded', 'Unbounded_Following', 'Unbounded_Preceding', 'Lower_Upper'] @@ -338,8 +335,8 @@ def create_df(spark, data_gen, left_length, right_length): @ignore_order @pytest.mark.parametrize('data_gen', [ShortGen(nullable=False)], ids=idfn) def test_cogroup_apply_udf(data_gen): - def asof_join(l, r): - return pd.merge_asof(l, r, on='a', by='b') + def asof_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame: + return pd.merge_ordered(left, right) def do_it(spark): left, right = create_df(spark, data_gen, 500, 500) diff --git a/jdk-profiles/pom.xml b/jdk-profiles/pom.xml index d9488a8259d..e9d913eb6c6 100644 --- a/jdk-profiles/pom.xml +++ b/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT com.nvidia rapids-4-spark-jdk-profiles_2.12 pom Shim JDK Profiles - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT jdk9plus diff --git a/jenkins/Jenkinsfile-blossom.premerge-databricks b/jenkins/Jenkinsfile-blossom.premerge-databricks index 86fff7f23be..2fd2df7a8b0 100644 --- a/jenkins/Jenkinsfile-blossom.premerge-databricks +++ b/jenkins/Jenkinsfile-blossom.premerge-databricks @@ -88,7 +88,7 @@ pipeline { // 'name' and 'value' only supprt literal string in the declarative Jenkins // Refer to Jenkins issue https://issues.jenkins.io/browse/JENKINS-62127 name 'DB_RUNTIME' - values '10.4', '11.3', '12.2' + values '10.4', '11.3', '12.2', '13.3' } } stages { diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh index a68b272257b..8a0b25a0c95 100755 --- a/jenkins/databricks/build.sh +++ b/jenkins/databricks/build.sh @@ -144,12 +144,6 @@ if [[ "$WITH_BLOOP" == "1" ]]; then MVN_OPT="ch.epfl.scala:bloop-maven-plugin:bloopInstall $MVN_OPT" fi -# Disabling build for 341db until 24.02 -if [[ "$BUILDVER" == "341db" ]]; then - echo "Databricks 341 is not supported as of release 23.12\n" - exit 1 -fi - # Build the RAPIDS plugin by running package command for databricks $MVN_CMD -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests $MVN_OPT diff --git a/jenkins/databricks/create.py b/jenkins/databricks/create.py index b0305f92112..289a114d230 100644 --- a/jenkins/databricks/create.py +++ b/jenkins/databricks/create.py @@ -27,7 +27,7 @@ def main(): workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com' token = '' sshkey = '' - cluster_name = 'CI-GPU-databricks-23.12.0-SNAPSHOT' + cluster_name = 'CI-GPU-databricks-24.02.0-SNAPSHOT' idletime = 240 runtime = '7.0.x-gpu-ml-scala2.12' num_workers = 1 diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index 4af278e3a97..a0c7de590d7 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ b/jenkins/databricks/init_cudf_udf.sh @@ -20,7 +20,7 @@ set -ex 
-CUDF_VER=${CUDF_VER:-23.12} +CUDF_VER=${CUDF_VER:-24.02} CUDA_VER=${CUDA_VER:-11.8} # Need to explicitly add conda into PATH environment, to activate conda environment. diff --git a/jenkins/deploy.sh b/jenkins/deploy.sh index a85c8618a97..16428e121dc 100755 --- a/jenkins/deploy.sh +++ b/jenkins/deploy.sh @@ -30,6 +30,7 @@ # POM_FILE: Project pom file to be deployed # OUT_PATH: The path where jar files are # CUDA_CLASSIFIERS: Comma separated classifiers, e.g., "cuda11,cuda12" +# CLASSIFIERS: Comma separated classifiers, e.g., "cuda11,cuda12,cuda11-arm64,cuda12-arm64" ### set -ex @@ -48,6 +49,7 @@ ART_GROUP_ID=$(mvnEval $DIST_PL project.groupId) ART_VER=$(mvnEval $DIST_PL project.version) DEFAULT_CUDA_CLASSIFIER=$(mvnEval $DIST_PL cuda.version) CUDA_CLASSIFIERS=${CUDA_CLASSIFIERS:-"$DEFAULT_CUDA_CLASSIFIER"} +CLASSIFIERS=${CLASSIFIERS:-"$CUDA_CLASSIFIERS"} # default as CUDA_CLASSIFIERS for compatibility SQL_PL=${SQL_PL:-"sql-plugin"} POM_FILE=${POM_FILE:-"$DIST_PL/target/parallel-world/META-INF/maven/${ART_GROUP_ID}/${ART_ID}/pom.xml"} @@ -57,9 +59,8 @@ SIGN_TOOL=${SIGN_TOOL:-"gpg"} FPATH="$OUT_PATH/$ART_ID-$ART_VER" DEPLOY_TYPES='' DEPLOY_FILES='' -IFS=',' read -a CUDA_CLASSIFIERS_ARR <<< "$CUDA_CLASSIFIERS" -DEPLOY_TYPES=$(echo $CUDA_CLASSIFIERS | sed -e 's;[^,]*;jar;g') -DEPLOY_FILES=$(echo $CUDA_CLASSIFIERS | sed -e "s;\([^,]*\);${FPATH}-\1.jar;g") +DEPLOY_TYPES=$(echo $CLASSIFIERS | sed -e 's;[^,]*;jar;g') +DEPLOY_FILES=$(echo $CLASSIFIERS | sed -e "s;\([^,]*\);${FPATH}-\1.jar;g") # dist does not have javadoc and sources jars, use 'sql-plugin' instead source jenkins/version-def.sh >/dev/null 2&>1 @@ -103,4 +104,4 @@ $DEPLOY_CMD -DpomFile=$POM_FILE \ -Djavadoc=$FPATH-javadoc.jar \ -Dfiles=$DEPLOY_FILES \ -Dtypes=$DEPLOY_TYPES \ - -Dclassifiers=$CUDA_CLASSIFIERS + -Dclassifiers=$CLASSIFIERS diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh index 969837ee397..5d331686659 100755 --- a/jenkins/spark-nightly-build.sh +++ b/jenkins/spark-nightly-build.sh @@ -42,7 +42,8 @@ ART_GROUP_ID=$(mvnEval project.groupId) ART_VER=$(mvnEval project.version) DEFAULT_CUDA_CLASSIFIER=${DEFAULT_CUDA_CLASSIFIER:-$(mvnEval cuda.version)} # default cuda version CUDA_CLASSIFIERS=${CUDA_CLASSIFIERS:-"$DEFAULT_CUDA_CLASSIFIER"} # e.g. cuda11,cuda12 -IFS=',' read -a CUDA_CLASSIFIERS_ARR <<< "$CUDA_CLASSIFIERS" +CLASSIFIERS=${CLASSIFIERS:-"$CUDA_CLASSIFIERS"} # default as CUDA_CLASSIFIERS for compatibility +IFS=',' read -a CLASSIFIERS_ARR <<< "$CLASSIFIERS" TMP_PATH="/tmp/$(date '+%Y-%m-%d')-$$" DIST_FPATH="$DIST_PL/target/$ART_ID-$ART_VER-$DEFAULT_CUDA_CLASSIFIER" @@ -72,7 +73,7 @@ function distWithReducedPom { deploy) mvnCmd="deploy:deploy-file" - if (( ${#CUDA_CLASSIFIERS_ARR[@]} > 1 )); then + if (( ${#CLASSIFIERS_ARR[@]} > 1 )); then # try move tmp artifacts back to target folder for simplifying separate release process mv ${TMP_PATH}/${ART_ID}-${ART_VER}-*.jar ${DIST_PL}/target/ fi @@ -102,6 +103,13 @@ function distWithReducedPom { # option to skip unit tests. 
Used in our CI to separate test runs in parallel stages SKIP_TESTS=${SKIP_TESTS:-"false"} +if [[ "${SKIP_TESTS}" == "true" ]]; then + # if skip test, we could try speed up build with multiple-threads + MVN="${MVN} -T1C" +fi + +set +H # turn off history expansion +DEPLOY_SUBMODULES=${DEPLOY_SUBMODULES:-"!${DIST_PL}"} # TODO: deploy only required submodules to save time for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do $MVN -U -B clean install $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$DEFAULT_CUDA_CLASSIFIER \ @@ -117,33 +125,44 @@ for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do fi distWithReducedPom "install" [[ $SKIP_DEPLOY != 'true' ]] && \ - $MVN -B deploy -pl '!dist' $MVN_URM_MIRROR \ + # this deploys selected submodules + $MVN -B deploy -pl $DEPLOY_SUBMODULES $MVN_URM_MIRROR \ -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$DEFAULT_CUDA_CLASSIFIER \ -DskipTests \ + -Dmaven.scaladoc.skip -Dmaven.scalastyle.skip=true \ -Dbuildver="${buildver}" done installDistArtifact() { - local cuda_classifier="$1" + local cuda_version="$1" + local opt="$2" $MVN -B clean install \ + $opt \ $DIST_PROFILE_OPT \ -Dbuildver=$SPARK_BASE_SHIM_VERSION \ $MVN_URM_MIRROR \ -Dmaven.repo.local=$M2DIR \ - -Dcuda.version=$cuda_classifier \ + -Dcuda.version=$cuda_version \ -DskipTests=$SKIP_TESTS } # build extra cuda classifiers -if (( ${#CUDA_CLASSIFIERS_ARR[@]} > 1 )); then +if (( ${#CLASSIFIERS_ARR[@]} > 1 )); then mkdir -p ${TMP_PATH} - for classifier in "${CUDA_CLASSIFIERS_ARR[@]}"; do + for classifier in "${CLASSIFIERS_ARR[@]}"; do if [ "${classifier}" == "${DEFAULT_CUDA_CLASSIFIER}" ]; then echo "skip default: ${DEFAULT_CUDA_CLASSIFIER} in build extra cuda classifiers step..." continue fi - installDistArtifact ${classifier} + + opt="" + if [[ "${classifier}" == *"-arm64" ]]; then + opt="-Parm64" + fi + # pass cuda version and extra opt + installDistArtifact ${classifier%%-*} ${opt} + # move artifacts to temp for deployment later artifactFile="${ART_ID}-${ART_VER}-${classifier}.jar" mv ${DIST_PL}/target/${artifactFile} ${TMP_PATH}/ @@ -161,10 +180,11 @@ distWithReducedPom "install" if [[ $SKIP_DEPLOY != 'true' ]]; then distWithReducedPom "deploy" - # this deploys submodules except dist that is unconditionally built with Spark 3.1.1 - $MVN -B deploy -pl '!dist' \ + # this deploys selected submodules that is unconditionally built with Spark 3.1.1 + $MVN -B deploy -pl $DEPLOY_SUBMODULES \ -Dbuildver=$SPARK_BASE_SHIM_VERSION \ - -DskipTests=$SKIP_TESTS \ + -DskipTests \ + -Dmaven.scaladoc.skip -Dmaven.scalastyle.skip=true \ $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$DEFAULT_CUDA_CLASSIFIER fi diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh index ebf75617d99..98894c0d548 100755 --- a/jenkins/version-def.sh +++ b/jenkins/version-def.sh @@ -26,10 +26,10 @@ for VAR in $OVERWRITE_PARAMS; do done IFS=$PRE_IFS -CUDF_VER=${CUDF_VER:-"23.12.0-SNAPSHOT"} +CUDF_VER=${CUDF_VER:-"24.02.0-SNAPSHOT"} CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"} -PROJECT_VER=${PROJECT_VER:-"23.12.0-SNAPSHOT"} -PROJECT_TEST_VER=${PROJECT_TEST_VER:-"23.12.0-SNAPSHOT"} +PROJECT_VER=${PROJECT_VER:-"24.02.0-SNAPSHOT"} +PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.02.0-SNAPSHOT"} SPARK_VER=${SPARK_VER:-"3.1.1"} SPARK_VER_213=${SPARK_VER_213:-"3.3.0"} # Make a best attempt to set the default value for the shuffle shim. 
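The nightly build and deploy scripts above now accept a `CLASSIFIERS` list that can mix plain CUDA classifiers with `-arm64` variants; for each extra classifier the script derives the CUDA version with `${classifier%%-*}` and adds the `-Parm64` Maven profile when the classifier ends in `-arm64`. The snippet below is a small Python illustration of that mapping, written only to make the shell parameter expansion explicit; it is not part of the CI scripts.

```python
# Illustrates how jenkins/spark-nightly-build.sh maps a classifier such as
# "cuda12-arm64" onto the options it builds with: the CUDA version comes from
# ${classifier%%-*} and -Parm64 is added for arm64 variants. Not used by CI.
def build_options(classifier: str):
    cuda_version = classifier.split("-", 1)[0]   # "cuda12-arm64" -> "cuda12"
    profile = "-Parm64" if classifier.endswith("-arm64") else ""
    return cuda_version, profile

for c in ["cuda11", "cuda12", "cuda11-arm64", "cuda12-arm64"]:
    print(c, "->", build_options(c))
```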
diff --git a/pom.xml b/pom.xml index e8086a35d06..744e9dd2985 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.12 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT pom https://nvidia.github.io/spark-rapids/ @@ -660,8 +660,8 @@ spark${buildver} cuda11 ${cuda.version} - 23.12.0-SNAPSHOT - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT + 24.02.0-SNAPSHOT 2.12 2.8.0 incremental @@ -771,7 +771,8 @@ 321db, 330db, - 332db + 332db, + 341db
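With `341db` added to the Databricks shim list above and `13.3` added to the premerge `DB_RUNTIME` matrix earlier in this diff, tests can gate Databricks 13.3-specific behavior by combining the new `is_spark_341()` helper with `is_databricks_runtime()`, as the `udf_test.py` change does for the running-window case. Below is a minimal, hypothetical sketch of such a gate; the test name and body are placeholders, and only the two helper functions come from this change.

```python
# Hypothetical example of gating a test on Databricks 13.3 (the 341db shim) using
# the helpers touched in this change; the test body is a placeholder.
import pytest
from spark_session import is_databricks_runtime, is_spark_341

on_databricks_13_3 = is_databricks_runtime() and is_spark_341()

@pytest.mark.xfail(condition=on_databricks_13_3,
                   reason="placeholder: behavior known to differ on the 341db shim")
def test_placeholder_databricks_13_3_gate():
    assert 1 + 1 == 2  # stand-in body; a real test would exercise plugin behavior
```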