diff --git a/.github/workflows/license-header-check.yml b/.github/workflows/license-header-check.yml
new file mode 100644
index 00000000000..e7f62399436
--- /dev/null
+++ b/.github/workflows/license-header-check.yml
@@ -0,0 +1,58 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A workflow to check copyright/license header
+name: license header check
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+jobs:
+  license-header-check:
+    runs-on: ubuntu-latest
+    if: "!contains(github.event.pull_request.title, '[bot]')"
+    steps:
+      - name: Get checkout depth
+        run: |
+          echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: ${{ env.PR_FETCH_DEPTH }}
+
+      - name: license-header-check
+        uses: NVIDIA/spark-rapids-common/license-header-check@main
+        with:
+          included_file_patterns: |
+            *.yml,
+            *.yaml,
+            *.sh,
+            *.xml,
+            *.properties,
+            *.scala,
+            *.py,
+            build/*,
+            *.cpp,
+            *Dockerfile*,
+            *Jenkinsfile*,
+            *.ini,
+            *.java,
+            *.fbs
+          excluded_file_patterns: |
+            *target/*,
+            thirdparty/*,
+            sql-plugin/src/main/java/com/nvidia/spark/rapids/format/*
+
\ No newline at end of file
diff --git a/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala
index 2884968660d..3480718dbc7 100644
--- a/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala
+++ b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala
@@ -24,6 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.internal.ExpressionUtils.{column, expression}
 
 object DataGenExprShims {
-  def columnToExpr(c: Column): Expression = c
-  def exprToColumn(e: Expression): Column = e
+  def columnToExpr(c: Column): Expression = expression(c)
+  def exprToColumn(e: Expression): Column = column(e)
 }
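The DataGenExprShims change above is needed because Spark 4.0.0 no longer lets a `Column` stand in directly for a Catalyst `Expression`, so the spark400 shim now converts explicitly through `ExpressionUtils.expression` and `ExpressionUtils.column`. A minimal sketch of how calling code can stay shim-agnostic by always round-tripping through the shim; the `ShimRoundTrip` object and the literal value are illustrative and not part of this PR:

```scala
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}
import org.apache.spark.sql.tests.datagen.DataGenExprShims

object ShimRoundTrip {
  // Wrap a Catalyst expression into a Column and unwrap it again, always going
  // through the shim so the same calling code compiles no matter which Spark
  // version's shim provides the conversion.
  def roundTrip(): Expression = {
    val lit: Expression = Literal(42)
    val asColumn: Column = DataGenExprShims.exprToColumn(lit)
    DataGenExprShims.columnToExpr(asColumn)
  }
}
```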
diff --git a/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala b/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala
index 0c212d6842a..5b26943a541 100644
--- a/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala
+++ b/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala
@@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionS
 import org.apache.spark.sql.execution.exchange.Exchange
 import org.apache.spark.sql.execution.metric.{SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter}
 import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD}
+import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.ThreadUtils
 
@@ -71,22 +72,11 @@ case class GpuOptimizeWriteExchangeExec(
   private[sql] lazy val readMetrics =
     SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext)
 
-  override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
-    "dataSize" -> createSizeMetric(ESSENTIAL_LEVEL, "data size"),
-    "dataReadSize" -> createSizeMetric(MODERATE_LEVEL, "data read size"),
-    "rapidsShuffleSerializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. serialization time"),
-    "rapidsShuffleDeserializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. deserialization time"),
-    "rapidsShuffleWriteTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle write time"),
-    "rapidsShuffleCombineTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle combine time"),
-    "rapidsShuffleWriteIoTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle write io time"),
-    "rapidsShuffleReadTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle read time")
-  ) ++ GpuMetric.wrap(readMetrics) ++ GpuMetric.wrap(writeMetrics)
+  override lazy val additionalMetrics : Map[String, GpuMetric] = {
+    createAdditionalExchangeMetrics(this) ++
+      GpuMetric.wrap(readMetrics) ++
+      GpuMetric.wrap(writeMetrics)
+  }
 
   override lazy val allMetrics: Map[String, GpuMetric] = {
     Map(
@@ -98,7 +88,7 @@ case class GpuOptimizeWriteExchangeExec(
   }
 
   private lazy val serializer: Serializer =
-    new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"),
+    new GpuColumnarBatchSerializer(allMetrics,
       child.output.map(_.dataType).toArray,
       RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf))
 
diff --git a/docs/dev/idea-code-style-settings.xml b/docs/dev/idea-code-style-settings.xml
index 165d30dde06..9f5c3c100dc 100644
--- a/docs/dev/idea-code-style-settings.xml
+++ b/docs/dev/idea-code-style-settings.xml
@@ -1,3 +1,19 @@
+
+
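The GpuOptimizeWriteExchangeExec change above swaps the hand-written metric map for the shared `GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics` helper and hands the whole metric map to `GpuColumnarBatchSerializer` instead of only the `dataSize` metric. A simplified, self-contained sketch of that pattern follows; `Metric`, `ExchangeMetricsSketch`, and `BatchSerializerSketch` are illustrative stand-ins, not the plugin's real classes.

```scala
// Illustrative stand-ins only: this mirrors the shape of the refactor, not the
// plugin's actual GpuMetric or GpuColumnarBatchSerializer APIs.
final class Metric(val description: String) {
  private var value = 0L
  def add(v: Long): Unit = value += v
  def get: Long = value
}

object ExchangeMetricsSketch {
  // One shared factory defines the shuffle metrics every exchange-like node
  // needs, instead of each node spelling out the same map inline.
  def createAdditionalMetrics(nodeName: String): Map[String, Metric] = Map(
    "dataSize" -> new Metric(s"$nodeName: data size"),
    "rapidsShuffleWriteTime" -> new Metric(s"$nodeName: shuffle write time"),
    "rapidsShuffleReadTime" -> new Metric(s"$nodeName: shuffle read time")
  )
}

// The serializer receives the full map rather than one pre-selected metric, so
// it can update several counters without each being passed in separately.
class BatchSerializerSketch(metrics: Map[String, Metric]) {
  def writeBatch(serialized: Array[Byte]): Unit =
    metrics("dataSize").add(serialized.length.toLong)
}
```

Passing the whole map keeps the serializer's constructor stable as more shuffle metrics are added, at the cost of the serializer looking up the keys it cares about by name.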