diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index ecf2f610697..de53e7586cd 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -18,11 +18,11 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/device_atomics.cuh>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/table/table_device_view.cuh>
+#include <cudf/table/table_view.hpp>
 #include <cudf/utilities/traits.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -30,8 +30,17 @@
 
 #include <thrust/fill.h>
 
+#include <type_traits>
+#include <vector>
+
 namespace cudf {
 namespace detail {
+template <typename T>
+constexpr bool is_product_supported()
+{
+  return is_numeric<T>();  // true exactly when `T` satisfies the `is_numeric` trait
+}
+
 /**
  * @brief Maps an `aggregation::Kind` value to it's corresponding binary
  * operator.
@@ -113,465 +122,6 @@ constexpr bool has_corresponding_operator()
   return !std::is_same_v<typename corresponding_operator<k>::type, void>;
 }
 
-template <typename Source,
-          aggregation::Kind k,
-          bool target_has_nulls,
-          bool source_has_nulls,
-          typename Enable = void>
-struct update_target_element {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MIN,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
-                   !is_fixed_point<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::MIN>;
-    cudf::detail::atomic_min(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MIN,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_point<Source>() &&
-                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target       = target_type_t<Source, aggregation::MIN>;
-    using DeviceTarget = device_storage_type_t<Target>;
-    using DeviceSource = device_storage_type_t<Source>;
-
-    cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
-                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MAX,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
-                   !is_fixed_point<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::MAX>;
-    cudf::detail::atomic_max(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::MAX,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_point<Source>() &&
-                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target       = target_type_t<Source, aggregation::MAX>;
-    using DeviceTarget = device_storage_type_t<Target>;
-    using DeviceSource = device_storage_type_t<Source>;
-
-    cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
-                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::SUM,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
-                   !cudf::is_fixed_point<Source>() && !cudf::is_timestamp<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::SUM>;
-    cudf::detail::atomic_add(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::SUM,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_fixed_point<Source>() &&
-                   cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target       = target_type_t<Source, aggregation::SUM>;
-    using DeviceTarget = device_storage_type_t<Target>;
-    using DeviceSource = device_storage_type_t<Source>;
-
-    cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
-                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-/**
- * @brief Function object to update a single element in a target column using
- * the dictionary key addressed by the specific index.
- *
- * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a
- * dictionary.
- *
- */
-template <bool target_has_nulls = true>
-struct update_target_from_dictionary {
-  template <typename Source,
-            aggregation::Kind k,
-            std::enable_if_t<!is_dictionary<Source>()>* = nullptr>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    update_target_element<Source, k, target_has_nulls, false>{}(
-      target, target_index, source, source_index);
-  }
-  template <typename Source,
-            aggregation::Kind k,
-            std::enable_if_t<is_dictionary<Source>()>* = nullptr>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-  }
-};
-
-/**
- * @brief Specialization function for dictionary type and aggregations.
- *
- * The `source` column is a dictionary type. This functor de-references the
- * dictionary's keys child column and maps the input source index through
- * the dictionary's indices child column to pass to the `update_target_element`
- * in the above `update_target_from_dictionary` using the type-dispatcher to
- * resolve the keys column type.
- *
- * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )`
- *
- * @tparam target_has_nulls Indicates presence of null elements in `target`
- * @tparam source_has_nulls Indicates presence of null elements in `source`.
- */
-template <aggregation::Kind k, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  dictionary32,
-  k,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<not(k == aggregation::ARGMIN or k == aggregation::ARGMAX or
-                       k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL)>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    dispatch_type_and_aggregation(
-      source.child(cudf::dictionary_column_view::keys_column_index).type(),
-      k,
-      update_target_from_dictionary<target_has_nulls>{},
-      target,
-      target_index,
-      source.child(cudf::dictionary_column_view::keys_column_index),
-      static_cast<cudf::size_type>(source.element<dictionary32>(source_index)));
-  }
-};
-
-template <typename T>
-constexpr bool is_product_supported()
-{
-  return is_numeric<T>();
-}
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<Source,
-                             aggregation::SUM_OF_SQUARES,
-                             target_has_nulls,
-                             source_has_nulls,
-                             std::enable_if_t<is_product_supported<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
-    auto value   = static_cast<Target>(source.element<Source>(source_index));
-    cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<Source,
-                             aggregation::PRODUCT,
-                             target_has_nulls,
-                             source_has_nulls,
-                             std::enable_if_t<is_product_supported<Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::PRODUCT>;
-    cudf::detail::atomic_mul(&target.element<Target>(target_index),
-                             static_cast<Target>(source.element<Source>(source_index)));
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::COUNT_VALID,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_VALID>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::COUNT_VALID>;
-    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
-
-    // It is assumed the output for COUNT_VALID is initialized to be all valid
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::COUNT_ALL,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_ALL>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    using Target = target_type_t<Source, aggregation::COUNT_ALL>;
-    cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
-
-    // It is assumed the output for COUNT_ALL is initialized to be all valid
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::ARGMAX,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMAX>() and
-                   cudf::is_relationally_comparable<Source, Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::ARGMAX>;
-    auto old     = cudf::detail::atomic_cas(
-      &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
-    if (old != ARGMAX_SENTINEL) {
-      while (source.element<Source>(source_index) > source.element<Source>(old)) {
-        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
-      }
-    }
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-template <typename Source, bool target_has_nulls, bool source_has_nulls>
-struct update_target_element<
-  Source,
-  aggregation::ARGMIN,
-  target_has_nulls,
-  source_has_nulls,
-  std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMIN>() and
-                   cudf::is_relationally_comparable<Source, Source>()>> {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    if (source_has_nulls and source.is_null(source_index)) { return; }
-
-    using Target = target_type_t<Source, aggregation::ARGMIN>;
-    auto old     = cudf::detail::atomic_cas(
-      &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
-    if (old != ARGMIN_SENTINEL) {
-      while (source.element<Source>(source_index) < source.element<Source>(old)) {
-        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
-      }
-    }
-
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
-  }
-};
-
-/**
- * @brief Function object to update a single element in a target column by
- * performing an aggregation operation with a single element from a source
- * column.
- *
- * @tparam target_has_nulls Indicates presence of null elements in `target`
- * @tparam source_has_nulls Indicates presence of null elements in `source`.
- */
-template <bool target_has_nulls = true, bool source_has_nulls = true>
-struct elementwise_aggregator {
-  template <typename Source, aggregation::Kind k>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    update_target_element<Source, k, target_has_nulls, source_has_nulls>{}(
-      target, target_index, source, source_index);
-  }
-};
-
-/**
- * @brief Updates a row in `target` by performing elementwise aggregation
- * operations with a row in `source`.
- *
- * For the row in `target` specified by `target_index`, each element at `i` is
- * updated by:
- * ```c++
- * target_row[i] = aggs[i](target_row[i], source_row[i])
- * ```
- *
- * This function only supports aggregations that can be done in a "single pass",
- * i.e., given an initial value `R`, the aggregation `op` can be computed on a series
- * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order
- * of the values of `i`.
- *
- * The initial value and validity of `R` depends on the aggregation:
- * SUM: 0 and NULL
- * MIN: Max value of type and NULL
- * MAX: Min value of type and NULL
- * COUNT_VALID: 0 and VALID
- * COUNT_ALL:   0 and VALID
- * ARGMAX: `ARGMAX_SENTINEL` and NULL
- * ARGMIN: `ARGMIN_SENTINEL` and NULL
- *
- * It is required that the elements of `target` be initialized with the corresponding
- * initial values and validity specified above.
- *
- * Handling of null elements in both `source` and `target` depends on the aggregation:
- * SUM, MIN, MAX, ARGMIN, ARGMAX:
- *  - `source`: Skipped
- *  - `target`: Updated from null to valid upon first successful aggregation
- * COUNT_VALID, COUNT_ALL:
- *  - `source`: Skipped
- *  - `target`: Cannot be null
- *
- * @param target Table containing the row to update
- * @param target_index Index of the row to update in `target`
- * @param source Table containing the row used to update the row in `target`.
- * The invariant `source.num_columns() >= target.num_columns()` must hold.
- * @param source_index Index of the row to use in `source`
- * @param aggs Array of aggregations to perform between elements of the `target`
- * and `source` rows. Must contain at least `target.num_columns()` valid
- * `aggregation::Kind` values.
- */
-template <bool target_has_nulls = true, bool source_has_nulls = true>
-__device__ inline void aggregate_row(mutable_table_device_view target,
-                                     size_type target_index,
-                                     table_device_view source,
-                                     size_type source_index,
-                                     aggregation::Kind const* aggs)
-{
-  for (auto i = 0; i < target.num_columns(); ++i) {
-    dispatch_type_and_aggregation(source.column(i).type(),
-                                  aggs[i],
-                                  elementwise_aggregator<target_has_nulls, source_has_nulls>{},
-                                  target.column(i),
-                                  target_index,
-                                  source.column(i),
-                                  source_index);
-  }
-}
-
 /**
  * @brief Dispatched functor to initialize a column with the identity of an
  * aggregation operation.
diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
new file mode 100644
index 00000000000..10be5e1d36f
--- /dev/null
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/utilities/assert.cuh>
+#include <cudf/detail/utilities/device_atomics.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/traits.cuh>
+
+#include <cuda/std/type_traits>
+
+namespace cudf::detail {
+template <typename Source, aggregation::Kind k, typename Enable = void>
+struct update_target_element {  // primary template: used when no specialization is enabled
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::MIN,  // non-fixed_point path (fixed_point is excluded by the condition below)
+  cuda::std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !is_fixed_point<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements are skipped
+
+    using Target = target_type_t<Source, aggregation::MIN>;
+    cudf::detail::atomic_min(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+    // The target slot has been updated, so flip it from null to valid if needed.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::MIN,  // fixed_point path
+  cuda::std::enable_if_t<is_fixed_point<Source>() &&
+                         cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements are skipped
+
+    using Target       = target_type_t<Source, aggregation::MIN>;
+    using DeviceTarget = device_storage_type_t<Target>;
+    using DeviceSource = device_storage_type_t<Source>;
+    // Elements are accessed as device_storage_type_t values rather than as Source/Target.
+    cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    // The target slot has been updated, so flip it from null to valid if needed.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::MAX,  // non-fixed_point path (fixed_point is excluded by the condition below)
+  cuda::std::enable_if_t<is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !is_fixed_point<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements are skipped
+
+    using Target = target_type_t<Source, aggregation::MAX>;
+    cudf::detail::atomic_max(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+    // The target slot has been updated, so flip it from null to valid if needed.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::MAX,  // fixed_point path
+  cuda::std::enable_if_t<is_fixed_point<Source>() &&
+                         cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements are skipped
+
+    using Target       = target_type_t<Source, aggregation::MAX>;
+    using DeviceTarget = device_storage_type_t<Target>;
+    using DeviceSource = device_storage_type_t<Source>;
+    // Elements are accessed as device_storage_type_t values rather than as Source/Target.
+    cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    // The target slot has been updated, so flip it from null to valid if needed.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::SUM,  // fixed_point and timestamp types are excluded by the condition below
+  cuda::std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !cudf::is_fixed_point<Source>() && !cudf::is_timestamp<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements are skipped
+
+    using Target = target_type_t<Source, aggregation::SUM>;
+    cudf::detail::atomic_add(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+    // The target slot has been updated, so flip it from null to valid if needed.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::SUM,  // fixed_point path
+  cuda::std::enable_if_t<is_fixed_point<Source>() &&
+                         cudf::has_atomic_support<device_storage_type_t<Source>>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements are skipped
+
+    using Target       = target_type_t<Source, aggregation::SUM>;
+    using DeviceTarget = device_storage_type_t<Target>;
+    using DeviceSource = device_storage_type_t<Source>;
+    // Elements are accessed as device_storage_type_t values rather than as Source/Target.
+    cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    // The target slot has been updated, so flip it from null to valid if needed.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+/**
+ * @brief Function object that updates a single element in a target column from
+ * the dictionary key at the given index.
+ *
+ * Two overloads are selected by SFINAE on `is_dictionary<Source>()`: non-dictionary key types
+ * forward to `update_target_element`; the dictionary overload is an intentional no-op, since
+ * dictionary keys cannot themselves be a dictionary, which prevents recursive instantiation.
+ */
+struct update_target_from_dictionary {
+  template <typename Source,
+            aggregation::Kind k,
+            cuda::std::enable_if_t<!is_dictionary<Source>()>* = nullptr>
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    update_target_element<Source, k>{}(target, target_index, source, source_index);
+  }
+  template <typename Source,
+            aggregation::Kind k,
+            cuda::std::enable_if_t<is_dictionary<Source>()>* = nullptr>
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {  // intentionally empty: selected only when the dispatched key type is a dictionary
+  }
+};
+
+/**
+ * @brief Specialization for `dictionary32` sources (all kinds except ARGMIN/ARGMAX/COUNT_*).
+ *
+ * The `source` column is a dictionary type. This functor de-references the
+ * dictionary's keys child column and maps the input source index through
+ * the dictionary's indices to pass to the `update_target_element` in the above
+ * `update_target_from_dictionary`, using `dispatch_type_and_aggregation` to resolve
+ * the keys column type. Null source elements are skipped.
+ *
+ * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )`
+ */
+template <aggregation::Kind k>
+struct update_target_element<
+  dictionary32,
+  k,
+  cuda::std::enable_if_t<not(k == aggregation::ARGMIN or k == aggregation::ARGMAX or
+                             k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL)>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements are skipped
+    // Dispatch on the keys' type; the dictionary32 element read below becomes the key index.
+    dispatch_type_and_aggregation(
+      source.child(cudf::dictionary_column_view::keys_column_index).type(),
+      k,
+      update_target_from_dictionary{},
+      target,
+      target_index,
+      source.child(cudf::dictionary_column_view::keys_column_index),  // keys act as source
+      static_cast<cudf::size_type>(source.element<dictionary32>(source_index)));
+  }
+};
+
+template <typename Source>
+struct update_target_element<Source,
+                             aggregation::SUM_OF_SQUARES,
+                             cuda::std::enable_if_t<is_product_supported<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements are skipped
+    // The source value is cast to Target first, so the square is computed in Target.
+    using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
+    auto value   = static_cast<Target>(source.element<Source>(source_index));
+    cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<Source,
+                             aggregation::PRODUCT,
+                             cuda::std::enable_if_t<is_product_supported<Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements are skipped
+    // The source value is cast to Target before being handed to atomic_mul.
+    using Target = target_type_t<Source, aggregation::PRODUCT>;
+    cudf::detail::atomic_mul(&target.element<Target>(target_index),
+                             static_cast<Target>(source.element<Source>(source_index)));
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::COUNT_VALID,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_VALID>()>> {
+  // Atomically increments the target counter by one for each non-null source element.
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    using Count = target_type_t<Source, aggregation::COUNT_VALID>;
+    // The output for COUNT_VALID is assumed to be initialized as all valid.
+    if (not source.is_null(source_index)) {
+      cudf::detail::atomic_add(&target.element<Count>(target_index), Count{1});
+    }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::COUNT_ALL,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::COUNT_ALL>()>> {
+  // Atomically increments the target counter for every source row; nulls are not checked.
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    // The output for COUNT_ALL is assumed to be initialized as all valid.
+    using Count = target_type_t<Source, aggregation::COUNT_ALL>;
+    cudf::detail::atomic_add(&target.element<Count>(target_index), Count{1});
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::ARGMAX,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMAX>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements never update the target
+    // Swap in source_index if the slot holds ARGMAX_SENTINEL; `old` is presumably the prior value.
+    using Target = target_type_t<Source, aggregation::ARGMAX>;
+    auto old     = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
+    if (old != ARGMAX_SENTINEL) {  // slot held an index: retry while our value is greater
+      while (source.element<Source>(source_index) > source.element<Source>(old)) {
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
+      }
+    }
+    // Mark the target element valid if it is still null.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element<
+  Source,
+  aggregation::ARGMIN,
+  cuda::std::enable_if_t<is_valid_aggregation<Source, aggregation::ARGMIN>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    if (source.is_null(source_index)) { return; }  // null source elements never update the target
+    // Swap in source_index if the slot holds ARGMIN_SENTINEL; `old` is presumably the prior value.
+    using Target = target_type_t<Source, aggregation::ARGMIN>;
+    auto old     = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
+    if (old != ARGMIN_SENTINEL) {  // slot held an index: retry while our value is smaller
+      while (source.element<Source>(source_index) < source.element<Source>(old)) {
+        old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
+      }
+    }
+    // Mark the target element valid if it is still null.
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+/**
+ * @brief Device functor that updates one target element from one source element by
+ * forwarding to the matching `update_target_element<Source, k>` specialization.
+ */
+struct elementwise_aggregator {
+  template <typename Source, aggregation::Kind k>
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    auto const updater = update_target_element<Source, k>{};
+    updater(target, target_index, source, source_index);
+  }
+};
+
+/**
+ * @brief Updates a row in `target` by performing elementwise aggregation
+ * operations with a row in `source`.
+ *
+ * For the row in `target` specified by `target_index`, each element at `i` is
+ * updated by:
+ * ```c++
+ * target_row[i] = aggs[i](target_row[i], source_row[i])
+ * ```
+ *
+ * This function only supports aggregations that can be done in a "single pass",
+ * i.e., given an initial value `R`, the aggregation `op` can be computed on a series
+ * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order
+ * of the values of `i`.
+ *
+ * The initial value and validity of `R` depend on the aggregation:
+ * SUM: 0 and NULL
+ * MIN: Max value of type and NULL
+ * MAX: Min value of type and NULL
+ * COUNT_VALID: 0 and VALID
+ * COUNT_ALL:   0 and VALID
+ * ARGMAX: `ARGMAX_SENTINEL` and NULL
+ * ARGMIN: `ARGMIN_SENTINEL` and NULL
+ *
+ * It is required that the elements of `target` be initialized with the corresponding
+ * initial values and validity specified above.
+ *
+ * Handling of null elements in both `source` and `target` depends on the aggregation:
+ * SUM, PRODUCT, SUM_OF_SQUARES, MIN, MAX, ARGMIN, ARGMAX:
+ *  - `source`: Skipped
+ *  - `target`: Updated from null to valid upon first successful aggregation
+ * COUNT_VALID, COUNT_ALL:
+ *  - `source`: Skipped by COUNT_VALID; counted like any other element by COUNT_ALL
+ *  - `target`: Cannot be null
+ *
+ * @param target Table containing the row to update
+ * @param target_index Index of the row to update in `target`
+ * @param source Table containing the row used to update the row in `target`.
+ * The invariant `source.num_columns() >= target.num_columns()` must hold.
+ * @param source_index Index of the row to use in `source`
+ * @param aggs Array of aggregations to perform between elements of the `target`
+ * and `source` rows. Must contain at least `target.num_columns()` valid
+ * `aggregation::Kind` values.
+ */
+__device__ inline void aggregate_row(mutable_table_device_view target,
+                                     size_type target_index,
+                                     table_device_view source,
+                                     size_type source_index,
+                                     aggregation::Kind const* aggs)
+{
+  for (size_type i = 0; i < target.num_columns(); ++i) {
+    dispatch_type_and_aggregation(source.column(i).type(),
+                                  aggs[i],
+                                  elementwise_aggregator{},
+                                  target.column(i),
+                                  target_index,
+                                  source.column(i),
+                                  source_index);
+  }
+}
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index 80a4460023f..4295f5e6ddd 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -143,28 +143,30 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
                                             rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create an empty tdigest column.
+ * @brief Create a tdigest column of empty tdigests.
  *
- * An empty tdigest column contains a single row of length 0
+ * The column created contains the specified number of rows of empty tdigests.
  *
+ * @param num_rows The number of rows in the output column.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns An empty tdigest column.
+ * @returns A tdigest column containing `num_rows` empty tdigests.
  */
 CUDF_EXPORT
-std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr);
+std::unique_ptr<column> make_empty_tdigests_column(size_type num_rows,
+                                                   rmm::cuda_stream_view stream,
+                                                   rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create an empty tdigest scalar.
+ * @brief Create a scalar of an empty tdigest cluster.
  *
- * An empty tdigest scalar is a struct_scalar that contains a single row of length 0
+ * The returned scalar is a struct_scalar that contains a single row of an empty cluster.
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns An empty tdigest scalar.
+ * @returns A scalar of an empty tdigest cluster.
  */
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
index 632d5a732ec..4f0c52c5954 100644
--- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -25,33 +26,82 @@ namespace detail {
 
 enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
 
+void cuda_memcpy_async_impl(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
 /**
- * @brief Asynchronously copies data between the host and device.
+ * @brief Asynchronously copies data from host to device memory.
  *
  * Implementation may use different strategies depending on the size and type of host data.
  *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
+ * @param dst Destination device memory
+ * @param src Source host memory
  * @param stream CUDA stream used for the copy
  */
-void cuda_memcpy_async(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+template <typename T>
+void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  // Both spans must describe the same number of elements.
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
+  // Device-accessible host memory is passed on as PINNED; anything else as PAGEABLE.
+  auto const src_kind =
+    src.is_device_accessible() ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE;
+  // Hand the raw pointers and the byte count to the non-template implementation.
+  cuda_memcpy_async_impl(dst.data(), src.data(), src.size_bytes(), src_kind, stream);
+}
 
 /**
- * @brief Synchronously copies data between the host and device.
+ * @brief Asynchronously copies data from device to host memory.
  *
  * Implementation may use different strategies depending on the size and type of host data.
  *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
+ * @param dst Destination host memory
+ * @param src Source device memory
  * @param stream CUDA stream used for the copy
  */
-void cuda_memcpy(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+template <typename T>
+void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
+{
+  // Both spans must describe the same number of elements.
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
+  // Device-accessible host memory is passed on as PINNED; anything else as PAGEABLE.
+  auto const dst_kind =
+    dst.is_device_accessible() ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE;
+  // Hand the raw pointers and the byte count to the non-template implementation.
+  cuda_memcpy_async_impl(dst.data(), src.data(), src.size_bytes(), dst_kind, stream);
+}
+
+/**
+ * @brief Synchronously copies data from host to device memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination device memory
+ * @param src Source host memory
+ * @param stream CUDA stream used for the copy
+ */
+template <typename T>
+void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, stream);  // start the host-to-device copy on `stream`
+  stream.synchronize();                 // then synchronize `stream` before returning
+}
+
+/**
+ * @brief Synchronously copies data from device to host memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination host memory
+ * @param src Source device memory
+ * @param stream CUDA stream used for the copy
+ */
+template <typename T>
+void cuda_memcpy(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, stream);  // start the device-to-host copy on `stream`
+  stream.synchronize();                 // then synchronize `stream` before returning
+}
 
 }  // namespace detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 953ae5b9308..1f1e7a2db77 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -101,12 +101,7 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
                                                  rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  auto const is_pinned = source_data.is_device_accessible();
-  cuda_memcpy_async(ret.data(),
-                    source_data.data(),
-                    source_data.size() * sizeof(T),
-                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
-                    stream);
+  cuda_memcpy_async<T>(ret, source_data, stream);
   return ret;
 }
 
@@ -405,13 +400,8 @@ host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str
 template <typename T>
 host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  auto result          = make_host_vector<T>(v.size(), stream);
-  auto const is_pinned = result.get_allocator().is_device_accessible();
-  cuda_memcpy_async(result.data(),
-                    v.data(),
-                    v.size() * sizeof(T),
-                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
-                    stream);
+  auto result = make_host_vector<T>(v.size(), stream);
+  cuda_memcpy_async<T>(result, v, stream);
   return result;
 }
 
diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp
index 3ebe5cb53e9..f229facca08 100644
--- a/cpp/include/cudf/strings/char_types/char_types.hpp
+++ b/cpp/include/cudf/strings/char_types/char_types.hpp
@@ -30,7 +30,7 @@ namespace strings {
  */
 
 /**
- * @brief Returns a boolean column identifying strings entries in which all
+ * @brief Returns a boolean column identifying string entries where all
  * characters are of the type specified.
  *
  * The output row entry will be set to false if the corresponding string element
@@ -105,7 +105,8 @@ std::unique_ptr<column> all_characters_of_type(
  *        `types_to_remove` will be filtered.
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @return New column of boolean results for each string
+ * @return New strings column with the characters of specified types filtered out and replaced by
+ * the specified replacement string
  */
 std::unique_ptr<column> filter_characters_of_type(
   strings_column_view const& input,
diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh
index 1758790cd64..c259d61060b 100644
--- a/cpp/include/cudf_test/tdigest_utilities.cuh
+++ b/cpp/include/cudf_test/tdigest_utilities.cuh
@@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op)
     static_cast<column_view>(values).type(), tdigest_gen{}, op, values, delta);
 
   // NOTE: an empty tdigest column still has 1 row.
-  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
 }
@@ -562,12 +562,12 @@ template <typename MergeFunc>
 void tdigest_merge_empty(MergeFunc merge_op)
 {
   // 3 empty tdigests all in the same group
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto b = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto b = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   std::vector<column_view> cols;
   cols.push_back(*a);
   cols.push_back(*b);
@@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op)
   auto const delta = 1000;
   auto result      = merge_op(*values, delta);
 
-  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
 }
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index 02998b84ffd..d915c85bf85 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,13 @@
  */
 
 #include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <vector>
+
 namespace cudf {
 namespace detail {
 void initialize_with_identity(mutable_table_view& table,
diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh
index 9abfe22950a..188d0cff3f1 100644
--- a/cpp/src/groupby/hash/groupby_kernels.cuh
+++ b/cpp/src/groupby/hash/groupby_kernels.cuh
@@ -18,8 +18,8 @@
 
 #include "multi_pass_kernels.cuh"
 
-#include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/aggregation/device_aggregators.cuh>
 #include <cudf/groupby.hpp>
 #include <cudf/utilities/bit.hpp>
 
@@ -100,7 +100,7 @@ struct compute_single_pass_aggs_fn {
     if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) {
       auto const result = set.insert_and_find(i);
 
-      cudf::detail::aggregate_row<true, true>(output_values, *result.first, input_values, i, aggs);
+      cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);
     }
   }
 };
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 2358f47bbbb..f9adfc6060e 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -25,6 +25,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/element_argminmax.cuh>
 #include <cudf/detail/valid_if.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index 5855f1b5a5f..f7e8134b68d 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -634,11 +634,8 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
           is_mixed_type_column[this_col_id] == 1)
         column_categories[this_col_id] = NC_STR;
     }
-    cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
-                                    column_categories.data(),
-                                    column_categories.size() * sizeof(column_categories[0]),
-                                    cudf::detail::host_memory_kind::PAGEABLE,
-                                    stream);
+    cudf::detail::cuda_memcpy_async<NodeT>(
+      d_column_tree.node_categories, column_categories, stream);
   }
 
   // ignore all children of columns forced as string
@@ -653,11 +650,7 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
         forced_as_string_column[this_col_id])
       column_categories[this_col_id] = NC_STR;
   }
-  cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
-                                  column_categories.data(),
-                                  column_categories.size() * sizeof(column_categories[0]),
-                                  cudf::detail::host_memory_kind::PAGEABLE,
-                                  stream);
+  cudf::detail::cuda_memcpy_async<NodeT>(d_column_tree.node_categories, column_categories, stream);
 
   // restore unique_col_ids order
   std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index bf81162a0ac..76816071d8c 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -620,10 +620,12 @@ struct PdaSymbolToSymbolGroupId {
     // We map the delimiter character to LINE_BREAK symbol group id, and the newline character
     // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
     // escape, comma, colon or whitespace characters.
+    auto constexpr newline    = '\n';
+    auto constexpr whitespace = ' ';
     auto const symbol_position =
       symbol == delimiter
-        ? static_cast<int32_t>('\n')
-        : (symbol == '\n' ? static_cast<int32_t>(' ') : static_cast<int32_t>(symbol));
+        ? static_cast<int32_t>(newline)
+        : (symbol == newline ? static_cast<int32_t>(whitespace) : static_cast<int32_t>(symbol));
     PdaSymbolGroupIdT symbol_gid =
       tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)];
     return stack_idx * static_cast<PdaSymbolGroupIdT>(symbol_group_id::NUM_PDA_INPUT_SGS) +
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index aed745c42dd..634e6d78ebc 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -125,23 +125,17 @@ class hostdevice_vector {
 
   void host_to_device_async(rmm::cuda_stream_view stream)
   {
-    cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
+    cuda_memcpy_async<T>(d_data, h_data, stream);
   }
 
-  void host_to_device_sync(rmm::cuda_stream_view stream)
-  {
-    cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
-  }
+  void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(d_data, h_data, stream); }
 
   void device_to_host_async(rmm::cuda_stream_view stream)
   {
-    cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
+    cuda_memcpy_async<T>(h_data, d_data, stream);
   }
 
-  void device_to_host_sync(rmm::cuda_stream_view stream)
-  {
-    cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
-  }
+  void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(h_data, d_data, stream); }
 
   /**
    * @brief Converts a hostdevice_vector into a hostdevice_span.
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 0d017cf1f13..43c3b0a291b 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -292,32 +292,33 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
   return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr);
 }
 
-std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr)
+std::unique_ptr<column> make_empty_tdigests_column(size_type num_rows,
+                                                   rmm::cuda_stream_view stream,
+                                                   rmm::device_async_resource_ref mr)
 {
   auto offsets = cudf::make_fixed_width_column(
-    data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr);
+    data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                offsets->mutable_view().begin<size_type>(),
                offsets->mutable_view().end<size_type>(),
                0);
 
-  auto min_col =
-    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
+  auto min_col = cudf::make_numeric_column(
+    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                min_col->mutable_view().begin<double>(),
                min_col->mutable_view().end<double>(),
                0);
-  auto max_col =
-    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
+  auto max_col = cudf::make_numeric_column(
+    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                max_col->mutable_view().begin<double>(),
                max_col->mutable_view().end<double>(),
                0);
 
-  return make_tdigest_column(1,
-                             make_empty_column(type_id::FLOAT64),
-                             make_empty_column(type_id::FLOAT64),
+  return make_tdigest_column(num_rows,
+                             cudf::make_empty_column(type_id::FLOAT64),
+                             cudf::make_empty_column(type_id::FLOAT64),
                              std::move(offsets),
                              std::move(min_col),
                              std::move(max_col),
@@ -338,7 +339,7 @@ std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr)
 {
-  auto contents = make_empty_tdigest_column(stream, mr)->release();
+  auto contents = make_empty_tdigests_column(1, stream, mr)->release();
   return std::make_unique<struct_scalar>(
     std::move(*std::make_unique<table>(std::move(contents.children))), true, stream, mr);
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index e1c1d2e3002..b0a84a6d50c 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -169,19 +169,19 @@ struct nearest_value_scalar_weights {
  */
 template <typename GroupOffsetsIter>
 struct nearest_value_centroid_weights {
-  double const* cumulative_weights;
-  GroupOffsetsIter outer_offsets;  // groups
-  size_type const* inner_offsets;  // tdigests within a group
+  double const* cumulative_weights;  // cumulative weights of non-empty clusters
+  GroupOffsetsIter group_offsets;    // groups
+  size_type const* tdigest_offsets;  // tdigests within a group
 
   thrust::pair<double, int> operator() __device__(double next_limit, size_type group_index) const
   {
-    auto const tdigest_begin = outer_offsets[group_index];
-    auto const tdigest_end   = outer_offsets[group_index + 1];
-    auto const num_weights   = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin];
+    auto const tdigest_begin = group_offsets[group_index];
+    auto const tdigest_end   = group_offsets[group_index + 1];
+    auto const num_weights   = tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin];
     // NOTE: as it is today, this functor will never be called for any digests that are empty, but
     // I'll leave this check here for safety.
     if (num_weights == 0) { return thrust::pair<double, int>{0, 0}; }
-    double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin];
+    double const* group_cumulative_weights = cumulative_weights + tdigest_offsets[tdigest_begin];
 
     auto const index = ((thrust::lower_bound(thrust::seq,
                                              group_cumulative_weights,
@@ -235,21 +235,26 @@ struct cumulative_scalar_weight {
  */
 template <typename GroupLabelsIter, typename GroupOffsetsIter>
 struct cumulative_centroid_weight {
-  double const* cumulative_weights;
-  GroupLabelsIter group_labels;
-  GroupOffsetsIter outer_offsets;                    // groups
-  cudf::device_span<size_type const> inner_offsets;  // tdigests with a group
-
+  double const* cumulative_weights;  // cumulative weights of non-empty clusters
+  GroupLabelsIter group_labels;      // group labels for each tdigest including empty ones
+  GroupOffsetsIter group_offsets;    // groups
+  cudf::device_span<size_type const> tdigest_offsets;  // tdigests within a group
+
+  /**
+   * @brief Returns the cumulative weight for a given value index, where value index `n` refers
+   * to the `n`-th non-empty cluster.
+   */
   std::tuple<size_type, size_type, double> operator() __device__(size_type value_index) const
   {
     auto const tdigest_index =
       static_cast<size_type>(
-        thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) -
-        inner_offsets.begin()) -
+        thrust::upper_bound(
+          thrust::seq, tdigest_offsets.begin(), tdigest_offsets.end(), value_index) -
+        tdigest_offsets.begin()) -
       1;
     auto const group_index                 = group_labels[tdigest_index];
-    auto const first_tdigest_index         = outer_offsets[group_index];
-    auto const first_weight_index          = inner_offsets[first_tdigest_index];
+    auto const first_tdigest_index         = group_offsets[group_index];
+    auto const first_weight_index          = tdigest_offsets[first_tdigest_index];
     auto const relative_value_index        = value_index - first_weight_index;
     double const* group_cumulative_weights = cumulative_weights + first_weight_index;
 
@@ -284,15 +289,15 @@ struct scalar_group_info {
 // retrieve group info of centroid inputs by group index
 template <typename GroupOffsetsIter>
 struct centroid_group_info {
-  double const* cumulative_weights;
-  GroupOffsetsIter outer_offsets;
-  size_type const* inner_offsets;
+  double const* cumulative_weights;  // cumulative weights of non-empty clusters
+  GroupOffsetsIter group_offsets;
+  size_type const* tdigest_offsets;
 
   __device__ thrust::tuple<double, size_type, size_type> operator()(size_type group_index) const
   {
     // if there's no weights in this group of digests at all, return 0.
-    auto const group_start       = inner_offsets[outer_offsets[group_index]];
-    auto const group_end         = inner_offsets[outer_offsets[group_index + 1]];
+    auto const group_start       = tdigest_offsets[group_offsets[group_index]];
+    auto const group_end         = tdigest_offsets[group_offsets[group_index + 1]];
     auto const num_weights       = group_end - group_start;
     auto const last_weight_index = group_end - 1;
     return num_weights == 0
@@ -367,7 +372,6 @@ std::unique_ptr<scalar> to_tdigest_scalar(std::unique_ptr<column>&& tdigest,
  * @param group_num_clusters  Output.  The number of output clusters for each input group.
  * @param group_cluster_offsets  Offsets per-group to the start of it's clusters
  * @param has_nulls Whether or not the input contains nulls
- *
  */
 
 template <typename GroupInfo, typename NearestWeightFunc, typename CumulativeWeight>
@@ -661,6 +665,10 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                                     mr);
 }
 
+/**
+ * @brief A functor which returns the cluster index within a group that the value at
+ * the given value index falls into.
+ */
 template <typename CumulativeWeight>
 struct compute_tdigests_keys_fn {
   int const delta;
@@ -706,8 +714,8 @@ struct compute_tdigests_keys_fn {
  * boundaries.
  *
  * @param delta              tdigest compression level
- * @param values_begin       Beginning of the range of input values.
- * @param values_end         End of the range of input values.
+ * @param centroids_begin    Beginning of the range of centroids.
+ * @param centroids_end      End of the range of centroids.
  * @param cumulative_weight  Functor which returns cumulative weight and group information for
  * an absolute input value index.
  * @param min_col            Column containing the minimum value per group.
@@ -750,7 +758,9 @@ std::unique_ptr<column> compute_tdigests(int delta,
   //   double       // max
   // }
   //
-  if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
+  if (total_clusters == 0) {
+    return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr);
+  }
 
   // each input group represents an individual tdigest.  within each tdigest, we want the keys
   // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall
@@ -983,38 +993,54 @@ struct typed_reduce_tdigest {
   }
 };
 
-// utility for merge_tdigests.
+/**
+ * @brief Functor to compute the number of clusters in each group.
+ *
+ * Used in `merge_tdigests`.
+ */
 template <typename GroupOffsetsIter>
-struct group_num_weights_func {
-  GroupOffsetsIter outer_offsets;
-  size_type const* inner_offsets;
+struct group_num_clusters_func {
+  GroupOffsetsIter group_offsets;
+  size_type const* tdigest_offsets;
 
   __device__ size_type operator()(size_type group_index)
   {
-    auto const tdigest_begin = outer_offsets[group_index];
-    auto const tdigest_end   = outer_offsets[group_index + 1];
-    return inner_offsets[tdigest_end] - inner_offsets[tdigest_begin];
+    auto const tdigest_begin = group_offsets[group_index];
+    auto const tdigest_end   = group_offsets[group_index + 1];
+    return tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin];
   }
 };
 
-// utility for merge_tdigests.
+/**
+ * @brief Functor to determine if a group is empty, given the size of the group.
+ *
+ * Used in `merge_tdigests`.
+ */
 struct group_is_empty {
   __device__ bool operator()(size_type group_size) { return group_size == 0; }
 };
 
-// utility for merge_tdigests.
+/**
+ * @brief Functor that returns the grouping key for each tdigest cluster.
+ *
+ * Used in `merge_tdigests`.
+ */
 template <typename GroupLabelsIter>
 struct group_key_func {
   GroupLabelsIter group_labels;
-  size_type const* inner_offsets;
-  size_type num_inner_offsets;
+  size_type const* tdigest_offsets;
+  size_type num_tdigest_offsets;
 
+  /**
+   * @brief Returns the group index for an absolute cluster index. The index `n` is the index of the
+   * `n`-th non-empty cluster.
+   */
   __device__ size_type operator()(size_type index)
   {
     // what -original- tdigest index this absolute index corresponds to
-    auto const iter = thrust::prev(
-      thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index));
-    auto const tdigest_index = thrust::distance(inner_offsets, iter);
+    auto const iter          = thrust::prev(thrust::upper_bound(
+      thrust::seq, tdigest_offsets, tdigest_offsets + num_tdigest_offsets, index));
+    auto const tdigest_index = thrust::distance(tdigest_offsets, iter);
 
     // what group index the original tdigest belongs to
     return group_labels[tdigest_index];
@@ -1040,8 +1066,8 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
 
   // each group represents a collection of tdigest columns. each row is 1 tdigest.
   // within each group, we want to sort all the centroids within all the tdigests
-  // in that group, using the means as the key. the "outer offsets" represent the indices of the
-  // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest.
+  // in that group, using the means as the key. the "group offsets" represent the indices of the
+  // tdigests, and the "tdigest offsets" represent the list of centroids for a particular tdigest.
   //
   //  rows
   //  ----        centroid 0 ---------
@@ -1054,12 +1080,12 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
   //  tdigest 3   centroid 7
   //              centroid 8
   //  ----        centroid 9 --------
-  auto inner_offsets    = tdv.centroids().offsets();
+  auto tdigest_offsets  = tdv.centroids().offsets();
   auto centroid_offsets = cudf::detail::make_counting_transform_iterator(
     0,
     cuda::proclaim_return_type<size_type>(
-      [group_offsets, inner_offsets = tdv.centroids().offsets().begin<size_type>()] __device__(
-        size_type i) { return inner_offsets[group_offsets[i]]; }));
+      [group_offsets, tdigest_offsets = tdv.centroids().offsets().begin<size_type>()] __device__(
+        size_type i) { return tdigest_offsets[group_offsets[i]]; }));
 
   // perform the sort using the means as the key
   size_t temp_size;
@@ -1091,9 +1117,34 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
   return {std::move(output_means), std::move(output_weights)};
 }
 
+/**
+ * @brief Perform a merge aggregation of tdigests. This function usually takes as its input the
+ * outputs of multiple `typed_group_tdigest` calls, and merges them.
+ *
+ * A tdigest can be empty in the input, which means that there was no valid input data to generate
+ * it. These empty tdigests will have no centroids (means or weights) and will have a `min` and
+ * `max` of 0.
+ *
+ * @param tdv input tdigests. The tdigests within this column are grouped by key.
+ * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is
+ * counted as one even when the cluster is empty in it. The offsets should have the same values as
+ * the ones in `group_offsets`.
+ * @param group_offsets a device iterator of the offsets to the start of each group. A group is
+ * counted as one even when the cluster is empty in it. The offsets should have the same values as
+ * the ones in `h_group_offsets`.
+ * @param group_labels a device iterator of the group label for each tdigest cluster including
+ * empty clusters.
+ * @param num_group_labels the number of unique group labels.
+ * @param num_groups the number of groups.
+ * @param max_centroids the maximum number of centroids (clusters) in the output (merged) tdigest.
+ * @param stream CUDA stream
+ * @param mr device memory resource
+ *
+ * @return A column containing the merged tdigests.
+ */
 template <typename HGroupOffsetIter, typename GroupOffsetIter, typename GroupLabelIter>
 std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
-                                       HGroupOffsetIter h_outer_offsets,
+                                       HGroupOffsetIter h_group_offsets,
                                        GroupOffsetIter group_offsets,
                                        GroupLabelIter group_labels,
                                        size_t num_group_labels,
@@ -1133,22 +1184,24 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
                         thrust::equal_to{},  // key equality check
                         thrust::maximum{});
 
+  auto tdigest_offsets = tdv.centroids().offsets();
+
   // for any empty groups, set the min and max to be 0. not technically necessary but it makes
   // testing simpler.
-  auto group_num_weights = cudf::detail::make_counting_transform_iterator(
+  auto group_num_clusters = cudf::detail::make_counting_transform_iterator(
     0,
-    group_num_weights_func<decltype(group_offsets)>{group_offsets,
-                                                    tdv.centroids().offsets().begin<size_type>()});
+    group_num_clusters_func<decltype(group_offsets)>{group_offsets,
+                                                     tdigest_offsets.begin<size_type>()});
   thrust::replace_if(rmm::exec_policy(stream),
                      merged_min_col->mutable_view().begin<double>(),
                      merged_min_col->mutable_view().end<double>(),
-                     group_num_weights,
+                     group_num_clusters,
                      group_is_empty{},
                      0);
   thrust::replace_if(rmm::exec_policy(stream),
                      merged_max_col->mutable_view().begin<double>(),
                      merged_max_col->mutable_view().end<double>(),
-                     group_num_weights,
+                     group_num_clusters,
                      group_is_empty{},
                      0);
 
@@ -1166,14 +1219,13 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 
   // generate group keys for all centroids in the entire column
   rmm::device_uvector<size_type> group_keys(num_centroids, stream, temp_mr);
-  auto iter          = thrust::make_counting_iterator(0);
-  auto inner_offsets = tdv.centroids().offsets();
+  auto iter = thrust::make_counting_iterator(0);
   thrust::transform(rmm::exec_policy(stream),
                     iter,
                     iter + num_centroids,
                     group_keys.begin(),
                     group_key_func<decltype(group_labels)>{
-                      group_labels, inner_offsets.begin<size_type>(), inner_offsets.size()});
+                      group_labels, tdigest_offsets.begin<size_type>(), tdigest_offsets.size()});
   thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
                                 group_keys.begin(),
                                 group_keys.begin() + num_centroids,
@@ -1182,20 +1234,24 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 
   auto const delta = max_centroids;
 
+  // TDigest merge takes the output of typed_group_tdigest as its input, which must not have
+  // any nulls.
+  auto const has_nulls = false;
+
   // generate cluster info
   auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info(
     delta,
     num_groups,
     nearest_value_centroid_weights<decltype(group_offsets)>{
-      cumulative_weights.begin(), group_offsets, inner_offsets.begin<size_type>()},
+      cumulative_weights.begin(), group_offsets, tdigest_offsets.begin<size_type>()},
     centroid_group_info<decltype(group_offsets)>{
-      cumulative_weights.begin(), group_offsets, inner_offsets.begin<size_type>()},
+      cumulative_weights.begin(), group_offsets, tdigest_offsets.begin<size_type>()},
     cumulative_centroid_weight<decltype(group_labels), decltype(group_offsets)>{
       cumulative_weights.begin(),
       group_labels,
       group_offsets,
-      {inner_offsets.begin<size_type>(), static_cast<size_t>(inner_offsets.size())}},
-    false,
+      {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
+    has_nulls,
     stream,
     mr);
 
@@ -1212,13 +1268,13 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
       cumulative_weights.begin(),
       group_labels,
       group_offsets,
-      {inner_offsets.begin<size_type>(), static_cast<size_t>(inner_offsets.size())}},
+      {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
     std::move(merged_min_col),
     std::move(merged_max_col),
     group_cluster_wl,
     std::move(group_cluster_offsets),
     total_clusters,
-    false,
+    has_nulls,
     stream,
     mr);
 }
@@ -1283,7 +1339,7 @@ std::unique_ptr<column> group_tdigest(column_view const& col,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
+  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); }
 
   auto const delta = max_centroids;
   return cudf::type_dispatcher(col.type(),
@@ -1309,7 +1365,15 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
   tdigest_column_view tdv(input);
 
   if (num_groups == 0 || input.size() == 0) {
-    return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr);
+    return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr);
+  }
+
+  if (tdv.means().size() == 0) {
+    // `group_merge_tdigest` takes the output of `typed_group_tdigest` as its input, which wipes
+    // out the means and weights for empty clusters. Thus, no mean here indicates that all clusters
+    // are empty in the input. Skip all of the complex computation below and just return
+    // an empty tdigest per group.
+    return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr);
   }
 
   // bring group offsets back to the host
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
index 0efb881eb3e..c0af27a1748 100644
--- a/cpp/src/utilities/cuda_memcpy.cu
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -30,7 +30,7 @@ namespace cudf::detail {
 namespace {
 
 // Simple kernel to copy between device buffers
-CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n)
+CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n)
 {
   auto const idx = cudf::detail::grid_1d::global_thread_id();
   if (idx < n) { dst[idx] = src[idx]; }
@@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea
 
 };  // namespace
 
-void cuda_memcpy_async(
+void cuda_memcpy_async_impl(
   void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
 {
   if (kind == host_memory_kind::PINNED) {
@@ -73,11 +73,4 @@ void cuda_memcpy_async(
   }
 }
 
-void cuda_memcpy(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
-{
-  cuda_memcpy_async(dst, src, size, kind, stream);
-  stream.synchronize();
-}
-
 }  // namespace cudf::detail
diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu
index baa59026b07..4ae5d06b214 100644
--- a/cpp/tests/groupby/tdigest_tests.cu
+++ b/cpp/tests/groupby/tdigest_tests.cu
@@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups)
   cudf::test::fixed_width_column_wrapper<int> keys{0, 0, 0, 0, 0, 0, 0};
   int const delta = 1000;
 
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto b = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta);
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta);
-  auto e = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto e = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   std::vector<cudf::column_view> cols;
   cols.push_back(*a);
@@ -507,3 +507,126 @@ TEST_F(TDigestMergeTest, EmptyGroups)
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]);
 }
+
+std::unique_ptr<cudf::table> do_agg(
+  cudf::column_view key,
+  cudf::column_view val,
+  std::function<std::unique_ptr<cudf::groupby_aggregation>()> make_agg)
+{
+  std::vector<cudf::column_view> keys;
+  keys.push_back(key);
+  cudf::table_view const key_table(keys);
+
+  cudf::groupby::groupby gb(key_table);
+  std::vector<cudf::groupby::aggregation_request> requests;
+  cudf::groupby::aggregation_request req;
+  req.values = val;
+  req.aggregations.push_back(make_agg());
+  requests.push_back(std::move(req));
+
+  auto result = gb.aggregate(std::move(requests));
+
+  std::vector<std::unique_ptr<cudf::column>> result_columns;
+  for (auto&& c : result.first->release()) {
+    result_columns.push_back(std::move(c));
+  }
+
+  EXPECT_EQ(result.second.size(), 1);
+  EXPECT_EQ(result.second[0].results.size(), 1);
+  result_columns.push_back(std::move(result.second[0].results[0]));
+
+  return std::make_unique<cudf::table>(std::move(result_columns));
+}
+
+TEST_F(TDigestMergeTest, AllValuesAreNull)
+{
+  // The input must be sorted by the key.
+  // See `aggregate_result_functor::operator()<aggregation::TDIGEST>` for details.
+  auto const keys      = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 1, 1, 2}};
+  auto const keys_view = cudf::column_view(keys);
+  auto val_elems  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
+  auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
+    // All values are null
+    return false;
+  });
+  auto const vals = cudf::test::fixed_width_column_wrapper<int32_t>{
+    val_elems, val_elems + keys_view.size(), val_valids};
+
+  auto const delta = 1000;
+
+  // Compute tdigest. The result should have 3 empty clusters, one per group.
+  auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() {
+    return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+  });
+
+  auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
+  cudf::column_view const expected_computed_keys_view{expected_computed_keys};
+  auto const expected_computed_vals =
+    cudf::tdigest::detail::make_empty_tdigests_column(expected_computed_keys_view.size(),
+                                                      cudf::get_default_stream(),
+                                                      rmm::mr::get_current_device_resource());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view());
+  // The input values contain nulls; the computed tdigest column is still expected to match.
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(),
+                                 compute_result->get_column(1).view());
+
+  // Merge tdigest. The result should have 3 empty clusters, one per group.
+  auto const merge_result =
+    do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
+      return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+    });
+
+  auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
+  cudf::column_view const expected_merged_keys_view{expected_merged_keys};
+  auto const expected_merged_vals =
+    cudf::tdigest::detail::make_empty_tdigests_column(expected_merged_keys_view.size(),
+                                                      cudf::get_default_stream(),
+                                                      rmm::mr::get_current_device_resource());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view());
+}
+
+TEST_F(TDigestMergeTest, AllValuesInOneGroupIsNull)
+{
+  cudf::test::fixed_width_column_wrapper<int> keys{0, 1, 2, 2, 3};
+  cudf::test::fixed_width_column_wrapper<double> vals{{10.0, 20.0, {}, {}, 30.0},
+                                                      {true, true, false, false, true}};
+
+  auto const delta = 1000;
+
+  // Compute tdigest. Only the group with key 2 is all-null, so only its tdigest should be empty.
+  auto const compute_result = do_agg(cudf::column_view(keys), cudf::column_view(vals), [&delta]() {
+    return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+  });
+
+  auto const expected_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2, 3}};
+
+  cudf::test::fixed_width_column_wrapper<double> expected_means{10, 20, 30};
+  cudf::test::fixed_width_column_wrapper<double> expected_weights{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> expected_offsets{0, 1, 2, 2, 3};
+  cudf::test::fixed_width_column_wrapper<double> expected_mins{10.0, 20.0, 0.0, 30.0};
+  cudf::test::fixed_width_column_wrapper<double> expected_maxes{10.0, 20.0, 0.0, 30.0};
+  auto const expected_values =
+    cudf::tdigest::detail::make_tdigest_column(4,
+                                               std::make_unique<cudf::column>(expected_means),
+                                               std::make_unique<cudf::column>(expected_weights),
+                                               std::make_unique<cudf::column>(expected_offsets),
+                                               std::make_unique<cudf::column>(expected_mins),
+                                               std::make_unique<cudf::column>(expected_maxes),
+                                               cudf::get_default_stream(),
+                                               rmm::mr::get_current_device_resource());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys},
+                                 compute_result->get_column(0).view());
+  // The input values contain nulls; the computed tdigest column is still expected to match.
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), compute_result->get_column(1).view());
+
+  // Merge tdigest. Merging one tdigest per group should reproduce the computed tdigests.
+  auto const merge_result =
+    do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
+      return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+    });
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys},
+                                 merge_result->get_column(0).view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), merge_result->get_column(1).view());
+}
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 915717713df..37414eb3fba 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {};
 
 TEST_F(PercentileApproxTest, EmptyInput)
 {
-  auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::test::fixed_width_column_wrapper<double> percentiles{0.0, 0.25, 0.3};
 
   std::vector<cudf::column_view> input;
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst
new file mode 100644
index 00000000000..8e86b33b1a0
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst
@@ -0,0 +1,6 @@
+=============
+find_multiple
+=============
+
+.. automodule:: pylibcudf.strings.find_multiple
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index 9b1a6b72a88..e73ea3370ec 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -9,10 +9,12 @@ strings
     contains
     extract
     find
+    find_multiple
     findall
     regex_flags
     regex_program
     repeat
     replace
     slice
+    split
     strip
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst
new file mode 100644
index 00000000000..cba96e86f45
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst
@@ -0,0 +1,6 @@
+=====
+split
+=====
+
+.. automodule:: pylibcudf.strings.split
+   :members:
diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx
index 376a6f8af97..a57ce29eb45 100644
--- a/python/cudf/cudf/_lib/strings/char_types.pyx
+++ b/python/cudf/cudf/_lib/strings/char_types.pyx
@@ -1,23 +1,12 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.char_types cimport (
-    all_characters_of_type as cpp_all_characters_of_type,
-    filter_characters_of_type as cpp_filter_characters_of_type,
-    string_character_types,
-)
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+from pylibcudf.strings import char_types
 
 
 @acquire_spill_lock()
@@ -25,26 +14,15 @@ def filter_alphanum(Column source_strings, object py_repl, bool keep=True):
     """
     Returns a Column of strings keeping only alphanumeric character types.
     """
-
-    cdef DeviceScalar repl = py_repl.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_repl = <const string_scalar*>(
-        repl.get_raw_ptr()
+    plc_column = char_types.filter_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.ALL_TYPES if keep
+        else char_types.StringCharacterTypes.ALPHANUM,
+        py_repl.device_value.c_value,
+        char_types.StringCharacterTypes.ALPHANUM if keep
+        else char_types.StringCharacterTypes.ALL_TYPES
     )
-
-    with nogil:
-        c_result = move(cpp_filter_characters_of_type(
-            source_view,
-            string_character_types.ALL_TYPES if keep
-            else string_character_types.ALPHANUM,
-            scalar_repl[0],
-            string_character_types.ALPHANUM if keep
-            else string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -54,17 +32,12 @@ def is_decimal(Column source_strings):
     that contain only decimal characters -- those that can be used
     to extract base10 numbers.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.DECIMAL,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.DECIMAL,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -75,17 +48,12 @@ def is_alnum(Column source_strings):
 
     Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal()
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.ALPHANUM,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.ALPHANUM,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -94,17 +62,12 @@ def is_alpha(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only alphabetic characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.ALPHA,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.ALPHA,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -113,17 +76,12 @@ def is_digit(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only decimal and digit characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.DIGIT,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.DIGIT,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -133,17 +91,12 @@ def is_numeric(Column source_strings):
     that contain only numeric characters. These include digit and
     numeric characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.NUMERIC,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.NUMERIC,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -152,17 +105,12 @@ def is_upper(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only upper-case characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.UPPER,
-            string_character_types.CASE_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.UPPER,
+        char_types.StringCharacterTypes.CASE_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -171,17 +119,12 @@ def is_lower(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only lower-case characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.LOWER,
-            string_character_types.CASE_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.LOWER,
+        char_types.StringCharacterTypes.CASE_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -190,14 +133,9 @@ def is_space(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contains all characters which are spaces only.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.SPACE,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.SPACE,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx
index 1358f8e3c2c..39e0013769f 100644
--- a/python/cudf/cudf/_lib/strings/find_multiple.pyx
+++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx
@@ -1,18 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.strings.find_multiple cimport (
-    find_multiple as cpp_find_multiple,
-)
-
 from cudf._lib.column cimport Column
 
+import pylibcudf as plc
+
 
 @acquire_spill_lock()
 def find_multiple(Column source_strings, Column target_strings):
@@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings):
     Returns a column with character position values where each
     of the `target_strings` are found in each string of `source_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view target_view = target_strings.view()
-
-    with nogil:
-        c_result = move(cpp_find_multiple(
-            source_view,
-            target_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_result = plc.strings.find_multiple.find_multiple(
+        source_strings.to_pylibcudf(mode="read"),
+        target_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_result)
diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx
index a81fb18e752..5319addc41c 100644
--- a/python/cudf/cudf/_lib/strings/split/partition.pyx
+++ b/python/cudf/cudf/_lib/strings/split/partition.pyx
@@ -1,21 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.split.partition cimport (
-    partition as cpp_partition,
-    rpartition as cpp_rpartition,
-)
-from pylibcudf.libcudf.table.table cimport table
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport data_from_unique_ptr
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -25,25 +14,11 @@ def partition(Column source_strings,
     Returns data by splitting the `source_strings`
     column at the first occurrence of the specified `py_delimiter`.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_partition(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.partition.partition(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -53,22 +28,8 @@ def rpartition(Column source_strings,
     Returns a Column by splitting the `source_strings`
     column at the last occurrence of the specified `py_delimiter`.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_rpartition(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.partition.rpartition(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx
index f481fea4c51..4ec6c7073d8 100644
--- a/python/cudf/cudf/_lib/strings/split/split.pyx
+++ b/python/cudf/cudf/_lib/strings/split/split.pyx
@@ -1,33 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-from pylibcudf.libcudf.strings.split.split cimport (
-    rsplit as cpp_rsplit,
-    rsplit_re as cpp_rsplit_re,
-    rsplit_record as cpp_rsplit_record,
-    rsplit_record_re as cpp_rsplit_record_re,
-    split as cpp_split,
-    split_re as cpp_split_re,
-    split_record as cpp_split_record,
-    split_record_re as cpp_split_record_re,
-)
-from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport data_from_unique_ptr
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -39,26 +18,12 @@ def split(Column source_strings,
     column around the specified `py_delimiter`.
     The split happens from beginning.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_split(
-            source_view,
-            scalar_str[0],
-            maxsplit
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.split.split(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value,
+        maxsplit,
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -70,25 +35,12 @@ def split_record(Column source_strings,
     column around the specified `py_delimiter`.
     The split happens from beginning.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_split_record(
-            source_view,
-            scalar_str[0],
-            maxsplit
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result),
+    plc_column = plc.strings.split.split.split_record(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value,
+        maxsplit,
     )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -100,26 +52,12 @@ def rsplit(Column source_strings,
     column around the specified `py_delimiter`.
     The split happens from the end.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_rsplit(
-            source_view,
-            scalar_str[0],
-            maxsplit
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.split.rsplit(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value,
+        maxsplit,
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -131,25 +69,12 @@ def rsplit_record(Column source_strings,
     column around the specified `py_delimiter`.
     The split happens from the end.
     """
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        delimiter.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_rsplit_record(
-            source_view,
-            scalar_str[0],
-            maxsplit
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result),
+    plc_column = plc.strings.split.split.rsplit_record(
+        source_strings.to_pylibcudf(mode="read"),
+        py_delimiter.device_value.c_value,
+        maxsplit,
     )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -160,24 +85,15 @@ def split_re(Column source_strings,
     Returns data by splitting the `source_strings`
     column around the delimiters identified by `pattern`.
     """
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_split_re(
-            source_view,
-            dereference(c_prog),
-            maxsplit
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.split.split_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT,
+        ),
+        maxsplit,
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -189,24 +105,15 @@ def rsplit_re(Column source_strings,
     column around the delimiters identified by `pattern`.
     The delimiters are searched starting from the end of each string.
     """
-    cdef unique_ptr[table] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_rsplit_re(
-            source_view,
-            dereference(c_prog),
-            maxsplit
-        ))
-
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
+    plc_table = plc.strings.split.split.rsplit_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT,
+        ),
+        maxsplit,
     )
+    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
 
 
 @acquire_spill_lock()
@@ -217,23 +124,15 @@ def split_record_re(Column source_strings,
     Returns a Column by splitting the `source_strings`
     column around the delimiters identified by `pattern`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_split_record_re(
-            source_view,
-            dereference(c_prog),
-            maxsplit
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result),
+    plc_column = plc.strings.split.split.split_record_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT,
+        ),
+        maxsplit,
     )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings,
     column around the delimiters identified by `pattern`.
     The delimiters are searched starting from the end of each string.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_rsplit_record_re(
-            source_view,
-            dereference(c_prog),
-            maxsplit
-        ))
-
-    return Column.from_unique_ptr(
-        move(c_result),
+    plc_column = plc.strings.split.split.rsplit_record_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT,
+        ),
+        maxsplit,
     )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx
index 3fad91bbfc0..3ef478532c2 100644
--- a/python/cudf/cudf/_lib/strings/translate.pyx
+++ b/python/cudf/cudf/_lib/strings/translate.pyx
@@ -1,25 +1,12 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.pair cimport pair
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.translate cimport (
-    filter_characters as cpp_filter_characters,
-    filter_type,
-    translate as cpp_translate,
-)
-from pylibcudf.libcudf.types cimport char_utf8
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -29,30 +16,11 @@ def translate(Column source_strings,
     Translates individual characters within each string
     if present in the mapping_table.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef int table_size
-    table_size = len(mapping_table)
-
-    cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
-    c_mapping_table.reserve(table_size)
-
-    for key in mapping_table:
-        value = mapping_table[key]
-        if type(value) is int:
-            value = chr(value)
-        if type(value) is str:
-            value = int.from_bytes(value.encode(), byteorder='big')
-        if type(key) is int:
-            key = chr(key)
-        if type(key) is str:
-            key = int.from_bytes(key.encode(), byteorder='big')
-        c_mapping_table.push_back((key, value))
-
-    with nogil:
-        c_result = move(cpp_translate(source_view, c_mapping_table))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_result = plc.strings.translate.translate(
+        source_strings.to_pylibcudf(mode="read"),
+        mapping_table,
+    )
+    return Column.from_pylibcudf(plc_result)
 
 
 @acquire_spill_lock()
@@ -64,44 +32,11 @@ def filter_characters(Column source_strings,
     Removes or keeps individual characters within each string
     using the provided mapping_table.
     """
-
-    cdef DeviceScalar repl = py_repl.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_repl = <const string_scalar*>(
-        repl.get_raw_ptr()
+    plc_result = plc.strings.translate.filter_characters(
+        source_strings.to_pylibcudf(mode="read"),
+        mapping_table,
+        plc.strings.translate.FilterType.KEEP
+        if keep else plc.strings.translate.FilterType.REMOVE,
+        py_repl.device_value.c_value
     )
-    cdef int table_size
-    table_size = len(mapping_table)
-
-    cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
-    c_mapping_table.reserve(table_size)
-
-    for key in mapping_table:
-        value = mapping_table[key]
-        if type(value) is int:
-            value = chr(value)
-        if type(value) is str:
-            value = int.from_bytes(value.encode(), byteorder='big')
-        if type(key) is int:
-            key = chr(key)
-        if type(key) is str:
-            key = int.from_bytes(key.encode(), byteorder='big')
-        c_mapping_table.push_back((key, value))
-
-    cdef filter_type c_keep
-    if keep is True:
-        c_keep = filter_type.KEEP
-    else:
-        c_keep = filter_type.REMOVE
-
-    with nogil:
-        c_result = move(cpp_filter_characters(
-            source_view,
-            c_mapping_table,
-            c_keep,
-            scalar_repl[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_result)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 4463e3280df..da422db5eae 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -2546,9 +2546,9 @@ def split(
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data, _ = libstrings.split_re(self._column, pat, n)
+                    data = libstrings.split_re(self._column, pat, n)
                 else:
-                    data, _ = libstrings.split(
+                    data = libstrings.split(
                         self._column, cudf.Scalar(pat, "str"), n
                     )
                 if len(data) == 1 and data[0].null_count == len(self._column):
@@ -2719,9 +2719,9 @@ def rsplit(
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data, _ = libstrings.rsplit_re(self._column, pat, n)
+                    data = libstrings.rsplit_re(self._column, pat, n)
                 else:
-                    data, _ = libstrings.rsplit(
+                    data = libstrings.rsplit(
                         self._column, cudf.Scalar(pat, "str"), n
                     )
                 if len(data) == 1 and data[0].null_count == len(self._column):
@@ -2820,7 +2820,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0],
+            libstrings.partition(self._column, cudf.Scalar(sep, "str")),
             expand=expand,
         )
 
@@ -2885,7 +2885,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0],
+            libstrings.rpartition(self._column, cudf.Scalar(sep, "str")),
             expand=expand,
         )
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
index abf4357f862..b8b4343173e 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx)
+set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx)
 
 set(linked_libraries cudf::cudf)
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
index 5d54c1c3593..76afe047e8c 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
@@ -22,9 +22,6 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \
         CASE_TYPES
         ALL_TYPES
 
-cdef extern from "cudf/strings/char_types/char_types.hpp" \
-        namespace "cudf::strings" nogil:
-
     cdef unique_ptr[column] all_characters_of_type(
         column_view source_strings,
         string_character_types types,
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
index 0491644a10a..3d048c1f50b 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
@@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \
         nogil:
 
     cdef unique_ptr[column] find_multiple(
-        column_view source_strings,
+        column_view input,
         column_view targets) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
index 4162e886a7d..4299cf62e99 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
@@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \
         "cudf::strings" nogil:
 
     cdef unique_ptr[table] partition(
-        column_view source_strings,
+        column_view input,
         string_scalar delimiter) except +
 
     cdef unique_ptr[table] rpartition(
-        column_view source_strings,
+        column_view input,
         string_scalar delimiter) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
index 3046149aebb..a22a79fc7d7 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
@@ -14,22 +14,22 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \
         "cudf::strings" nogil:
 
     cdef unique_ptr[table] split(
-        column_view source_strings,
+        column_view strings_column,
         string_scalar delimiter,
         size_type maxsplit) except +
 
     cdef unique_ptr[table] rsplit(
-        column_view source_strings,
+        column_view strings_column,
         string_scalar delimiter,
         size_type maxsplit) except +
 
     cdef unique_ptr[column] split_record(
-        column_view source_strings,
+        column_view strings,
         string_scalar delimiter,
         size_type maxsplit) except +
 
     cdef unique_ptr[column] rsplit_record(
-        column_view source_strings,
+        column_view strings,
         string_scalar delimiter,
         size_type maxsplit) except +
 
@@ -38,21 +38,21 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \
         "cudf::strings" nogil:
 
     cdef unique_ptr[table] split_re(
-        const column_view& source_strings,
-        regex_program,
+        const column_view& input,
+        regex_program prog,
         size_type maxsplit) except +
 
     cdef unique_ptr[table] rsplit_re(
-        const column_view& source_strings,
-        regex_program,
+        const column_view& input,
+        regex_program prog,
         size_type maxsplit) except +
 
     cdef unique_ptr[column] split_record_re(
-        const column_view& source_strings,
-        regex_program,
+        const column_view& input,
+        regex_program prog,
         size_type maxsplit) except +
 
     cdef unique_ptr[column] rsplit_record_re(
-        const column_view& source_strings,
-        regex_program,
+        const column_view& input,
+        regex_program prog,
         size_type maxsplit) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
index 85fa719128a..9fd24f2987b 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
@@ -13,15 +13,15 @@ from pylibcudf.libcudf.types cimport char_utf8
 cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil:
 
     cdef unique_ptr[column] translate(
-        column_view source_strings,
+        column_view input,
         vector[pair[char_utf8, char_utf8]] chars_table) except +
 
-    ctypedef enum filter_type:
-        KEEP 'cudf::strings::filter_type::KEEP',
-        REMOVE 'cudf::strings::filter_type::REMOVE'
+    cpdef enum class filter_type(bool):
+        KEEP
+        REMOVE
 
     cdef unique_ptr[column] filter_characters(
-        column_view source_strings,
-        vector[pair[char_utf8, char_utf8]] chars_table,
-        filter_type keep,
+        column_view input,
+        vector[pair[char_utf8, char_utf8]] characters_to_filter,
+        filter_type keep_characters,
         string_scalar replacement) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index 142bc124ca2..d92f806efbe 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -20,6 +20,7 @@ set(cython_sources
     contains.pyx
     extract.pyx
     find.pyx
+    find_multiple.pyx
     findall.pyx
     regex_flags.pyx
     regex_program.pyx
@@ -28,6 +29,7 @@ set(cython_sources
     side_type.pyx
     slice.pyx
     strip.pyx
+    translate.pyx
 )
 
 set(linked_libraries cudf::cudf)
@@ -38,3 +40,4 @@ rapids_cython_create_modules(
 )
 
 add_subdirectory(convert)
+add_subdirectory(split)
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index d8afccc7336..788e2c99ab1 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -9,12 +9,15 @@ from . cimport (
     convert,
     extract,
     find,
+    find_multiple,
     findall,
     regex_flags,
     regex_program,
     replace,
     slice,
+    split,
     strip,
+    translate,
 )
 from .side_type cimport side_type
 
@@ -33,5 +36,7 @@ __all__ = [
     "replace",
     "slice",
     "strip",
+    "split",
     "side_type",
+    "translate",
 ]
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index 22452812e42..bcaeb073d0b 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -9,13 +9,16 @@
     convert,
     extract,
     find,
+    find_multiple,
     findall,
     regex_flags,
     regex_program,
     repeat,
     replace,
     slice,
+    split,
     strip,
+    translate,
 )
 from .side_type import SideType
 
@@ -34,5 +37,7 @@
     "replace",
     "slice",
     "strip",
+    "split",
     "SideType",
+    "translate",
 ]
diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/strings/char_types.pxd
index ad4e4cf61d8..f9f7d244212 100644
--- a/python/pylibcudf/pylibcudf/strings/char_types.pxd
+++ b/python/pylibcudf/pylibcudf/strings/char_types.pxd
@@ -1,3 +1,19 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from pylibcudf.column cimport Column
 from pylibcudf.libcudf.strings.char_types cimport string_character_types
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column all_characters_of_type(
+    Column source_strings,
+    string_character_types types,
+    string_character_types verify_types
+)
+
+cpdef Column filter_characters_of_type(
+    Column source_strings,
+    string_character_types types_to_remove,
+    Scalar replacement,
+    string_character_types types_to_keep
+)
diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx
index e7621fb4d84..6a24d79bc4b 100644
--- a/python/pylibcudf/pylibcudf/strings/char_types.pyx
+++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx
@@ -1,4 +1,93 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings cimport char_types as cpp_char_types
+from pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
 from pylibcudf.libcudf.strings.char_types import \
     string_character_types as StringCharacterTypes  # no-cython-lint
+
+
+cpdef Column all_characters_of_type(
+    Column source_strings,
+    string_character_types types,
+    string_character_types verify_types
+):
+    """
+    Flag each string whose characters all belong to the given types.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings to inspect.
+    types : StringCharacterTypes
+        Character types every character of a string is checked against.
+    verify_types : StringCharacterTypes
+        Restricts the check to characters of these types.
+
+    Returns
+    -------
+    Column
+        Boolean column holding one result per input string.
+    """
+    cdef unique_ptr[column] c_flags
+
+    # The GIL is released for the duration of the libcudf call.
+    with nogil:
+        c_flags = move(
+            cpp_char_types.all_characters_of_type(
+                source_strings.view(), types, verify_types
+            )
+        )
+
+    # Ownership of the libcudf column passes to the returned Column.
+    return Column.from_libcudf(move(c_flags))
+
+cpdef Column filter_characters_of_type(
+    Column source_strings,
+    string_character_types types_to_remove,
+    Scalar replacement,
+    string_character_types types_to_keep
+):
+    """
+    Remove characters of the given types from each string in a column.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings to filter.
+    types_to_remove : StringCharacterTypes
+        Character types looked for in each string.
+    replacement : Scalar
+        String scalar substituted for each removed character.
+    types_to_keep : StringCharacterTypes
+        Passing `ALL_TYPES` means all characters of `types_to_remove`
+        will be filtered.
+
+    Returns
+    -------
+    Column
+        New column in which the selected characters are filtered out
+        and replaced with the replacement string.
+    """
+    cdef unique_ptr[column] c_filtered
+    cdef const string_scalar* repl_ptr = <const string_scalar*>(
+        replacement.c_obj.get()
+    )
+
+    # The GIL is released for the duration of the libcudf call.
+    with nogil:
+        c_filtered = move(
+            cpp_char_types.filter_characters_of_type(
+                source_strings.view(),
+                types_to_remove,
+                dereference(repl_ptr),
+                types_to_keep,
+            )
+        )
+    return Column.from_libcudf(move(c_filtered))
diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd
new file mode 100644
index 00000000000..b7b3aefa336
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd
@@ -0,0 +1,6 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+
+
+cpdef Column find_multiple(Column input, Column targets)
diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx
new file mode 100644
index 00000000000..413fc1cb79d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx
@@ -0,0 +1,39 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple
+
+
+cpdef Column find_multiple(Column input, Column targets):
+    """
+    Build a lists column of the character positions at which each
+    target string is found in each input string.
+
+    For details, see :cpp:func:`cudf::strings::find_multiple`.
+
+    Parameters
+    ----------
+    input : Column
+        Column of strings to search.
+    targets : Column
+        Strings to look for in each input string.
+
+    Returns
+    -------
+    Column
+        Lists column with character position values.
+    """
+    cdef unique_ptr[column] c_positions
+
+    # The GIL is released for the duration of the libcudf call.
+    with nogil:
+        c_positions = move(
+            cpp_find_multiple.find_multiple(
+                input.view(), targets.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_positions))
diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt
new file mode 100644
index 00000000000..8f544f6f537
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt
@@ -0,0 +1,22 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources partition.pyx split.pyx)
+
+set(linked_libraries cudf::cudf)
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf
+)
diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.pxd b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd
new file mode 100644
index 00000000000..72086e57d9f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd
@@ -0,0 +1,2 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from . cimport partition, split
diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py
new file mode 100644
index 00000000000..2033e5e275b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from . import partition, split
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd
new file mode 100644
index 00000000000..c18257a4787
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.table cimport Table
+
+
+cpdef Table partition(Column input, Scalar delimiter=*)
+
+cpdef Table rpartition(Column input, Scalar delimiter=*)
diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx
new file mode 100644
index 00000000000..ecc959e65b0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx
@@ -0,0 +1,95 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings.split cimport partition as cpp_partition
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.table cimport Table
+
+from cython.operator import dereference
+
+
+cpdef Table partition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter.
+
+    For details, see :cpp:func:`cudf::strings::partition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    delimiter : Scalar, optional
+        UTF-8 encoded string indicating where to split each string.
+        When None, an empty string scalar is used.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+    # Read the pointer only after the None default has been replaced.
+    c_delimiter = <const string_scalar*>(delimiter.c_obj.get())
+
+    with nogil:
+        c_result = move(
+            cpp_partition.partition(
+                input.view(),
+                dereference(c_delimiter)
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Table rpartition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rpartition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    delimiter : Scalar, optional
+        UTF-8 encoded string indicating where to split each string.
+        When None, an empty string scalar is used.
+
+    Returns
+    -------
+    Table
+       New strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+    # Read the pointer only after the None default has been replaced.
+    c_delimiter = <const string_scalar*>(delimiter.c_obj.get())
+
+    with nogil:
+        c_result = move(
+            cpp_partition.rpartition(
+                input.view(),
+                dereference(c_delimiter)
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd
new file mode 100644
index 00000000000..355a1874298
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd
@@ -0,0 +1,24 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit)
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx
new file mode 100644
index 00000000000..a7d7f39fc47
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx
@@ -0,0 +1,326 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.split cimport split as cpp_split
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+from cython.operator import dereference
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Split each string on the given delimiter, producing a table of
+    strings columns.
+
+    For details, see :cpp:func:`cudf::strings::split`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Column of strings to split.
+
+    delimiter : Scalar
+        UTF-8 encoded string marking the split points in each string.
+
+    maxsplit : int
+        Upper bound on the number of splits per string; -1 performs
+        every possible split.
+
+    Returns
+    -------
+    Table
+        New table of strings columns.
+    """
+    cdef const string_scalar* delim_ptr = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+    cdef unique_ptr[table] c_table
+
+    # The GIL is released while libcudf performs the split.
+    with nogil:
+        c_table = move(
+            cpp_split.split(
+                strings_column.view(), dereference(delim_ptr), maxsplit
+            )
+        )
+
+    # Ownership of the libcudf table passes to the returned Table.
+    return Table.from_libcudf(move(c_table))
+
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Split each string on the given delimiter, working backwards from
+    the end of the string, producing a table of strings columns.
+
+    For details, see :cpp:func:`cudf::strings::rsplit`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Column of strings to split.
+
+    delimiter : Scalar
+        UTF-8 encoded string marking the split points in each string.
+
+    maxsplit : int
+        Upper bound on the number of splits per string; -1 performs
+        every possible split.
+
+    Returns
+    -------
+    Table
+        New table of strings columns.
+    """
+    cdef const string_scalar* delim_ptr = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+    cdef unique_ptr[table] c_table
+
+    # The GIL is released while libcudf performs the split.
+    with nogil:
+        c_table = move(
+            cpp_split.rsplit(
+                strings_column.view(), dereference(delim_ptr), maxsplit
+            )
+        )
+
+    # Ownership of the libcudf table passes to the returned Table.
+    return Table.from_libcudf(move(c_table))
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Split each string element into a list of strings.
+
+    For details, see :cpp:func:`cudf::strings::split_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        Column of string elements to split.
+
+    delimiter : Scalar
+        String that marks the split points in each string.
+
+    maxsplit : int
+        Upper bound on the number of splits per string; -1 performs
+        every possible split.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef const string_scalar* delim_ptr = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+    cdef unique_ptr[column] c_lists
+
+    # The GIL is released while libcudf performs the split.
+    with nogil:
+        c_lists = move(
+            cpp_split.split_record(
+                strings.view(), dereference(delim_ptr), maxsplit
+            )
+        )
+
+    # Ownership of the libcudf column passes to the returned Column.
+    return Column.from_libcudf(move(c_lists))
+
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Split each string element into a list of strings, working
+    backwards from the end of the string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        Column of string elements to split.
+
+    delimiter : Scalar
+        String that marks the split points in each string.
+
+    maxsplit : int
+        Upper bound on the number of splits per string; -1 performs
+        every possible split.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef const string_scalar* delim_ptr = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+    cdef unique_ptr[column] c_lists
+
+    # The GIL is released while libcudf performs the split.
+    with nogil:
+        c_lists = move(
+            cpp_split.rsplit_record(
+                strings.view(), dereference(delim_ptr), maxsplit
+            )
+        )
+
+    # Ownership of the libcudf column passes to the returned Column.
+    return Column.from_libcudf(move(c_lists))
+
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Split each string element into a table of strings columns, using
+    the pattern of a regex_program as the delimiter.
+
+    For details, see :cpp:func:`cudf::strings::split_re`.
+
+    Parameters
+    ----------
+    input : Column
+        Column of string elements to split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Upper bound on the number of splits per string; -1 performs
+        every possible split.
+
+    Returns
+    -------
+    Table
+        A table of columns of strings.
+    """
+    cdef unique_ptr[table] c_table
+
+    # The GIL is released while libcudf performs the split.
+    with nogil:
+        c_table = move(
+            cpp_split.split_re(
+                input.view(), dereference(prog.c_obj.get()), maxsplit
+            )
+        )
+
+    # Ownership of the libcudf table passes to the returned Table.
+    return Table.from_libcudf(move(c_table))
+
+cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Split each string element into a table of strings columns, using
+    the pattern of a regex_program as the delimiter and working
+    backwards from the end of the string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_re`.
+
+    Parameters
+    ----------
+    input : Column
+        Column of string elements to split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Upper bound on the number of splits per string; -1 performs
+        every possible split.
+
+    Returns
+    -------
+    Table
+        A table of columns of strings.
+    """
+    cdef unique_ptr[table] c_table
+
+    # The GIL is released while libcudf performs the split.
+    with nogil:
+        c_table = move(
+            cpp_split.rsplit_re(
+                input.view(), dereference(prog.c_obj.get()), maxsplit
+            )
+        )
+
+    # Ownership of the libcudf table passes to the returned Table.
+    return Table.from_libcudf(move(c_table))
+
+cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Split each string element into a list of strings, using the
+    pattern of a regex_program as the delimiter.
+
+    For details, see :cpp:func:`cudf::strings::split_record_re`.
+
+    Parameters
+    ----------
+    input : Column
+        Column of string elements to split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Upper bound on the number of splits per string; -1 performs
+        every possible split.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_lists
+
+    # The GIL is released while libcudf performs the split.
+    with nogil:
+        c_lists = move(
+            cpp_split.split_record_re(
+                input.view(), dereference(prog.c_obj.get()), maxsplit
+            )
+        )
+
+    # Ownership of the libcudf column passes to the returned Column.
+    return Column.from_libcudf(move(c_lists))
+
+cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Split each string element into a list of strings, using a
+    regex_program pattern and working backwards from the string end.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record_re`.
+
+    Parameters
+    ----------
+    input : Column
+        Column of string elements to split.
+
+    prog : RegexProgram
+        Regex program instance.
+
+    maxsplit : int
+        Upper bound on the number of splits per string; -1 performs
+        every possible split.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_lists
+
+    # The GIL is released while libcudf performs the split.
+    with nogil:
+        c_lists = move(
+            cpp_split.rsplit_record_re(
+                input.view(), dereference(prog.c_obj.get()), maxsplit
+            )
+        )
+
+    # Ownership of the libcudf column passes to the returned Column.
+    return Column.from_libcudf(move(c_lists))
diff --git a/python/pylibcudf/pylibcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/strings/translate.pxd
new file mode 100644
index 00000000000..0ca746801d7
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/translate.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.strings.translate cimport filter_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column translate(Column input, dict chars_table)
+
+cpdef Column filter_characters(
+    Column input,
+    dict characters_to_filter,
+    filter_type keep_characters,
+    Scalar replacement
+)
diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx
new file mode 100644
index 00000000000..a62c7ec4528
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/translate.pyx
@@ -0,0 +1,122 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.pair cimport pair
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings cimport translate as cpp_translate
+from pylibcudf.libcudf.types cimport char_utf8
+from pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+from pylibcudf.libcudf.strings.translate import \
+    filter_type as FilterType  # no-cython-lint
+
+
+cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table):
+    """
+    Build the libcudf character-pair vector from a str.maketrans table.
+    """
+    cdef vector[pair[char_utf8, char_utf8]] c_pairs
+
+    c_pairs.reserve(len(table))
+    for src, dst in table.items():
+        # Code points become characters, and characters become the
+        # big-endian integer formed from their UTF-8 bytes.
+        if isinstance(dst, int):
+            dst = chr(dst)
+        if isinstance(dst, str):
+            dst = int.from_bytes(dst.encode(), byteorder='big')
+        if isinstance(src, int):
+            src = chr(src)
+        if isinstance(src, str):
+            src = int.from_bytes(src.encode(), byteorder='big')
+        c_pairs.push_back((src, dst))
+    return c_pairs
+
+
+cpdef Column translate(Column input, dict chars_table):
+    """
+    Translates individual characters within each string.
+
+    For details, see :cpp:func:`cudf::strings::translate`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    chars_table : dict
+        Table of UTF-8 character mappings
+
+    Returns
+    -------
+    Column
+        New column with translated strings.
+    """
+    # Convert the Python mapping while the GIL is still held.
+    cdef vector[pair[char_utf8, char_utf8]] c_mapping = _table_to_c_table(
+        chars_table
+    )
+    cdef unique_ptr[column] c_translated
+
+    # The GIL is released for the duration of the libcudf call.
+    with nogil:
+        c_translated = move(
+            cpp_translate.translate(input.view(), c_mapping)
+        )
+
+    return Column.from_libcudf(move(c_translated))
+
+
+cpdef Column filter_characters(
+    Column input,
+    dict characters_to_filter,
+    filter_type keep_characters,
+    Scalar replacement
+):
+    """
+    Removes ranges of characters from each string in a strings column.
+
+    For details, see :cpp:func:`cudf::strings::filter_characters`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    characters_to_filter : dict
+        Character ranges to filter on, as {first: last} entries.
+
+    keep_characters : FilterType
+        `KEEP` retains only the `characters_to_filter`; `REMOVE`
+        removes them and retains all other characters.
+
+    replacement : Scalar
+        Replacement string for each character removed.
+
+    Returns
+    -------
+    Column
+        New column with filtered strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef vector[pair[char_utf8, char_utf8]] c_characters_to_filter = _table_to_c_table(
+        characters_to_filter
+    )
+    cdef const string_scalar* c_replacement = <const string_scalar*>(
+        replacement.c_obj.get()
+    )
+    # The GIL is released for the duration of the libcudf call.
+    with nogil:
+        c_result = move(
+            cpp_translate.filter_characters(
+                input.view(),
+                c_characters_to_filter,
+                keep_characters,
+                dereference(c_replacement),
+            )
+        )
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
new file mode 100644
index 00000000000..bcd030c019e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_all_characters_of_type():
+    char_types = plc.strings.char_types
+    strings = pa.array(["1", "A"])
+    got = char_types.all_characters_of_type(
+        plc.interop.from_arrow(strings),
+        char_types.StringCharacterTypes.ALPHA,
+        char_types.StringCharacterTypes.ALL_TYPES,
+    )
+    assert_column_eq(got, pc.utf8_is_alpha(strings))
+
+
+def test_filter_characters_of_type():
+    char_types = plc.strings.char_types
+    strings = pa.array(["=A="])
+    got = char_types.filter_characters_of_type(
+        plc.interop.from_arrow(strings),
+        char_types.StringCharacterTypes.ALPHANUM,
+        plc.interop.from_arrow(pa.scalar(" ")),
+        char_types.StringCharacterTypes.ALL_TYPES,
+    )
+    assert_column_eq(got, pc.replace_substring(strings, "A", " "))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
new file mode 100644
index 00000000000..d6b37a388f0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_find_multiple():
+    haystacks = ["abc", "def"]
+    needles = ["a", "c", "e"]
+    got = plc.strings.find_multiple.find_multiple(
+        plc.interop.from_arrow(pa.array(haystacks)),
+        plc.interop.from_arrow(pa.array(needles)),
+    )
+    # Reference: Python's str.find per (string, target) pair; it
+    # returns -1 when the target is absent.
+    positions = [
+        [text.find(needle) for needle in needles]
+        for text in haystacks
+    ]
+    want = pa.array(positions, type=pa.list_(pa.int32()))
+    assert_column_eq(want, got)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
new file mode 100644
index 00000000000..80cae8d1c6b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_table_eq
+
+
+@pytest.fixture
+def data_col():
+    # Two delimited strings plus a null row, as (pyarrow, pylibcudf).
+    strings = pa.array(["ab_cd", "def_g_h", None])
+    return strings, plc.interop.from_arrow(strings)
+
+
+def test_partition(data_col):
+    _, plc_strings = data_col
+    underscore = plc.interop.from_arrow(pa.scalar("_"))
+    got = plc.strings.split.partition.partition(plc_strings, underscore)
+    # Columns: text before the first "_", the "_" itself, the rest.
+    want = pa.table(
+        {
+            "a": ["ab", "def", None],
+            "b": ["_", "_", None],
+            "c": ["cd", "g_h", None],
+        }
+    )
+    assert_table_eq(want, got)
+
+
+def test_rpartition(data_col):
+    _, plc_strings = data_col
+    underscore = plc.interop.from_arrow(pa.scalar("_"))
+    got = plc.strings.split.partition.rpartition(plc_strings, underscore)
+    # Columns: text before the last "_", the "_" itself, the rest.
+    want = pa.table(
+        {
+            "a": ["ab", "def_g", None],
+            "b": ["_", "_", None],
+            "c": ["cd", "h", None],
+        }
+    )
+    assert_table_eq(want, got)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
new file mode 100644
index 00000000000..2aeffac8209
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+
+@pytest.fixture
+def data_col():
+    # Rows delimited by "_", by "-", and a null, as (pyarrow, pylibcudf).
+    strings = pa.array(["a_b_c", "d-e-f", None])
+    return strings, plc.interop.from_arrow(strings)
+
+
+@pytest.fixture
+def delimiter():
+    # The same "_" delimiter as a Python str and as a pylibcudf scalar.
+    text = "_"
+    return text, plc.interop.from_arrow(pa.scalar(text))
+
+
+@pytest.fixture
+def re_delimiter():
+    return "[_-]"  # character class matching either "_" or "-"
+
+
+def test_split(data_col, delimiter):
+    plc_strings = data_col[1]
+    plc_delim = delimiter[1]
+    got = plc.strings.split.split.split(plc_strings, plc_delim, 1)
+    want = pa.table(
+        {
+            "a": ["a", "d-e-f", None],
+            "b": ["b_c", None, None],
+        }
+    )
+    assert_table_eq(want, got)
+
+
+def test_rsplit(data_col, delimiter):
+    plc_strings = data_col[1]
+    plc_delim = delimiter[1]
+    got = plc.strings.split.split.rsplit(plc_strings, plc_delim, 1)
+    want = pa.table(
+        {
+            "a": ["a_b", "d-e-f", None],
+            "b": ["c", None, None],
+        }
+    )
+    assert_table_eq(want, got)
+
+
+def test_split_record(data_col, delimiter):
+    (strings, plc_strings), (sep, plc_sep) = data_col, delimiter
+    got = plc.strings.split.split.split_record(plc_strings, plc_sep, 1)
+    # pyarrow's split_pattern with the same limit is the reference.
+    want = pc.split_pattern(strings, sep, max_splits=1)
+    assert_column_eq(want, got)
+
+
+def test_rsplit_record(data_col, delimiter):
+    (pa_array, plc_column), (delim, plc_delim) = data_col, delimiter
+    # Exercise the reverse variant; pyarrow mirrors it with reverse=True.
+    result = plc.strings.split.split.rsplit_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1, reverse=True)
+    assert_column_eq(expected, result)
+
+
+def test_split_re(data_col, re_delimiter):
+    plc_strings = data_col[1]
+    # Compile the delimiter pattern with the default regex flags.
+    prog = plc.strings.regex_program.RegexProgram.create(
+        re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+    )
+    # A single split, scanning from the start of each string.
+    got = plc.strings.split.split.split_re(plc_strings, prog, 1)
+    # The null row stays null in every output column.
+    want = pa.table(
+        {
+            "a": ["a", "d", None],
+            "b": ["b_c", "e-f", None],
+        }
+    )
+    assert_table_eq(want, got)
+
+
+def test_rsplit_re(data_col, re_delimiter):
+    plc_strings = data_col[1]
+    # Compile the delimiter pattern with the default regex flags.
+    prog = plc.strings.regex_program.RegexProgram.create(
+        re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+    )
+    # A single split, scanning from the end of each string.
+    got = plc.strings.split.split.rsplit_re(plc_strings, prog, 1)
+    # The null row stays null in every output column.
+    want = pa.table(
+        {
+            "a": ["a_b", "d-e", None],
+            "b": ["c", "f", None],
+        }
+    )
+    assert_table_eq(want, got)
+
+
+def test_split_record_re(data_col, re_delimiter):
+    strings, plc_strings = data_col
+    # Compile the delimiter pattern with the default regex flags.
+    prog = plc.strings.regex_program.RegexProgram.create(
+        re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+    )
+    # Limit the operation to a single split per string.
+    got = plc.strings.split.split.split_record_re(plc_strings, prog, 1)
+    # pyarrow's regex split with the same limit is the reference.
+    want = pc.split_pattern_regex(strings, re_delimiter, max_splits=1)
+    assert_column_eq(want, got)
+
+
+def test_rsplit_record_re(data_col, re_delimiter):
+    strings, plc_strings = data_col
+    # Compile the delimiter pattern with the default regex flags.
+    prog = plc.strings.regex_program.RegexProgram.create(
+        re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+    )
+    # -1 requests every possible split on each string.
+    got = plc.strings.split.split.rsplit_record_re(plc_strings, prog, -1)
+    # pyarrow's regex split without a limit is the reference.
+    want = pc.split_pattern_regex(strings, re_delimiter)
+    assert_column_eq(want, got)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
new file mode 100644
index 00000000000..2ae893e69fb
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture
+def data_col():
+    """Return the same five strings (one null) as (pyarrow, pylibcudf)."""
+    values = ["aa", "bbb", "cccc", "abcd", None]
+    arrow_strings = pa.array(values, type=pa.string())
+    plc_strings = plc.interop.from_arrow(arrow_strings)
+    return arrow_strings, plc_strings
+
+
+@pytest.fixture
+def trans_table():  # code-point table: "a"->"A", "b"->" ", "d"->"Q"
+    return {ord(src): ord(dst) for src, dst in zip("abd", "A Q")}
+
+
+def test_translate(data_col, trans_table):
+    """translate() must match str.translate row by row; non-str -> None."""
+    pa_strings, strings_col = data_col
+    result = plc.strings.translate.translate(strings_col, trans_table)
+    # Reference column built with Python's own str.translate.
+    py_rows = []
+    for row in pa_strings.to_pylist():
+        is_text = isinstance(row, str)
+        py_rows.append(row.translate(trans_table) if is_text else None)
+    assert_column_eq(pa.array(py_rows), result)
+
+
+@pytest.mark.parametrize(
+    "keep",
+    [
+        plc.strings.translate.FilterType.KEEP,
+        plc.strings.translate.FilterType.REMOVE,
+    ],
+)
+def test_filter_characters(data_col, trans_table, keep):
+    """Check filter_characters against a pure-Python, range-based oracle.
+
+    cuDF documents the filter table as inclusive code-point ranges (key =
+    minimum, value = maximum), so the oracle tests ``ord(ch)`` against each
+    range; a ``str in set-of-ints`` membership test is always False.
+
+    NOTE(review): every range in the ``trans_table`` fixture is inverted
+    (e.g. 97 > 65), so no character is ever in range in this test.
+    """
+    pa_array, plc_col = data_col
+    result = plc.strings.translate.filter_characters(
+        plc_col, trans_table, keep, plc.interop.from_arrow(pa.scalar("*"))
+    )
+    is_keep = keep == plc.strings.translate.FilterType.KEEP
+    is_remove = keep == plc.strings.translate.FilterType.REMOVE
+
+    def survives(ch):
+        hit = any(lo <= ord(ch) <= hi for lo, hi in trans_table.items())
+        return (hit and is_keep) or (not hit and is_remove)
+
+    exp_data = [
+        "".join(ch if survives(ch) else "*" for ch in val)
+        if isinstance(val, str)
+        else val
+        for val in pa_array.to_pylist()
+    ]
+    assert_column_eq(pa.array(exp_data), result)