diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp
index a7db972d39f..cd4d3ca964b 100644
--- a/cpp/benchmarks/string/case.cpp
+++ b/cpp/benchmarks/string/case.cpp
@@ -75,5 +75,5 @@ void bench_case(nvbench::state& state)
 NVBENCH_BENCH(bench_case)
   .set_name("case")
   .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
   .add_string_axis("encoding", {"ascii", "utf8"});
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 77c014301ba..c1688d20791 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -34,6 +34,7 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cub/cub.cuh>
 #include <cuda/atomic>
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -237,13 +238,16 @@ CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter,
   auto const d_str   = d_strings.element<string_view>(str_idx);
   auto const str_ptr = d_str.data();
 
+  // each thread processes 4 bytes
   size_type size = 0;
-  for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) {
-    auto const chr = str_ptr[i];
-    if (is_utf8_continuation_char(chr)) { continue; }
-    char_utf8 u8 = 0;
-    to_char_utf8(str_ptr + i, u8);
-    size += converter.process_character(u8);
+  for (auto i = lane_idx * 4; i < d_str.size_bytes(); i += cudf::detail::warp_size * 4) {
+    for (auto j = i; (j < (i + 4)) && (j < d_str.size_bytes()); j++) {
+      auto const chr = str_ptr[j];
+      if (is_utf8_continuation_char(chr)) { continue; }
+      char_utf8 u8 = 0;
+      to_char_utf8(str_ptr + j, u8);
+      size += converter.process_character(u8);
+    }
   }
   // this is slightly faster than using the cub::warp_reduce
   if (size > 0) {
@@ -260,6 +264,41 @@ struct ascii_converter_fn {
   __device__ char operator()(char chr) { return converter.process_ascii(chr); }
 };
 
+constexpr int64_t block_size       = 512;
+constexpr int64_t bytes_per_thread = 8;
+
+/**
+ * @brief Checks the chars data for any multibyte characters
+ *
+ * The output count is not accurate but it is only checked for > 0.
+ */
+CUDF_KERNEL void has_multibytes_kernel(char const* d_input_chars,
+                                       int64_t first_offset,
+                                       int64_t last_offset,
+                                       int64_t* d_output)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  // read only every 2nd byte; all bytes in a multibyte char have high bit set
+  auto const byte_idx = (static_cast<int64_t>(idx) * bytes_per_thread) + first_offset;
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  using block_reduce = cub::BlockReduce<int64_t, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  // each thread processes 8 bytes (only 4 need to be checked)
+  int64_t mb_count = 0;
+  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < last_offset); i += 2) {
+    u_char const chr = static_cast<u_char>(d_input_chars[i]);
+    mb_count += ((chr & 0x80) > 0);
+  }
+  auto const mb_total = block_reduce(temp_storage).Reduce(mb_count, cub::Sum());
+
+  if ((lane_idx == 0) && (mb_total > 0)) {
+    cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
+    ref.fetch_add(mb_total, cuda::std::memory_order_relaxed);
+  }
+}
+
 /**
  * @brief Utility method for converting upper and lower case characters
  * in a strings column
@@ -289,7 +328,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
                                                       input.offsets(), input.offset(), stream);
   auto const last_offset =
     cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream);
-  auto const chars_size = last_offset - first_offset;
+  auto const chars_size  = last_offset - first_offset;
+  auto const input_chars = input.chars_begin(stream);
 
   convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special};
   upper_lower_fn converter{ccfn, *d_strings};
@@ -306,16 +346,15 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
 
   // Check if the input contains any multi-byte characters.
   // This check incurs ~20% performance hit for smaller strings and so we only use it
-  // after the threshold check above. The check makes very little impact for larger strings
+  // after the threshold check above. The check makes very little impact for long strings
   // but results in a large performance gain when the input contains only single-byte characters.
-  // The count_if is faster than any_of or all_of: https://github.com/NVIDIA/thrust/issues/1016
-  bool const multi_byte_chars =
-    thrust::count_if(rmm::exec_policy(stream),
-                     input.chars_begin(stream),
-                     input.chars_end(stream),
-                     cuda::proclaim_return_type<bool>(
-                       [] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0;
-  if (!multi_byte_chars) {
+  rmm::device_scalar<int64_t> mb_count(0, stream);
+  // cudf::detail::grid_1d is limited to size_type elements
+  auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size);
+  // we only need to check every other byte since either will contain high bit
+  has_multibytes_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+    input_chars, first_offset, last_offset, mb_count.data());
+  if (mb_count.value(stream) == 0) {
     // optimization for ASCII-only case: copy the input column and inplace replace each character
     auto result  = std::make_unique<column>(input.parent(), stream, mr);
     auto d_chars = result->mutable_view().head<char>();
@@ -329,21 +368,21 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   // note: tried to use segmented-reduce approach instead here and it was consistently slower
   auto [offsets, bytes] = [&] {
     rmm::device_uvector<size_type> sizes(input.size(), stream);
-    constexpr int block_size = 512;
-    cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
-    count_bytes_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    // cudf::detail::grid_1d is limited to size_type threads
+    auto const num_blocks = util::div_rounding_up_safe(
+      static_cast<int64_t>(input.size()) * cudf::detail::warp_size, block_size);
+    count_bytes_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
       ccfn, *d_strings, sizes.data());
     // convert sizes to offsets
     return cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
   }();
 
   // build sub-offsets
-  auto const input_chars = input.chars_begin(stream);
-  auto const sub_count   = chars_size / LS_SUB_BLOCK_SIZE;
-  auto tmp_offsets       = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
+  auto const sub_count = chars_size / LS_SUB_BLOCK_SIZE;
+  auto tmp_offsets     = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
   {
-    rmm::device_uvector<size_type> sub_offsets(sub_count, stream);
-    auto const count_itr = thrust::make_counting_iterator<size_type>(0);
+    rmm::device_uvector<int64_t> sub_offsets(sub_count, stream);
+    auto const count_itr = thrust::make_counting_iterator<int64_t>(0);
     thrust::transform(rmm::exec_policy_nosync(stream),
                       count_itr,
                       count_itr + sub_count,
@@ -359,6 +398,7 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
                   sub_offsets.begin(),
                   sub_offsets.end(),
                   tmp_offsets.begin());
+    stream.synchronize();  // protect against destruction of sub_offsets
   }
 
   // run case conversion over the new sub-strings
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 5daacbdc2fa..7622e39e735 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -265,15 +265,15 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
     // Use a heuristic to guess when the fused kernel will be faster than memcpy
     if (use_fused_kernel_heuristic(has_nulls, total_bytes, columns.size())) {
       // Use single kernel launch to copy chars columns
-      constexpr size_type block_size{256};
-      cudf::detail::grid_1d config(total_bytes, block_size);
-      auto const kernel = fused_concatenate_string_chars_kernel;
-      kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
-        d_views,
-        d_partition_offsets.data(),
-        static_cast<size_type>(columns.size()),
-        total_bytes,
-        d_new_chars);
+      constexpr size_t block_size{256};
+      // cudf::detail::grid_1d limited to size_type elements
+      auto const num_blocks = util::div_rounding_up_safe(total_bytes, block_size);
+      auto const kernel     = fused_concatenate_string_chars_kernel;
+      kernel<<<num_blocks, block_size, 0, stream.value()>>>(d_views,
+                                                            d_partition_offsets.data(),
+                                                            static_cast<size_type>(columns.size()),
+                                                            total_bytes,
+                                                            d_new_chars);
     } else {
       // Memcpy each input chars column (more efficient for very large strings)
       for (auto column = columns.begin(); column != columns.end(); ++column) {
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index e779e1d1410..d3eae8011d8 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -572,9 +572,10 @@ ConfigureTest(
 # * large strings test ----------------------------------------------------------------------------
 ConfigureTest(
   LARGE_STRINGS_TEST
+  large_strings/concatenate_tests.cpp
+  large_strings/case_tests.cpp
   large_strings/large_strings_fixture.cpp
   large_strings/merge_tests.cpp
-  large_strings/concatenate_tests.cpp
   large_strings/parquet_tests.cpp
   large_strings/reshape_tests.cpp
   GPUS 1
diff --git a/cpp/tests/large_strings/case_tests.cpp b/cpp/tests/large_strings/case_tests.cpp
new file mode 100644
index 00000000000..e56d984421a
--- /dev/null
+++ b/cpp/tests/large_strings/case_tests.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+
+#include <cudf/concatenate.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/strings/case.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <vector>
+
+struct CaseTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(CaseTest, ToLower)
+{
+  auto const wide = this->wide_column();
+  auto input      = cudf::concatenate(std::vector<cudf::column_view>(120000, wide));  // 230MB
+  auto expected   = cudf::strings::to_lower(cudf::strings_column_view(input->view()));
+
+  int const multiplier = 12;
+  std::vector<cudf::column_view> input_cols(multiplier, input->view());
+  std::vector<cudf::size_type> splits;
+  std::generate_n(std::back_inserter(splits), multiplier - 1, [&input, n = 1]() mutable {
+    return input->view().size() * (n++);
+  });
+
+  auto large_input = cudf::concatenate(input_cols);  // 2700MB > 2GB
+  auto const sv    = cudf::strings_column_view(large_input->view());
+  auto result      = cudf::strings::to_lower(sv);
+
+  // verify results in sections
+  auto sliced = cudf::split(result->view(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected->view());
+  }
+}
diff --git a/cpp/tests/large_strings/concatenate_tests.cpp b/cpp/tests/large_strings/concatenate_tests.cpp
index aa445bf761b..89be2c307bf 100644
--- a/cpp/tests/large_strings/concatenate_tests.cpp
+++ b/cpp/tests/large_strings/concatenate_tests.cpp
@@ -63,3 +63,16 @@ TEST_F(ConcatenateTest, ConcatenateVertical)
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
   }
 }
+
+TEST_F(ConcatenateTest, ManyColumns)
+{
+  auto input           = this->wide_column();
+  auto view            = cudf::column_view(input);
+  int const multiplier = 1200000;
+  std::vector<cudf::column_view> input_cols(multiplier, view);  // 2500MB > 2GB
+  // this tests a unique path through the code
+  auto result = cudf::concatenate(input_cols);
+  auto sv     = cudf::strings_column_view(result->view());
+  EXPECT_EQ(sv.size(), view.size() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+}