diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh
index 55399d0371a..f5bcdc62604 100755
--- a/ci/test_cudf_polars_polars_tests.sh
+++ b/ci/test_cudf_polars_polars_tests.sh
@@ -24,14 +24,17 @@ rapids-logger "Download wheels"
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-# Download the pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
+# Download libcudf and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep
 
-rapids-logger "Install pylibcudf"
-python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
+rapids-logger "Install libcudf, pylibcudf and cudf_polars"
+python -m pip install \
+    -v \
+    "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+    "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
+    "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
-rapids-logger "Install cudf_polars"
-python -m pip install $(echo ./dist/cudf_polars*.whl)
 
 TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
 rapids-logger "Clone polars to ${TAG}"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 136f43ee706..f7a5dd2f2fb 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -52,6 +52,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON)
 option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON)
 mark_as_advanced(CUDF_BUILD_TESTUTIL)
 option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON)
+option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON)
 option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF)
 mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED)
 option(
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 110b4557840..b8a53cd8bd9 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)
 
 # ##################################################################################################
 # * ast benchmark ---------------------------------------------------------------------------------
-ConfigureBench(AST_BENCH ast/transform.cpp)
+ConfigureNVBench(AST_NVBENCH ast/transform.cpp)
 
 # ##################################################################################################
 # * binaryop benchmark ----------------------------------------------------------------------------
-ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
+ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
 
 # ##################################################################################################
 # * nvtext benchmark -------------------------------------------------------------------
diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index 65a44532cf1..f44f26e4d2c 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,14 +15,16 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 #include <thrust/iterator/counting_iterator.h>
 
+#include <nvbench/nvbench.cuh>
+
 #include <algorithm>
 #include <list>
 #include <memory>
@@ -35,13 +37,10 @@ enum class TreeType {
 };
 
 template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
-class AST : public cudf::benchmark {};
-
-template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
-static void BM_ast_transform(benchmark::State& state)
+static void BM_ast_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.range(0))};
-  auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
+  auto const table_size  = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
@@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state)
 
   auto const& expression_tree_root = expressions.back();
 
-  // Execute benchmark
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf::compute_column(table, expression_tree_root);
-  }
-
   // Use the number of bytes read from global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
-                          (tree_levels + 1) * sizeof(key_type));
-}
+  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
 
-static void CustomRanges(benchmark::internal::Benchmark* b)
-{
-  auto row_counts       = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
-  auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
-  for (auto const& row_count : row_counts) {
-    for (auto const& operation_count : operation_counts) {
-      b->Args({row_count, operation_count});
-    }
-  }
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
 }
 
 #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable)     \
-  (::benchmark::State & st)                                                                \
+  static void name(::nvbench::state& st)                                                   \
   {                                                                                        \
-    BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                    \
+    ::BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                  \
   }                                                                                        \
-  BENCHMARK_REGISTER_F(AST, name)                                                          \
-    ->Apply(CustomRanges)                                                                  \
-    ->Unit(benchmark::kMillisecond)                                                        \
-    ->UseManualTime();
+  NVBENCH_BENCH(name)                                                                      \
+    .set_name(#name)                                                                       \
+    .add_int64_axis("tree_levels", {1, 5, 10})                                             \
+    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp
index fa98d9e601a..7d267a88764 100644
--- a/cpp/benchmarks/binaryop/binaryop.cpp
+++ b/cpp/benchmarks/binaryop/binaryop.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,15 +15,14 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/binaryop.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 #include <algorithm>
-#include <vector>
 
 // This set of benchmarks is designed to be a comparison for the AST benchmarks
 
@@ -33,13 +32,10 @@ enum class TreeType {
 };
 
 template <typename key_type, TreeType tree_type, bool reuse_columns>
-class BINARYOP : public cudf::benchmark {};
-
-template <typename key_type, TreeType tree_type, bool reuse_columns>
-static void BM_binaryop_transform(benchmark::State& state)
+static void BM_binaryop_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.range(0))};
-  auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
+  auto const table_size{static_cast<cudf::size_type>(state.get_int64("table_size"))};
+  auto const tree_levels{static_cast<cudf::size_type>(state.get_int64("tree_levels"))};
 
   // Create table data
   auto const n_cols       = reuse_columns ? 1 : tree_levels + 1;
@@ -47,9 +43,10 @@ static void BM_binaryop_transform(benchmark::State& state)
     cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
   cudf::table_view table{*source_table};
 
-  // Execute benchmark
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
+  // Use the number of bytes read from global memory
+  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
     // Execute tree that chains additions like (((a + b) + c) + d)
     auto const op               = cudf::binary_operator::ADD;
     auto const result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
@@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state)
         result = cudf::binary_operation(result->view(), col, op, result_data_type);
       });
     }
-  }
-
-  // Use the number of bytes read from global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
-                          (tree_levels + 1) * sizeof(key_type));
+  });
 }
 
 #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
-  BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns)     \
-  (::benchmark::State & st) { BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); }
+                                                                                      \
+  static void name(::nvbench::state& st)                                              \
+  {                                                                                   \
+    BM_binaryop_transform<key_type, tree_type, reuse_columns>(st);                    \
+  }                                                                                   \
+  NVBENCH_BENCH(name)                                                                 \
+    .add_int64_axis("tree_levels", {1, 2, 5, 10})                                     \
+    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique,
                                     int32_t,
@@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
                                     double,
                                     TreeType::IMBALANCED_LEFT,
                                     false);
-
-static void CustomRanges(benchmark::internal::Benchmark* b)
-{
-  auto row_counts       = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
-  auto operation_counts = std::vector<cudf::size_type>{1, 2, 5, 10};
-  for (auto const& row_count : row_counts) {
-    for (auto const& operation_count : operation_counts) {
-      b->Args({row_count, operation_count});
-    }
-  }
-}
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique)
-  ->Apply(CustomRanges)
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse)
-  ->Apply(CustomRanges)
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique)
-  ->Apply(CustomRanges)
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
index 7086a61c7c5..bc0ff69bce9 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -15,20 +15,18 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/binaryop.hpp>
 
-class COMPILED_BINARYOP : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
 template <typename TypeLhs, typename TypeRhs, typename TypeOut>
-void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
+void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
 {
-  auto const column_size{static_cast<cudf::size_type>(state.range(0))};
+  auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
 
   auto const source_table = create_random_table(
-    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{column_size});
+    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{table_size});
 
   auto lhs = cudf::column_view(source_table->get_column(0));
   auto rhs = cudf::column_view(source_table->get_column(1));
@@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
   // Call once for hot cache.
   cudf::binary_operation(lhs, rhs, binop, output_dtype);
 
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    cudf::binary_operation(lhs, rhs, binop, output_dtype);
-  }
-
   // use number of bytes read and written to global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
-                          (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut)));
+  state.add_global_memory_reads<TypeLhs>(table_size);
+  state.add_global_memory_reads<TypeRhs>(table_size);
+  state.add_global_memory_reads<TypeOut>(table_size);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
 }
 
+#define BM_STRINGIFY(a) #a
+
 // TODO tparam boolean for null.
-#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout)           \
-  BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name)                             \
-  (::benchmark::State & st)                                               \
-  {                                                                       \
-    BM_compiled_binaryop<lhs, rhs, tout>(st, cudf::binary_operator::bop); \
-  }                                                                       \
-  BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name)                           \
-    ->Unit(benchmark::kMicrosecond)                                       \
-    ->UseManualTime()                                                     \
-    ->Arg(10000)      /* 10k */                                           \
-    ->Arg(100000)     /* 100k */                                          \
-    ->Arg(1000000)    /* 1M */                                            \
-    ->Arg(10000000)   /* 10M */                                           \
-    ->Arg(100000000); /* 100M */
+#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout)               \
+  static void name(::nvbench::state& st)                                      \
+  {                                                                           \
+    ::BM_compiled_binaryop<lhs, rhs, tout>(st, ::cudf::binary_operator::bop); \
+  }                                                                           \
+  NVBENCH_BENCH(name)                                                         \
+    .set_name("compiled_binary_op_" BM_STRINGIFY(name))                       \
+    .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
 
 #define build_name(a, b, c, d) a##_##b##_##c##_##d
 
diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake
index 1b6a1730161..33b1b45fb44 100644
--- a/cpp/cmake/thirdparty/get_nvcomp.cmake
+++ b/cpp/cmake/thirdparty/get_nvcomp.cmake
@@ -16,7 +16,11 @@
 function(find_and_configure_nvcomp)
 
   include(${rapids-cmake-dir}/cpm/nvcomp.cmake)
-  rapids_cpm_nvcomp(USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP})
+  set(export_args)
+  if(CUDF_EXPORT_NVCOMP)
+    set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports)
+  endif()
+  rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP})
 
   # Per-thread default stream
   if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM)
diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
index b12fbe39a57..dc14802adc1 100644
--- a/cpp/include/cudf/io/datasource.hpp
+++ b/cpp/include/cudf/io/datasource.hpp
@@ -86,14 +86,28 @@ class datasource {
   /**
    * @brief Creates a source from a file path.
    *
+   * @note Parameters `offset`, `max_size_estimate` and `min_size_estimate` are hints to the
+   * `datasource` implementation about the expected range of the data that will be read. The
+   * implementation may use these hints to optimize the read operation. These parameters are usually
+   * based on the byte range option. In this case, `min_size_estimate` should be no greater than the
+   * byte range to avoid potential issues when reading adjacent ranges. `max_size_estimate` can
+   * include padding after the byte range, to include additional data that may be needed for
+   * processing.
+   *
+   @throws cudf::logic_error if the minimum size estimate is greater than the maximum size estimate
+   *
    * @param[in] filepath Path to the file to use
-   * @param[in] offset Bytes from the start of the file (the default is zero)
-   * @param[in] size Bytes from the offset; use zero for entire file (the default is zero)
+   * @param[in] offset Starting byte offset from which data will be read (the default is zero)
+   * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is
+   * zero, which means the whole file after `offset`)
+   * @param[in] min_size_estimate Lower estimate of the data range that will be read (the default is
+   * zero, which means the whole file after `offset`)
    * @return Constructed datasource object
    */
   static std::unique_ptr<datasource> create(std::string const& filepath,
-                                            size_t offset = 0,
-                                            size_t size   = 0);
+                                            size_t offset            = 0,
+                                            size_t max_size_estimate = 0,
+                                            size_t min_size_estimate = 0);
 
   /**
    * @brief Creates a source from a host memory buffer.
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index de8eea9e99b..5a060902eb2 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -122,14 +122,16 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder(
 namespace {
 
 std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info,
-                                                                    size_t range_offset = 0,
-                                                                    size_t range_size   = 0)
+                                                                    size_t offset            = 0,
+                                                                    size_t max_size_estimate = 0,
+                                                                    size_t min_size_estimate = 0)
 {
   switch (info.type()) {
     case io_type::FILEPATH: {
       auto sources = std::vector<std::unique_ptr<cudf::io::datasource>>();
       for (auto const& filepath : info.filepaths()) {
-        sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size));
+        sources.emplace_back(
+          cudf::io::datasource::create(filepath, offset, max_size_estimate, min_size_estimate));
       }
       return sources;
     }
@@ -211,7 +213,8 @@ table_with_metadata read_json(json_reader_options options,
 
   auto datasources = make_datasources(options.get_source(),
                                       options.get_byte_range_offset(),
-                                      options.get_byte_range_size_with_padding());
+                                      options.get_byte_range_size_with_padding(),
+                                      options.get_byte_range_size());
 
   return json::detail::read_json(datasources, options, stream, mr);
 }
@@ -238,7 +241,8 @@ table_with_metadata read_csv(csv_reader_options options,
 
   auto datasources = make_datasources(options.get_source(),
                                       options.get_byte_range_offset(),
-                                      options.get_byte_range_size_with_padding());
+                                      options.get_byte_range_size_with_padding(),
+                                      options.get_byte_range_size());
 
   CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported.");
 
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index e4313eba454..0be976b6144 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -32,6 +32,7 @@
 #include <unistd.h>
 
 #include <unordered_map>
+#include <vector>
 
 namespace cudf {
 namespace io {
@@ -54,6 +55,30 @@ class file_source : public datasource {
     }
   }
 
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
+  {
+    lseek(_file.desc(), offset, SEEK_SET);
+
+    // Clamp length to available data
+    ssize_t const read_size = std::min(size, _file.size() - offset);
+
+    std::vector<uint8_t> v(read_size);
+    CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed");
+    return buffer::create(std::move(v));
+  }
+
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override
+  {
+    lseek(_file.desc(), offset, SEEK_SET);
+
+    // Clamp length to available data
+    auto const read_size = std::min(size, _file.size() - offset);
+
+    CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast<ssize_t>(read_size),
+                 "read failed");
+    return read_size;
+  }
+
   ~file_source() override = default;
 
   [[nodiscard]] bool supports_device_read() const override
@@ -138,40 +163,63 @@ class file_source : public datasource {
  */
 class memory_mapped_source : public file_source {
  public:
-  explicit memory_mapped_source(char const* filepath, size_t offset, size_t size)
+  explicit memory_mapped_source(char const* filepath,
+                                size_t offset,
+                                size_t max_size_estimate,
+                                size_t min_size_estimate)
     : file_source(filepath)
   {
     if (_file.size() != 0) {
-      map(_file.desc(), offset, size);
-      register_mmap_buffer();
+      // Memory mapping is not exclusive, so we can include the whole region we expect to read
+      map(_file.desc(), offset, max_size_estimate);
+      // Buffer registration is exclusive (can't overlap with other registered buffers) so we
+      // register the lower estimate; this avoids issues when reading adjacent ranges from the same
+      // file from multiple threads
+      register_mmap_buffer(offset, min_size_estimate);
     }
   }
 
   ~memory_mapped_source() override
   {
     if (_map_addr != nullptr) {
-      munmap(_map_addr, _map_size);
+      unmap();
       unregister_mmap_buffer();
     }
   }
 
   std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
   {
-    CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping");
+    // Clamp length to available data
+    auto const read_size = std::min(size, +_file.size() - offset);
+
+    // If the requested range is outside of the mapped region, read from the file
+    if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) {
+      return file_source::host_read(offset, read_size);
+    }
 
-    // Clamp length to available data in the mapped region
-    auto const read_size = std::min(size, _map_size - (offset - _map_offset));
+    // If the requested range is only partially within the registered region, copy to a new
+    // host buffer to make the data safe to copy to the device
+    if (_reg_addr != nullptr and
+        (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) {
+      auto const src = static_cast<uint8_t*>(_map_addr) + (offset - _map_offset);
+
+      return std::make_unique<owning_buffer<std::vector<uint8_t>>>(
+        std::vector<uint8_t>(src, src + read_size));
+    }
 
     return std::make_unique<non_owning_buffer>(
-      static_cast<uint8_t*>(_map_addr) + (offset - _map_offset), read_size);
+      static_cast<uint8_t*>(_map_addr) + offset - _map_offset, read_size);
   }
 
   size_t host_read(size_t offset, size_t size, uint8_t* dst) override
   {
-    CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping");
+    // Clamp length to available data
+    auto const read_size = std::min(size, +_file.size() - offset);
 
-    // Clamp length to available data in the mapped region
-    auto const read_size = std::min(size, _map_size - (offset - _map_offset));
+    // If the requested range is outside of the mapped region, read from the file
+    if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) {
+      return file_source::host_read(offset, read_size, dst);
+    }
 
     auto const src = static_cast<uint8_t*>(_map_addr) + (offset - _map_offset);
     std::memcpy(dst, src, read_size);
@@ -184,16 +232,18 @@ class memory_mapped_source : public file_source {
    *
    * Fixes nvbugs/4215160
    */
-  void register_mmap_buffer()
+  void register_mmap_buffer(size_t offset, size_t size)
   {
-    if (_map_addr == nullptr or _map_size == 0 or not pageableMemoryAccessUsesHostPageTables()) {
-      return;
-    }
+    if (_map_addr == nullptr or not pageableMemoryAccessUsesHostPageTables()) { return; }
 
-    auto const result = cudaHostRegister(_map_addr, _map_size, cudaHostRegisterDefault);
-    if (result == cudaSuccess) {
-      _is_map_registered = true;
-    } else {
+    // Registered region must be within the mapped region
+    _reg_offset = std::max(offset, _map_offset);
+    _reg_size   = std::min(size != 0 ? size : _map_size, (_map_offset + _map_size) - _reg_offset);
+
+    _reg_addr         = static_cast<std::byte*>(_map_addr) - _map_offset + _reg_offset;
+    auto const result = cudaHostRegister(_reg_addr, _reg_size, cudaHostRegisterReadOnly);
+    if (result != cudaSuccess) {
+      _reg_addr = nullptr;
       CUDF_LOG_WARN("cudaHostRegister failed with {} ({})",
                     static_cast<int>(result),
                     cudaGetErrorString(result));
@@ -205,10 +255,12 @@ class memory_mapped_source : public file_source {
    */
   void unregister_mmap_buffer()
   {
-    if (not _is_map_registered) { return; }
+    if (_reg_addr == nullptr) { return; }
 
-    auto const result = cudaHostUnregister(_map_addr);
-    if (result != cudaSuccess) {
+    auto const result = cudaHostUnregister(_reg_addr);
+    if (result == cudaSuccess) {
+      _reg_addr = nullptr;
+    } else {
       CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})",
                     static_cast<int>(result),
                     cudaGetErrorString(result));
@@ -226,52 +278,30 @@ class memory_mapped_source : public file_source {
 
     // Size for `mmap()` needs to include the page padding
     _map_size = size + (offset - _map_offset);
+    if (_map_size == 0) { return; }
 
     // Check if accessing a region within already mapped area
     _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset);
     CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping");
   }
 
- private:
-  size_t _map_size        = 0;
-  size_t _map_offset      = 0;
-  void* _map_addr         = nullptr;
-  bool _is_map_registered = false;
-};
-
-/**
- * @brief Implementation class for reading from a file using `read` calls
- *
- * Potentially faster than `memory_mapped_source` when only a small portion of the file is read
- * through the host.
- */
-class direct_read_source : public file_source {
- public:
-  explicit direct_read_source(char const* filepath) : file_source(filepath) {}
-
-  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
+  void unmap()
   {
-    lseek(_file.desc(), offset, SEEK_SET);
-
-    // Clamp length to available data
-    ssize_t const read_size = std::min(size, _file.size() - offset);
-
-    std::vector<uint8_t> v(read_size);
-    CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed");
-    return buffer::create(std::move(v));
+    if (_map_addr != nullptr) {
+      auto const result = munmap(_map_addr, _map_size);
+      if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); }
+      _map_addr = nullptr;
+    }
   }
 
-  size_t host_read(size_t offset, size_t size, uint8_t* dst) override
-  {
-    lseek(_file.desc(), offset, SEEK_SET);
-
-    // Clamp length to available data
-    auto const read_size = std::min(size, _file.size() - offset);
+ private:
+  size_t _map_offset = 0;
+  size_t _map_size   = 0;
+  void* _map_addr    = nullptr;
 
-    CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast<ssize_t>(read_size),
-                 "read failed");
-    return read_size;
-  }
+  size_t _reg_offset = 0;
+  size_t _reg_size   = 0;
+  void* _reg_addr    = nullptr;
 };
 
 /**
@@ -431,16 +461,21 @@ class user_datasource_wrapper : public datasource {
 
 std::unique_ptr<datasource> datasource::create(std::string const& filepath,
                                                size_t offset,
-                                               size_t size)
+                                               size_t max_size_estimate,
+                                               size_t min_size_estimate)
 {
+  CUDF_EXPECTS(max_size_estimate == 0 or min_size_estimate <= max_size_estimate,
+               "Invalid min/max size estimates for datasource creation");
+
 #ifdef CUFILE_FOUND
   if (cufile_integration::is_always_enabled()) {
     // avoid mmap as GDS is expected to be used for most reads
-    return std::make_unique<direct_read_source>(filepath.c_str());
+    return std::make_unique<file_source>(filepath.c_str());
   }
 #endif
   // Use our own memory mapping implementation for direct file reads
-  return std::make_unique<memory_mapped_source>(filepath.c_str(), offset, size);
+  return std::make_unique<memory_mapped_source>(
+    filepath.c_str(), offset, max_size_estimate, min_size_estimate);
 }
 
 std::unique_ptr<datasource> datasource::create(host_buffer const& buffer)
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index dc14824d834..0028dd946e3 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -2516,4 +2516,39 @@ TEST_F(CsvReaderTest, UTF8BOM)
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expected);
 }
 
+void expect_buffers_equal(cudf::io::datasource::buffer* lhs, cudf::io::datasource::buffer* rhs)
+{
+  ASSERT_EQ(lhs->size(), rhs->size());
+  EXPECT_EQ(0, std::memcmp(lhs->data(), rhs->data(), lhs->size()));
+}
+
+TEST_F(CsvReaderTest, OutOfMapBoundsReads)
+{
+  // write a lot of data into a file
+  auto filepath        = temp_env->get_temp_dir() + "OutOfMapBoundsReads.csv";
+  auto const num_rows  = 1 << 20;
+  auto const row       = std::string{"0,1,2,3,4,5,6,7,8,9\n"};
+  auto const file_size = num_rows * row.size();
+  {
+    std::ofstream outfile(filepath, std::ofstream::out);
+    for (size_t i = 0; i < num_rows; ++i) {
+      outfile << row;
+    }
+  }
+
+  // Only memory map the middle of the file
+  auto source         = cudf::io::datasource::create(filepath, file_size / 2, file_size / 4);
+  auto full_source    = cudf::io::datasource::create(filepath);
+  auto const all_data = source->host_read(0, file_size);
+  auto ref_data       = full_source->host_read(0, file_size);
+  expect_buffers_equal(ref_data.get(), all_data.get());
+
+  auto const start_data = source->host_read(file_size / 2, file_size / 2);
+  expect_buffers_equal(full_source->host_read(file_size / 2, file_size / 2).get(),
+                       start_data.get());
+
+  auto const end_data = source->host_read(0, file_size / 2 + 512);
+  expect_buffers_equal(full_source->host_read(0, file_size / 2 + 512).get(), end_data.get());
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt
index 2b208e2e021..5f9a04d3cee 100644
--- a/python/libcudf/CMakeLists.txt
+++ b/python/libcudf/CMakeLists.txt
@@ -41,6 +41,9 @@ set(BUILD_TESTS OFF)
 set(BUILD_BENCHMARKS OFF)
 set(CUDF_BUILD_TESTUTIL OFF)
 set(CUDF_BUILD_STREAMS_TEST_UTIL OFF)
+if(USE_NVCOMP_RUNTIME_WHEEL)
+  set(CUDF_EXPORT_NVCOMP OFF)
+endif()
 set(CUDA_STATIC_RUNTIME ON)
 
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)