[WIP] Improve pyarrow-free remote-IO performance #16166

Closed · wants to merge 40 commits

Commits (40)
b5c38bb
start simplifying remote-io optimizations
rjzamora Jun 13, 2024
7361e92
add basic json support
rjzamora Jun 14, 2024
7472446
only use _fsspec_data_transfer for file-like objects
rjzamora Jun 14, 2024
8b51952
add read_text support for byte_range opt
rjzamora Jun 14, 2024
778ceeb
start experimental dask changes
rjzamora Jun 14, 2024
ec5d140
Merge remote-tracking branch 'upstream/branch-24.08' into simplify-re…
rjzamora Jul 1, 2024
b55fce9
simplify
rjzamora Jul 1, 2024
719d422
Merge remote-tracking branch 'upstream/branch-24.08' into simplify-re…
rjzamora Jul 2, 2024
0874388
improve backward compatibility during deprecation cycle (in dask-cudf)
rjzamora Jul 2, 2024
f12b20f
remove deprecation warnings
rjzamora Jul 2, 2024
e265d40
refactor
rjzamora Jul 2, 2024
85a1cf6
update metadata reader
rjzamora Jul 2, 2024
c3f48cc
resolve json behavior
rjzamora Jul 2, 2024
373b37a
json fix
rjzamora Jul 2, 2024
2b00674
remove open_file_options from test_read_parquet_filters
rjzamora Jul 2, 2024
9d3f3f6
Merge remote-tracking branch 'upstream/branch-24.08' into simplify-re…
rjzamora Jul 2, 2024
6658f34
update
rjzamora Jul 2, 2024
8c2a2e2
Merge branch 'branch-24.08' into simplify-remote-io
rjzamora Jul 3, 2024
9791e2d
Merge branch 'branch-24.08' into simplify-remote-io
rjzamora Jul 8, 2024
edf376a
Merge branch 'branch-24.08' into simplify-remote-io
rjzamora Jul 10, 2024
cda3f4b
Merge branch 'branch-24.08' into simplify-remote-io
rjzamora Jul 12, 2024
d6fae35
Merge branch 'branch-24.08' into simplify-remote-io
rjzamora Jul 14, 2024
0db7865
Merge remote-tracking branch 'upstream/branch-24.08' into simplify-re…
rjzamora Jul 15, 2024
9465846
Merge remote-tracking branch 'upstream/branch-24.08' into simplify-re…
rjzamora Jul 16, 2024
26d08fe
Merge remote-tracking branch 'upstream/branch-24.08' into simplify-re…
rjzamora Jul 22, 2024
81f2a7d
fix tests
rjzamora Jul 22, 2024
00c47fa
Merge remote-tracking branch 'upstream/branch-24.10' into simplify-re…
rjzamora Jul 30, 2024
5867f05
register read_parquet function to CudfDXBackendEntrypoint
rjzamora Jul 30, 2024
1f73dbe
Merge remote-tracking branch 'upstream/branch-24.10' into dx-parquet-…
rjzamora Aug 12, 2024
17be6b0
add read_csv def
rjzamora Aug 12, 2024
04aa983
formatting
rjzamora Aug 12, 2024
28fd1f8
Merge branch 'branch-24.10' into dx-parquet-dispatch
rjzamora Aug 12, 2024
5d8c80d
simplify imports
rjzamora Aug 12, 2024
23c0cb3
Merge remote-tracking branch 'upstream/branch-24.10' into dx-parquet-…
rjzamora Aug 12, 2024
8bf252e
Merge branch 'dx-parquet-dispatch' of github.com:rjzamora/cudf into d…
rjzamora Aug 12, 2024
b857c0c
Merge branch 'branch-24.10' into dx-parquet-dispatch
rjzamora Aug 12, 2024
d10e9d7
Merge branch 'branch-24.10' into dx-parquet-dispatch
rjzamora Aug 13, 2024
ffcc137
Apply suggestions from code review
rjzamora Aug 13, 2024
491c140
Merge remote-tracking branch 'upstream/branch-24.10' into simplify-re…
rjzamora Aug 13, 2024
9313d54
Merge branch 'dx-parquet-dispatch' into simplify-remote-io
rjzamora Aug 13, 2024
23 changes: 23 additions & 0 deletions python/cudf/cudf/io/csv.py
@@ -53,6 +53,7 @@ def read_csv(
use_python_file_object=None,
storage_options=None,
bytes_per_thread=None,
prefetch_read_ahead=None,
rjzamora (Member Author) commented:
I'm not sure how I feel about this prefetch_read_ahead option. It basically determines how many bytes we read beyond byte_range to make sure we capture a trailing delimiter, for example. (See the sketch after this diff.)

):
"""{docstring}"""

@@ -81,9 +82,31 @@ def read_csv(
"`read_csv` does not yet support reading multiple files"
)

# Extract filesystem up front
fs, paths = ioutils._get_filesystem_and_paths(
path_or_data=filepath_or_buffer, storage_options=storage_options
)

# Prefetch remote data if possible
if fs and paths and not use_python_file_object:
filepath_or_buffer, info = ioutils.prefetch_remote_buffers(
paths,
fs,
bytes_per_thread=bytes_per_thread,
prefetcher="contiguous",
prefetcher_options={
"byte_range": byte_range,
"read_ahead": prefetch_read_ahead,
},
)
assert len(filepath_or_buffer) == 1
filepath_or_buffer = filepath_or_buffer[0]
byte_range = info.get("byte_range", byte_range)

filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=filepath_or_buffer,
compression=compression,
fs=fs,
iotypes=(BytesIO, StringIO, NativeFile),
use_python_file_object=use_python_file_object,
storage_options=storage_options,
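The prefetch_read_ahead comment above concerns how far past the requested byte_range the prefetcher should read so that a record cut off at the range boundary can still be parsed. A minimal sketch of that idea, using a hypothetical expand_byte_range helper (not the cudf implementation):

```python
# Hypothetical helper illustrating read-ahead: pad an (offset, size) byte
# range so that a delimiter falling just past the boundary is still captured.
def expand_byte_range(byte_range, file_size, read_ahead=1024 * 1024):
    offset, size = byte_range
    if size == 0:
        # By convention, size == 0 means "read to the end of the file".
        size = file_size - offset
    end = min(offset + size + read_ahead, file_size)
    return (offset, end - offset)


# Example: the first 5 MiB of a 10 MiB file, padded by 1 MiB of read-ahead
# so a row straddling the 5 MiB boundary is not truncated.
print(expand_byte_range((0, 5 * 2**20), file_size=10 * 2**20))  # (0, 6291456)
```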
27 changes: 21 additions & 6 deletions python/cudf/cudf/io/json.py
@@ -28,6 +28,7 @@ def read_json(
mixed_types_as_string=False,
prune_columns=False,
on_bad_lines="error",
prefetch_read_ahead=None,
*args,
**kwargs,
):
@@ -67,16 +68,30 @@ def read_json(
if not is_list_like(path_or_buf):
path_or_buf = [path_or_buf]

# Extract filesystem up front
fs, paths = ioutils._get_filesystem_and_paths(
path_or_data=path_or_buf, storage_options=storage_options
)

# Prefetch remote data if possible
if fs and paths:
path_or_buf, info = ioutils.prefetch_remote_buffers(
paths,
fs,
expand_paths="*.json",
prefetcher="contiguous",
prefetcher_options={
"byte_range": byte_range,
"read_ahead": prefetch_read_ahead,
},
)
byte_range = info.get("byte_range", byte_range)

filepaths_or_buffers = []
for source in path_or_buf:
if ioutils.is_directory(
path_or_data=source, storage_options=storage_options
path_or_data=source, storage_options=storage_options, fs=fs
):
fs = ioutils._ensure_filesystem(
passed_filesystem=None,
path=source,
storage_options=storage_options,
)
source = ioutils.stringify_pathlike(source)
source = fs.sep.join([source, "*.json"])

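For context, a hedged usage sketch of the byte_range path this prefetch logic serves: a caller such as dask-cudf can split one remote JSON-lines file into byte ranges and read each range independently. The S3 path and sizes below are placeholders, not from the PR.

```python
import cudf

# Hypothetical remote file split into 2 MiB ranges (sizes are illustrative).
path = "s3://my-bucket/data.jsonl"
range_size = 2 * 2**20
file_size = 10 * 2**20

parts = [
    cudf.read_json(
        path,
        lines=True,
        byte_range=(offset, range_size),
        storage_options={"anon": True},
    )
    for offset in range(0, file_size, range_size)
]
df = cudf.concat(parts, ignore_index=True)
```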
24 changes: 18 additions & 6 deletions python/cudf/cudf/io/orc.py
@@ -320,25 +320,37 @@ def read_orc(
"A list of stripes must be provided for each input source"
)

# Extract filesystem up front
fs, paths = ioutils._get_filesystem_and_paths(
path_or_data=filepath_or_buffer, storage_options=storage_options
)

# Prefetch remote data if possible
if fs and paths and not use_python_file_object:
# TODO: Add prefetcher for partial IO
filepath_or_buffer, _ = ioutils.prefetch_remote_buffers(
paths,
fs,
bytes_per_thread=bytes_per_thread,
expand_paths="*.orc",
prefetcher="contiguous",
)

filepaths_or_buffers = []
have_nativefile = any(
isinstance(source, pa.NativeFile) for source in filepath_or_buffer
)
for source in filepath_or_buffer:
if ioutils.is_directory(
path_or_data=source, storage_options=storage_options
path_or_data=source, storage_options=storage_options, fs=fs
):
fs = ioutils._ensure_filesystem(
passed_filesystem=None,
path=source,
storage_options=storage_options,
)
source = stringify_path(source)
source = fs.sep.join([source, "*.orc"])

tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=source,
compression=None,
fs=fs,
use_python_file_object=use_python_file_object,
storage_options=storage_options,
bytes_per_thread=bytes_per_thread,
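The "contiguous" prefetcher used here appears to read each remote file in full into host memory before handing it to the reader (the TODO notes that a partial-IO prefetcher for ORC is still missing). A rough sketch of what such a whole-file prefetch could look like with plain fsspec, splitting each read into bytes_per_thread-sized ranges; prefetch_contiguous is a hypothetical name, not the cudf implementation:

```python
import numpy as np


def prefetch_contiguous(paths, fs, bytes_per_thread=256 * 2**20):
    """Read each remote file fully into a host buffer, range by range."""
    buffers = []
    for path in paths:
        size = fs.size(path)
        starts = list(range(0, size, bytes_per_thread))
        ends = [min(s + bytes_per_thread, size) for s in starts]
        # fs.cat_ranges issues the per-range reads (concurrently on async
        # filesystems such as s3fs) and returns a list of bytes objects.
        chunks = fs.cat_ranges([path] * len(starts), starts, ends)
        buffers.append(np.frombuffer(b"".join(chunks), dtype="u1"))
    return buffers
```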
31 changes: 28 additions & 3 deletions python/cudf/cudf/io/parquet.py
@@ -341,8 +341,18 @@ def read_parquet_metadata(filepath_or_buffer):
path_or_data=filepath_or_buffer, storage_options=None
)

# Check if filepath or buffer
filepath_or_buffer = paths if paths else filepath_or_buffer
if fs and paths:
filepath_or_buffer, _ = ioutils.prefetch_remote_buffers(
paths,
fs,
prefetcher="parquet",
prefetcher_options={
"columns": [],
"row_groups": [],
},
)
else:
filepath_or_buffer = paths if paths else filepath_or_buffer

# List of filepaths or buffers
filepaths_or_buffers = []
@@ -609,7 +619,22 @@ def read_parquet(
categorical_partitions=categorical_partitions,
dataset_kwargs=dataset_kwargs,
)
filepath_or_buffer = paths if paths else filepath_or_buffer

# Prefetch remote data if possible
if fs and paths and not use_python_file_object:
filepath_or_buffer, _ = ioutils.prefetch_remote_buffers(
paths,
fs,
bytes_per_thread=bytes_per_thread,
prefetcher="parquet",
prefetcher_options={
"columns": columns,
# All paths must have the same row-group selection
"row_groups": row_groups[0] if row_groups else None,
},
)
else:
filepath_or_buffer = paths if paths else filepath_or_buffer

filepaths_or_buffers = []
if use_python_file_object:
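The "parquet" prefetcher is given columns and row_groups options, suggesting it transfers only the byte ranges needed for the projected columns and selected row groups. As an illustration only (not necessarily what ioutils.prefetch_remote_buffers does internally), fsspec's parquet module exposes a comparable capability; the S3 path and column names are placeholders:

```python
import cudf
import fsspec
from fsspec.parquet import open_parquet_file

# open_parquet_file fetches the footer plus only the byte ranges required
# for the requested columns into a local cache, so the subsequent read does
# not re-download the whole file.
path = "s3://my-bucket/data.parquet"
fs = fsspec.filesystem("s3", anon=True)

with open_parquet_file(path, fs=fs, columns=["x", "y"], engine="auto") as f:
    df = cudf.read_parquet(f, columns=["x", "y"])
```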
22 changes: 22 additions & 0 deletions python/cudf/cudf/io/text.py
@@ -18,15 +18,37 @@ def read_text(
compression=None,
compression_offsets=None,
storage_options=None,
prefetch_read_ahead=None,
):
"""{docstring}"""

if delimiter is None:
raise ValueError("delimiter needs to be provided")

# Extract filesystem up front
fs, paths = ioutils._get_filesystem_and_paths(
path_or_data=filepath_or_buffer, storage_options=storage_options
)

# Prefetch remote data if possible
if fs and paths:
filepath_or_buffer, info = ioutils.prefetch_remote_buffers(
paths,
fs,
prefetcher="contiguous",
prefetcher_options={
"byte_range": byte_range,
"read_ahead": prefetch_read_ahead,
},
)
assert len(filepath_or_buffer) == 1
filepath_or_buffer = filepath_or_buffer[0]
byte_range = info.get("byte_range", byte_range)

filepath_or_buffer, _ = ioutils.get_reader_filepath_or_buffer(
path_or_data=filepath_or_buffer,
compression=None,
fs=fs,
iotypes=(BytesIO, StringIO),
storage_options=storage_options,
)
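A hedged usage sketch of the path this prefetch serves: reading one byte range of a remote newline-delimited file with read_text. The S3 URL and range size are placeholders.

```python
import cudf

# Read roughly the first 1 MB of a remote text file. The prefetcher's
# read-ahead lets the record spanning the range boundary be completed
# rather than truncated.
lines = cudf.read_text(
    "s3://my-bucket/logs.txt",
    delimiter="\n",
    byte_range=(0, 1_000_000),
    strip_delimiters=True,
    storage_options={"anon": True},
)
```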
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_s3.py
@@ -387,6 +387,15 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache):
open_file_options={"precache_options": {"method": precache}},
)

# Check that default case doesn't warn and is correct
if precache is None:
default = cudf.read_parquet(
f"s3://{bucket}/{fname}",
storage_options=s3so,
filters=filters,
)
assert_eq(pdf_ext.iloc[:0], default.reset_index(drop=True))

# All row-groups should be filtered out
assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True))
