Raise NotImplementedError for chunked parquet reader nrows/skiprows

rapidsai · Jul 8, 2024 · f52b606 · f52b606
1 parent 56c88ed
commit f52b606
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 4 deletions.
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
@@ -911,10 +911,6 @@ def _read_parquet(
     *args,
     **kwargs,
 ):
-    if nrows is None:
-        nrows = -1
-    if skip_rows is None:
-        skip_rows = 0
     # Simple helper function to dispatch between
     # cudf and pyarrow to read parquet data
     if engine == "cudf":
@@ -933,13 +929,25 @@ def _read_parquet(
             # (It's not super important now since pandas doesn't support it ATM,
             # but may be relevant in the future)
             # xref https://github.com/pandas-dev/pandas/issues/51830
+            if nrows is not None:
+                raise NotImplementedError(
+                    "pandas compatibility mode doesn't support nrows in read_parquet"
+                )
+            if skip_rows is not None:
+                raise NotImplementedError(
+                    "pandas compatibility mode doesn't support skip_rows in read_parquet"
+                )
             return libparquet.ParquetReader(
                 filepaths_or_buffers,
                 columns=columns,
                 row_groups=row_groups,
                 use_pandas_metadata=use_pandas_metadata,
             ).read()
         else:
+            if nrows is None:
+                nrows = -1
+            if skip_rows is None:
+                skip_rows = 0
             return libparquet.read_parquet(
                 filepaths_or_buffers,
                 columns=columns,

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -3498,6 +3498,22 @@ def test_parquet_reader_pandas_compatibility():
     assert_eq(expected, df)
 
 
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"skip_rows": 0},
+        {"nrows": 1},
+    ],
+)
+def test_parquet_reader_pandas_compatibility_unsupported(kwargs):
+    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["av", "qw", "hi", "xyz"]})
+    buffer = BytesIO()
+    df.to_parquet(buffer)
+    with cudf.option_context("mode.pandas_compatible", True):
+        with pytest.raises(NotImplementedError):
+            cudf.read_parquet(buffer, **kwargs)
+
+
 @pytest.mark.parametrize("row_group_size", [1, 4, 33])
 def test_parquet_read_rows(tmpdir, pdf, row_group_size):
     if len(pdf) > 100:

diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
@@ -192,9 +192,15 @@
     This parameter is functional only when `use_python_file_object=False`.
 skiprows : int, default None
     If not None, the number of rows to skip from the start of the file.
+
+    .. pandas-compat::
+        This option is not supported when pandas compatibility mode is on.
 nrows : int, default None
     If not None, the total number of rows to read.
 
+    .. pandas-compat::
+        This option is not supported when pandas compatibility mode is on.
+
 Returns
 -------
 DataFrame