diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 8c8b66dba0b..f205a615a80 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -813,6 +813,11 @@ def _parquet_to_frame(
             **kwargs,
         )
 
+    if nrows is not None or skip_rows is not None:
+        raise NotImplementedError(
+            "nrows/skip_rows is not supported when reading a partitioned parquet dataset"
+        )
+
     partition_meta = None
     partitioning = (dataset_kwargs or {}).get("partitioning", None)
     if hasattr(partitioning, "schema"):
@@ -842,12 +847,6 @@ def _parquet_to_frame(
                 key_paths,
                 *args,
                 row_groups=key_row_groups,
-                # TODO: is this still right?
-                # Also, do we still care?
-                # partition_keys uses pyarrow dataset
-                # (which we can't use anymore after pyarrow is gone)
-                nrows=nrows,
-                skip_rows=skip_rows,
                 **kwargs,
             )
         )
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index c50d5a93442..18d19e7050f 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1970,6 +1970,26 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename):
                 assert fn == filename
 
 
+@pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}])
+def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs):
+    # Check that nrows/skip_rows are rejected when reading
+    # a partitioned parquet dataset
+    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
+    size = 100
+    pdf = pd.DataFrame(
+        {
+            "a": np.arange(0, stop=size, dtype="int64"),
+            "b": np.random.choice(list("abcd"), size=size),
+            "c": np.random.choice(np.arange(4), size=size),
+        }
+    )
+    pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"])
+
+    # Reading with nrows/skip_rows should raise NotImplementedError
+    with pytest.raises(NotImplementedError):
+        cudf.read_parquet(pdf_dir, **kwargs)
+
+
 @pytest.mark.parametrize("return_meta", [True, False])
 def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta):
     pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
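
Note on the user-facing effect (a minimal sketch, not part of the diff): after this change, cudf.read_parquet raises NotImplementedError when nrows or skip_rows is passed for a partitioned dataset, so callers must slice after reading instead. The pdf_dir path below is hypothetical, and the head() workaround is a suggestion rather than anything prescribed by this PR.

    import numpy as np
    import pandas as pd

    import cudf

    # Build a small partitioned dataset, mirroring the test above.
    pdf_dir = "pdf_dir"  # hypothetical output directory
    pdf = pd.DataFrame(
        {
            "a": np.arange(100, dtype="int64"),
            "b": np.random.choice(list("abcd"), size=100),
        }
    )
    pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"])

    # nrows/skip_rows now raise on partitioned reads...
    try:
        df = cudf.read_parquet(pdf_dir, nrows=10)
    except NotImplementedError:
        # ...so read the full dataset and slice the result instead.
        df = cudf.read_parquet(pdf_dir).head(10)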