Commit
notimplemented for partitioned as well
lithomas1 committed Jul 9, 2024
1 parent 0c722da commit cc37737
Showing 2 changed files with 25 additions and 6 deletions.
11 changes: 5 additions & 6 deletions python/cudf/cudf/io/parquet.py
@@ -813,6 +813,11 @@ def _parquet_to_frame(
             **kwargs,
         )
 
+    if nrows is not None or skip_rows is not None:
+        raise NotImplementedError(
+            "nrows/skip_rows is not supported when reading a partitioned parquet dataset"
+        )
+
     partition_meta = None
     partitioning = (dataset_kwargs or {}).get("partitioning", None)
     if hasattr(partitioning, "schema"):
@@ -842,12 +847,6 @@
                 key_paths,
                 *args,
                 row_groups=key_row_groups,
-                # TODO: is this still right?
-                # Also, do we still care?
-                # partition_keys uses pyarrow dataset
-                # (which we can't use anymore after pyarrow is gone)
-                nrows=nrows,
-                skip_rows=skip_rows,
                 **kwargs,
             )
         )
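As a usage sketch (not part of this commit), the guard above means that requesting a row subset from a partitioned dataset now fails fast; the dataset path below is hypothetical and assumes data written with partition_cols, as in the test added further down:

    import cudf

    # Hypothetical path to a dataset written with partition_cols=["b"]
    dataset_dir = "pdf_dir"

    # Reading the full partitioned dataset is unaffected.
    df = cudf.read_parquet(dataset_dir)

    # Passing nrows or skip_rows for a partitioned dataset now raises,
    # since the partitioned path reads each partition key separately and
    # cannot forward a single global row window.
    try:
        cudf.read_parquet(dataset_dir, nrows=1)
    except NotImplementedError as err:
        print(err)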
20 changes: 20 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
@@ -1970,6 +1970,26 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename):
     assert fn == filename
 
 
+@pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}])
+def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs):
+    # Checks that nrows/skip_rows are rejected when reading
+    # a partitioned parquet dataset
+    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
+    size = 100
+    pdf = pd.DataFrame(
+        {
+            "a": np.arange(0, stop=size, dtype="int64"),
+            "b": np.random.choice(list("abcd"), size=size),
+            "c": np.random.choice(np.arange(4), size=size),
+        }
+    )
+    pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"])
+
+    # Reading with nrows/skip_rows should raise for a partitioned dataset
+    with pytest.raises(NotImplementedError):
+        cudf.read_parquet(pdf_dir, **kwargs)
+
+
 @pytest.mark.parametrize("return_meta", [True, False])
 def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta):
     pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
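As a hedged workaround sketch (not part of this commit): since nrows/skip_rows are rejected for partitioned datasets, a comparable row window can be taken after the read; the path below is hypothetical:

    import cudf

    dataset_dir = "pdf_dir"  # hypothetical partitioned dataset

    # Read the whole partitioned dataset, then slice in memory.
    df = cudf.read_parquet(dataset_dir)
    first_row = df.head(1)   # comparable in effect to nrows=1
    remainder = df.iloc[1:]  # comparable in effect to skip_rows=1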
