Skip to content

Commit

Permalink
Updates from code reviews
Browse files Browse the repository at this point in the history
  • Loading branch information
mhaseeb123 committed Sep 11, 2024
1 parent 511eb20 commit d2051e7
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 20 deletions.
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ def write_parquet(
object statistics="ROWGROUP",
object metadata_file_path=None,
object int96_timestamps=False,
object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
object row_group_size_bytes=None,
object row_group_size_rows=None,
object max_page_size_bytes=None,
object max_page_size_rows=None,
Expand Down Expand Up @@ -616,9 +616,9 @@ cdef class ParquetWriter:
Name of the compression to use. Use ``None`` for no compression.
statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
Level at which column statistics should be included in file.
row_group_size_bytes: int, default 18446744073709551615
row_group_size_bytes: int, default None
Maximum size of each stripe of the output.
By default, an infinite value equal to uint64 max (~18446744074GB) will be used.
By default, no limit is applied to the row group size.
row_group_size_rows: int, default 1000000
Maximum number of rows of each stripe of the output.
By default, 1000000 (10^6 rows) will be used.
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6840,7 +6840,7 @@ def to_parquet(
statistics="ROWGROUP",
metadata_file_path=None,
int96_timestamps=False,
row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
row_group_size_bytes=None,
row_group_size_rows=None,
max_page_size_bytes=None,
max_page_size_rows=None,
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _write_parquet(
statistics="ROWGROUP",
metadata_file_path=None,
int96_timestamps=False,
row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
row_group_size_bytes=None,
row_group_size_rows=None,
max_page_size_bytes=None,
max_page_size_rows=None,
Expand Down Expand Up @@ -149,7 +149,7 @@ def write_to_dataset(
return_metadata=False,
statistics="ROWGROUP",
int96_timestamps=False,
row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
row_group_size_bytes=None,
row_group_size_rows=None,
max_page_size_bytes=None,
max_page_size_rows=None,
Expand Down Expand Up @@ -205,7 +205,7 @@ def write_to_dataset(
If ``False``, timestamps will not be altered.
row_group_size_bytes: integer or None, default None
Maximum size of each stripe of the output.
If None, 134217728 (128MB) will be used.
If None, no limit is applied to the row group size.
row_group_size_rows: integer or None, default None
Maximum number of rows of each stripe of the output.
If None, 1000000 will be used.
Expand Down Expand Up @@ -980,7 +980,7 @@ def to_parquet(
statistics="ROWGROUP",
metadata_file_path=None,
int96_timestamps=False,
row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
row_group_size_bytes=None,
row_group_size_rows=None,
max_page_size_bytes=None,
max_page_size_rows=None,
Expand Down
10 changes: 3 additions & 7 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,10 +275,9 @@
timestamp[us] to the int96 format, which is the number of Julian
days and the number of nanoseconds since midnight of 1970-01-01.
If ``False``, timestamps will not be altered.
row_group_size_bytes: integer, default {row_group_size_bytes_val}
row_group_size_bytes: integer, default None
Maximum size of each stripe of the output.
If None, {row_group_size_bytes_val}
({row_group_size_bytes_val_in_mb} MB) will be used.
If None, no limit is applied to the row group size.
row_group_size_rows: integer or None, default None
Maximum number of rows of each stripe of the output.
If None, 1000000 will be used.
Expand Down Expand Up @@ -346,10 +345,7 @@
See Also
--------
cudf.read_parquet
""".format(
row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT,
row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024,
)
"""
doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet)

_docstring_merge_parquet_filemetadata = """
Expand Down
7 changes: 2 additions & 5 deletions python/dask_cudf/dask_cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
from cudf.io import write_to_dataset
from cudf.io.parquet import _apply_post_filters, _normalize_filters
from cudf.utils.dtypes import cudf_dtype_from_pa_type
from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT


class CudfEngine(ArrowDatasetEngine):
Expand Down Expand Up @@ -341,9 +340,7 @@ def write_partition(
return_metadata=return_metadata,
statistics=kwargs.get("statistics", "ROWGROUP"),
int96_timestamps=kwargs.get("int96_timestamps", False),
row_group_size_bytes=kwargs.get(
"row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT
),
row_group_size_bytes=kwargs.get("row_group_size_bytes", None),
row_group_size_rows=kwargs.get("row_group_size_rows", None),
max_page_size_bytes=kwargs.get("max_page_size_bytes", None),
max_page_size_rows=kwargs.get("max_page_size_rows", None),
Expand All @@ -365,7 +362,7 @@ def write_partition(
statistics=kwargs.get("statistics", "ROWGROUP"),
int96_timestamps=kwargs.get("int96_timestamps", False),
row_group_size_bytes=kwargs.get(
"row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT
"row_group_size_bytes", None
),
row_group_size_rows=kwargs.get(
"row_group_size_rows", None
Expand Down

0 comments on commit d2051e7

Please sign in to comment.