diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 52feacd5863..8518f427887 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -438,7 +438,7 @@ def write_parquet( object statistics="ROWGROUP", object metadata_file_path=None, object int96_timestamps=False, - object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + object row_group_size_bytes=None, object row_group_size_rows=None, object max_page_size_bytes=None, object max_page_size_rows=None, @@ -616,9 +616,9 @@ cdef class ParquetWriter: Name of the compression to use. Use ``None`` for no compression. statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' Level at which column statistics should be included in file. - row_group_size_bytes: int, default 18446744073709551615 + row_group_size_bytes: int, default None Maximum size of each stripe of the output. - By default, an infinite value equal to uint64 max (~18446744074GB) will be used. + By default, no limit on row group stripe size will be used. row_group_size_rows: int, default 1000000 Maximum number of rows of each stripe of the output. By default, 1000000 (10^6 rows) will be used. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 58a16a6d504..d73ad8225ca 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6840,7 +6840,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 62be7378e9e..ce99f98b559 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -64,7 +64,7 @@ def _write_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -149,7 +149,7 @@ def write_to_dataset( return_metadata=False, statistics="ROWGROUP", int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -205,7 +205,7 @@ def write_to_dataset( If ``False``, timestamps will not be altered. row_group_size_bytes: integer or None, default None Maximum size of each stripe of the output. - If None, 134217728 (128MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -980,7 +980,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1780fbc93b7..1180da321e6 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -275,10 +275,9 @@ timestamp[us] to the int96 format, which is the number of Julian days and the number of nanoseconds since midnight of 1970-01-01. If ``False``, timestamps will not be altered. -row_group_size_bytes: integer, default {row_group_size_bytes_val} +row_group_size_bytes: integer, default None Maximum size of each stripe of the output. - If None, {row_group_size_bytes_val} - ({row_group_size_bytes_val_in_mb} MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -346,10 +345,7 @@ See Also -------- cudf.read_parquet -""".format( - row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT, - row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024, -) +""" doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) _docstring_merge_parquet_filemetadata = """ diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index e793d4381d1..a781b8242fe 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -23,7 +23,6 @@ from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT class CudfEngine(ArrowDatasetEngine): @@ -341,9 +340,7 @@ def write_partition( return_metadata=return_metadata, statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT - ), + row_group_size_bytes=kwargs.get("row_group_size_bytes", None), row_group_size_rows=kwargs.get("row_group_size_rows", None), max_page_size_bytes=kwargs.get("max_page_size_bytes", None), max_page_size_rows=kwargs.get("max_page_size_rows", None), @@ -365,7 +362,7 @@ def write_partition( statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT + "row_group_size_bytes", None ), row_group_size_rows=kwargs.get( "row_group_size_rows", None