From c1689dcc79502716ddd68813e0ecbade06476b18 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Sep 2023 10:40:13 -0700 Subject: [PATCH 1/9] deterministic default seed in randomdata --- python/cudf/cudf/datasets.py | 4 ++-- python/cudf/cudf/tests/test_orc.py | 6 +++--- python/dask_cudf/dask_cudf/io/tests/test_orc.py | 6 ++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index c6091ab60fc..a380b83232a 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -80,7 +80,7 @@ def timeseries( return gdf -def randomdata(nrows=10, dtypes=None, seed=None): +def randomdata(nrows=10, dtypes=None, seed=1): """Create a dataframe with random data Parameters diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index aafc8831bf4..24bff5a948c 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -468,7 +468,7 @@ def test_chunked_orc_writer( def test_orc_writer_strings(tmpdir, dtypes): gdf_fname = tmpdir.join("gdf_strings.orc") - expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) + expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes) expect.to_orc(gdf_fname) got = pd.read_orc(gdf_fname) @@ -487,7 +487,7 @@ def test_orc_writer_strings(tmpdir, dtypes): def test_chunked_orc_writer_strings(tmpdir, dtypes): gdf_fname = tmpdir.join("chunked_gdf_strings.orc") - gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) + gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes) pdf = gdf.to_pandas() expect = pd.concat([pdf, pdf]).reset_index(drop=True) writer = ORCWriter(gdf_fname) @@ -1661,7 +1661,7 @@ def test_writer_protobuf_large_rowindexentry(): @pytest.mark.parametrize("compression", ["ZLIB", "ZSTD"]) def test_orc_writer_nvcomp(compression): expected = cudf.datasets.randomdata( - nrows=12345, dtypes={"a": int, "b": str, "c": float}, seed=1 + nrows=12345, dtypes={"a": int, "b": str, "c": float} ) buff = BytesIO() diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index 5565a44c7d8..52adf3e90c0 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import glob import os @@ -85,7 +85,6 @@ def test_read_orc_filtered(tmpdir, engine, predicate, expected_len): def test_read_orc_first_file_empty(tmpdir): - # Write a 3-file dataset where the first file is empty # See: https://github.com/rapidsai/cudf/issues/8011 path = str(tmpdir) @@ -112,9 +111,8 @@ def test_read_orc_first_file_empty(tmpdir): ], ) def test_to_orc(tmpdir, dtypes, compression, compute): - # Create cudf and dask_cudf dataframes - df = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) + df = cudf.datasets.randomdata(nrows=10, dtypes=dtypes) df = df.set_index("index").sort_index() ddf = dask_cudf.from_cudf(df, npartitions=3) From aa8d4c894602d7df6d4f356f705070af57d41734 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Sep 2023 11:04:49 -0700 Subject: [PATCH 2/9] deterministic set_random_null_mask_inplace --- python/cudf/cudf/testing/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 0489329d801..4e992e09b32 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -49,7 +49,7 @@ ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES)) -def set_random_null_mask_inplace(series, null_probability=0.5, seed=None): +def set_random_null_mask_inplace(series, null_probability=0.5, seed=1): """Randomly nullify elements in series with the provided probability.""" probs = [null_probability, 1 - null_probability] rng = np.random.default_rng(seed=seed) From 52f439a923f3dd22a11d39b606aefec4a36e33f4 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Sep 2023 11:41:35 -0700 Subject: [PATCH 3/9] deterministic rand_dataframe --- python/cudf/cudf/testing/dataset_generator.py | 11 ++++------- python/cudf/cudf/tests/test_groupby.py | 3 --- python/cudf/cudf/tests/test_parquet.py | 2 -- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 1ba205275f3..ca14490b029 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -71,7 +71,7 @@ class Parameters: Number of rows to generate column_parameters : List[ColumnParams] ColumnParams for each column - seed : int or None, default None + seed : int, default 1 Seed for random data generation """ @@ -79,7 +79,7 @@ def __init__( self, num_rows=2048, column_parameters=None, - seed=None, + seed=1, ): self.num_rows = num_rows if column_parameters is None: @@ -232,8 +232,7 @@ def generate( def get_dataframe(parameters, use_threads): # Initialize seeds - if parameters.seed is not None: - np.random.seed(parameters.seed) + np.random.seed(parameters.seed) # For each column, use a generic Mimesis producer to create an Iterable # for generating data @@ -312,9 +311,7 @@ def get_dataframe(parameters, use_threads): return tbl -def rand_dataframe( - dtypes_meta, rows, seed=random.randint(0, 2**32 - 1), use_threads=True -): +def rand_dataframe(dtypes_meta, rows, seed=1, use_threads=True): """ Generates a random table. diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 042f0e1aa38..f0d0fbd9a31 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2550,7 +2550,6 @@ def test_groupby_fillna_multi_value(nelem): ], rows=nelem, use_threads=False, - seed=0, ) key_col = "0" value_cols = ["1", "2", "3", "4", "5", "6"] @@ -2595,7 +2594,6 @@ def test_groupby_fillna_multi_value_df(nelem): ], rows=nelem, use_threads=False, - seed=0, ) key_col = "0" value_cols = ["1", "2", "3", "4", "5"] @@ -2671,7 +2669,6 @@ def test_groupby_fillna_method(nelem, method): ], rows=nelem, use_threads=False, - seed=0, ) key_col = "0" value_cols = ["1", "2", "3", "4", "5", "6", "7", "8"] diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b892cc62ac4..7b1d5aef641 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1316,7 +1316,6 @@ def test_delta_binary(nrows, add_nulls, tmpdir): }, ], rows=nrows, - seed=0, use_threads=False, ) # Roundabout conversion to pandas to preserve nulls/data types @@ -1469,7 +1468,6 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): def test_multifile_parquet_folder(tmpdir): - test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2) test_pdf2 = make_pdf(nrows=20) expect = pd.concat([test_pdf1, test_pdf2]) From 10e9e416d4d2bf590a8f8463c01d1aee7a13846e Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Sep 2023 11:41:50 -0700 Subject: [PATCH 4/9] deterministic timeseries --- python/cudf/cudf/datasets.py | 2 +- python/cudf/cudf/tests/test_datasets.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index a380b83232a..06e06d9277f 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -18,7 +18,7 @@ def timeseries( freq="1s", dtypes=None, nulls_frequency=0, - seed=None, + seed=1, ): """Create timeseries dataframe with random data diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index 98f801d0cba..9c27a903e12 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -8,10 +8,10 @@ def test_dataset_timeseries(): gdf1 = gd.datasets.timeseries( - dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1 + dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3 ) gdf2 = gd.datasets.timeseries( - dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1 + dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3 ) assert_eq(gdf1, gdf2) @@ -26,7 +26,6 @@ def test_dataset_timeseries(): freq="2H", dtypes={"value": float, "name": "category", "id": int}, nulls_frequency=0.7, - seed=1, ) assert gdf["value"].head().dtype == float From a87641f235fe7ef325ce20ec17370954054a694f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Sep 2023 11:43:33 -0700 Subject: [PATCH 5/9] style --- python/cudf/cudf/tests/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index 9c27a903e12..71d76a268c2 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import numpy as np From 515ceac15207c25075e91c46c69dc5bef32d9446 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Sep 2023 12:10:49 -0700 Subject: [PATCH 6/9] deterministic sample --- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b300c55b537..bcb8d36171f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -950,7 +950,7 @@ def sample( frac: Optional[float] = None, replace: bool = False, weights: Union[abc.Sequence, "cudf.Series", None] = None, - random_state: Union[np.random.RandomState, int, None] = None, + random_state: Union[np.random.RandomState, int, None] = 1, ): """Return a random sample of items in each group. diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 69b25c51a66..afb10f77afb 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3346,7 +3346,7 @@ def sample( frac=None, replace=False, weights=None, - random_state=None, + random_state=1, axis=None, ignore_index=False, ): @@ -3387,7 +3387,7 @@ def sample( equal to the number of rows to sample from, and will be normalized to have a sum of 1. Unlike pandas, index alignment is not currently not performed. - random_state : int, numpy/cupy RandomState, or None, default None + random_state : int, numpy/cupy RandomState, or None, default 1 If None, default cupy random state is chosen. If int, the seed for the default cupy random state. If RandomState, rows-to-sample are generated from the RandomState. From 6bcd0e88f778eb7ba91b82b6e25a2ca7654fc2e0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Sep 2023 12:29:57 -0700 Subject: [PATCH 7/9] allow None seed in get_dataframe --- python/cudf/cudf/testing/dataset_generator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index ca14490b029..e75871a4855 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -71,7 +71,7 @@ class Parameters: Number of rows to generate column_parameters : List[ColumnParams] ColumnParams for each column - seed : int, default 1 + seed : int or None, default 1 Seed for random data generation """ @@ -232,7 +232,8 @@ def generate( def get_dataframe(parameters, use_threads): # Initialize seeds - np.random.seed(parameters.seed) + if parameters.seed is not None: + np.random.seed(parameters.seed) # For each column, use a generic Mimesis producer to create an Iterable # for generating data From 6e357587fa2eca8ae565fd9aaa1b460e5b81992a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Sep 2023 15:20:21 -0700 Subject: [PATCH 8/9] make sure masks are unique in test_binary_ufunc_series_array --- python/cudf/cudf/tests/test_array_ufunc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 81950bb8bde..d2bfca177d7 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -181,8 +181,8 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed): # Converting nullable integer cudf.Series to pandas will produce a # float pd.Series, so instead we replace nulls with an arbitrary # integer value, precompute the mask, and then reapply it afterwards. - for arg in args: - set_random_null_mask_inplace(arg) + for idx, arg in enumerate(args): + set_random_null_mask_inplace(arg, seed=idx) pandas_args = [arg.fillna(0) for arg in args] # Note: Different indexes must be aligned before the mask is computed. From fbd023dd4e349c256b465a47fdacd052573b7068 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Sep 2023 16:49:45 -0700 Subject: [PATCH 9/9] make sure masks are unique in a few more spots --- python/cudf/cudf/tests/test_array_ufunc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index d2bfca177d7..ed5934402c5 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -261,8 +261,8 @@ def test_binary_ufunc_series_array( # Converting nullable integer cudf.Series to pandas will produce a # float pd.Series, so instead we replace nulls with an arbitrary # integer value, precompute the mask, and then reapply it afterwards. - for arg in args: - set_random_null_mask_inplace(arg) + for idx, arg in enumerate(args): + set_random_null_mask_inplace(arg, seed=idx) # Cupy doesn't support nulls, so we fill with nans before converting. args[1] = args[1].fillna(cp.nan) @@ -403,8 +403,8 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): # Converting nullable integer cudf.Series to pandas will produce a # float pd.Series, so instead we replace nulls with an arbitrary # integer value, precompute the mask, and then reapply it afterwards. - for arg in args: - set_random_null_mask_inplace(arg["foo"]) + for idx, arg in enumerate(args): + set_random_null_mask_inplace(arg["foo"], seed=idx) pandas_args = [arg.copy() for arg in args] for arg in pandas_args: arg["foo"] = arg["foo"].fillna(0)