Make tests deterministic (#16910)
This PR is a first pass at making tests deterministic; I noticed one of the CI jobs failed due to an overflow error related to random data generation.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: #16910
galipremsagar authored Sep 25, 2024
1 parent 987fea3 commit ba4afae
Showing 4 changed files with 22 additions and 16 deletions.
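The changes below all apply one pattern: test data that used to come from unseeded calls to NumPy's random module is now generated under an explicit seed, either by calling np.random.seed(0) before the data is built or by passing seed=0 to the data-generation helpers, so every run of a test sees identical inputs. A minimal sketch of the idea, using an illustrative helper name rather than anything from the cudf test suite:

    import numpy as np

    def make_test_array(n, seed=0):
        # A dedicated, seeded generator returns the same data on every run and
        # does not depend on global RNG state left behind by other tests.
        rng = np.random.default_rng(seed)
        return rng.random(n)

    assert (make_test_array(100) == make_test_array(100)).all()  # reproducible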
26 changes: 12 additions & 14 deletions python/cudf/cudf/tests/test_array_function.py
@@ -33,9 +33,10 @@ def __array_function__(self, *args, **kwargs):

missing_arrfunc_reason = "NEP-18 support is not available in NumPy"

np.random.seed(0)


@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
@pytest.mark.parametrize("np_ar", [np.random.random(100)])
@pytest.mark.parametrize(
"func",
[
@@ -47,7 +48,8 @@ def __array_function__(self, *args, **kwargs):
lambda x: np.linalg.norm(x),
],
)
def test_array_func_cudf_series(np_ar, func):
def test_array_func_cudf_series(func):
np_ar = np.random.random(100)
cudf_ser = cudf.Series(np_ar)
expect = func(np_ar)
got = func(cudf_ser)
@@ -58,9 +60,6 @@ def test_array_func_cudf_series(np_ar, func):


@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
@pytest.mark.parametrize(
"pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))]
)
@pytest.mark.parametrize(
"func",
[
@@ -74,17 +73,15 @@ def test_array_func_cudf_series(np_ar, func):
lambda x: np.prod(x, axis=1),
],
)
def test_array_func_cudf_dataframe(pd_df, func):
def test_array_func_cudf_dataframe(func):
pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
cudf_df = cudf.from_pandas(pd_df)
expect = func(pd_df)
got = func(cudf_df)
assert_eq(expect, got)


@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
@pytest.mark.parametrize(
"pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))]
)
@pytest.mark.parametrize(
"func",
[
@@ -93,21 +90,22 @@ def test_array_func_cudf_dataframe(pd_df, func):
lambda x: np.linalg.det(x),
],
)
def test_array_func_missing_cudf_dataframe(pd_df, func):
def test_array_func_missing_cudf_dataframe(func):
pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
cudf_df = cudf.from_pandas(pd_df)
with pytest.raises(TypeError):
func(cudf_df)


@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
@pytest.mark.parametrize("np_ar", [np.random.random(100)])
@pytest.mark.parametrize(
"func",
[
lambda x: np.unique(x),
],
)
def test_array_func_cudf_index(np_ar, func):
def test_array_func_cudf_index(func):
np_ar = np.random.random(100)
cudf_index = cudf.Index(cudf.Series(np_ar))
expect = func(np_ar)
got = func(cudf_index)
@@ -118,7 +116,6 @@ def test_array_func_cudf_index(np_ar, func):


@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
@pytest.mark.parametrize("np_ar", [np.random.random(100)])
@pytest.mark.parametrize(
"func",
[
@@ -127,7 +124,8 @@ def test_array_func_cudf_index(np_ar, func):
lambda x: np.linalg.det(x),
],
)
def test_array_func_missing_cudf_index(np_ar, func):
def test_array_func_missing_cudf_index(func):
np_ar = np.random.random(100)
cudf_index = cudf.Index(cudf.Series(np_ar))
with pytest.raises(TypeError):
func(cudf_index)
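In the original tests above, np.random.random(100) was evaluated inside @pytest.mark.parametrize, i.e. at import/collection time with no seed reliably in effect; the change seeds the module and moves the generation into each test body. A seeded fixture is another way to get the same effect (hypothetical names, shown only as a sketch, not what this PR does):

    import numpy as np
    import pytest

    @pytest.fixture
    def np_ar():
        # Built lazily at test run time from its own seeded generator, so the
        # array is identical across runs and across test orderings.
        return np.random.default_rng(0).random(100)

    def test_series_roundtrip(np_ar):
        assert np_ar.shape == (100,)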
3 changes: 2 additions & 1 deletion python/cudf/cudf/tests/test_avro.py
@@ -236,6 +236,7 @@ def test_avro_compression(rows, codec):
},
],
rows,
seed=0,
)
expected_df = cudf.DataFrame.from_arrow(df)

@@ -599,7 +600,7 @@ def test_avro_reader_multiblock(
else:
assert dtype in ("float32", "float64")
avro_type = "float" if dtype == "float32" else "double"

np.random.seed(0)
# We don't use rand_dataframe() here, because it increases the
# execution time of each test by a factor of 10 or more (it appears
# to use a very costly approach to generating random data).
7 changes: 7 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
@@ -2470,6 +2470,7 @@ def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct):
],
rows=nelem,
use_threads=False,
seed=0,
)
pdf = t.to_pandas()
pdf.columns = ["x", "y", "z"]
@@ -2602,6 +2603,7 @@ def test_groupby_shift_row_mixed_numerics(
],
rows=nelem,
use_threads=False,
seed=0,
)
pdf = t.to_pandas()
gdf = cudf.from_pandas(pdf)
@@ -2639,6 +2641,7 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction):
],
rows=nelem,
use_threads=False,
seed=0,
)
pdf = t.to_pandas()
gdf = cudf.from_pandas(pdf)
@@ -2687,6 +2690,7 @@ def test_groupby_shift_row_mixed_fill(
],
rows=nelem,
use_threads=False,
seed=0,
)
pdf = t.to_pandas()
gdf = cudf.from_pandas(pdf)
@@ -2732,6 +2736,7 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
],
rows=nelem,
use_threads=False,
seed=0,
)
gdf = cudf.from_pandas(t.to_pandas())

@@ -2782,6 +2787,7 @@ def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction):
],
rows=nelem,
use_threads=False,
seed=0,
)
pdf = t.to_pandas()
gdf = cudf.from_pandas(pdf)
@@ -2815,6 +2821,7 @@ def test_groupby_diff_row_zero_shift(nelem):
],
rows=nelem,
use_threads=False,
seed=0,
)
gdf = cudf.from_pandas(t.to_pandas())

2 changes: 1 addition & 1 deletion python/dask_cudf/dask_cudf/tests/test_reductions.py
@@ -13,6 +13,7 @@


def _make_random_frame(nelem, npartitions=2):
np.random.seed(0)
df = pd.DataFrame(
{
"x": np.random.randint(0, 5, size=nelem),
@@ -38,7 +39,6 @@ def wrapped(series):
@pytest.mark.parametrize("reducer", _reducers)
def test_series_reduce(reducer):
reducer = _get_reduce_fn(reducer)
np.random.seed(0)
size = 10
df, gdf = _make_random_frame(size)

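Here the seed moves from the individual test into _make_random_frame itself, so every test that requests a frame through the helper gets reproducible data rather than relying on each caller to seed first. A rough illustration of that placement (a simplified stand-in, not the dask_cudf helper):

    import numpy as np

    def make_data(nelem):
        # Seeding inside the helper, as this diff does, pins the data for every
        # caller; note that np.random.seed sets global state, so a local
        # np.random.default_rng(0) is an alternative that avoids touching it.
        np.random.seed(0)
        return np.random.randint(0, 5, size=nelem)

    assert (make_data(10) == make_data(10)).all()  # same data on every call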
