From 0c0c7e6c82820ea223ee2a4abf63923e3eae2e25 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 9 Feb 2024 18:23:12 -0600 Subject: [PATCH] Add `future_stack` to `DataFrame.stack` (#15015) This PR introduces `future_stack` to `stack` API. This also means deprecating `dropna`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15015 --- python/cudf/cudf/core/dataframe.py | 47 +++++++++++++++++++++----- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/tests/test_reshape.py | 33 +++++++++++++++--- 3 files changed, 69 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 727d5135297..1a6376d1c00 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6711,7 +6711,7 @@ def to_orc( ) @_cudf_nvtx_annotate - def stack(self, level=-1, dropna=True): + def stack(self, level=-1, dropna=no_default, future_stack=False): """Stack the prescribed level(s) from columns to index Return a reshaped DataFrame or Series having a multi-level @@ -6843,6 +6843,23 @@ def stack(self, level=-1, dropna=True): weight kg 3.0 dtype: float64 """ + if future_stack: + if dropna is not no_default: + raise ValueError( + "dropna must be unspecified with future_stack=True as the new " + "implementation does not introduce rows of NA values. This " + "argument will be removed in a future version of cudf." + ) + else: + if dropna is not no_default or self._data.nlevels > 1: + warnings.warn( + "The previous implementation of stack is deprecated and will be " + "removed in a future version of cudf. Specify future_stack=True " + "to adopt the new implementation and silence this warning.", + FutureWarning, + ) + if dropna is no_default: + dropna = True if isinstance(level, (int, str)): level = [level] @@ -6858,7 +6875,7 @@ def stack(self, level=-1, dropna=True): level = [level] if not isinstance(level, list) else level - if len(level) > 1 and not dropna: + if not future_stack and len(level) > 1 and not dropna: raise NotImplementedError( "When stacking multiple levels, setting `dropna` to False " "will generate new column combination that does not exist " @@ -6900,7 +6917,9 @@ def stack(self, level=-1, dropna=True): # Since `level` may only specify a subset of all levels, `unique()` is # required to remove duplicates. In pandas, the order of the keys in # the specified levels are always sorted. - unique_named_levels = named_levels.unique().sort_values() + unique_named_levels = named_levels.unique() + if not future_stack: + unique_named_levels = unique_named_levels.sort_values() # Each index from the original dataframe should repeat by the number # of unique values in the named_levels @@ -6949,11 +6968,19 @@ def unnamed_group_generator(): # `unique_named_levels` assigns -1 to these key # combinations, representing an all-null column that # is used in the subsequent libcudf call. - yield grpdf.reindex( - unique_named_levels, axis=0, fill_value=-1 - ).sort_index().values + if future_stack: + yield grpdf.reindex( + unique_named_levels, axis=0, fill_value=-1 + ).values + else: + yield grpdf.reindex( + unique_named_levels, axis=0, fill_value=-1 + ).sort_index().values else: - yield column_idx_df.sort_index().values + if future_stack: + yield column_idx_df.values + else: + yield column_idx_df.sort_index().values column_indices = list(unnamed_group_generator()) @@ -7004,6 +7031,10 @@ def unnamed_group_generator(): [ stacked[i] for i in unnamed_level_values.argsort().argsort() + ] + if not future_stack + else [ + stacked[i] for i in unnamed_level_values.argsort() ], ) ), @@ -7013,7 +7044,7 @@ def unnamed_group_generator(): result = DataFrame._from_data(data, index=new_index) - if dropna: + if not future_stack and dropna: return result.dropna(how="all") else: return result diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 2ea538d66a1..656db855253 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1120,7 +1120,7 @@ def unstack(df, level, fill_value=None): "Calling unstack() on single index dataframe" " with different column datatype is not supported." ) - res = df.T.stack(dropna=False) + res = df.T.stack(future_stack=False) # Result's index is a multiindex res.index.names = ( tuple(df._data.to_pandas_index().names) + df.index.names diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b49a921e812..59c5a0662be 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -9,12 +9,14 @@ import cudf from cudf import melt as cudf_melt +from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220 from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, assert_eq, + expect_warning_if, ) pytest_xfail = pytest.mark.xfail @@ -153,6 +155,10 @@ def test_df_stack_reset_index(): assert_eq(expected, actual) +@pytest.mark.skipif( + not PANDAS_GE_210, + reason="Need pandas-2.1.0+ to match `stack` api", +) @pytest.mark.parametrize( "columns", [ @@ -206,8 +212,15 @@ def test_df_stack_multiindex_column_axis(columns, index, level, dropna): ) gdf = cudf.from_pandas(pdf) - got = gdf.stack(level=level, dropna=dropna) - expect = pdf.stack(level=level, dropna=dropna) + with pytest.warns(FutureWarning): + got = gdf.stack(level=level, dropna=dropna, future_stack=False) + with expect_warning_if(PANDAS_GE_220): + expect = pdf.stack(level=level, dropna=dropna, future_stack=False) + + assert_eq(expect, got, check_dtype=False) + + got = gdf.stack(level=level, future_stack=True) + expect = pdf.stack(level=level, future_stack=True) assert_eq(expect, got, check_dtype=False) @@ -228,6 +241,10 @@ def test_df_stack_mixed_dtypes(): assert_eq(expect, got, check_dtype=False) +@pytest.mark.skipif( + not PANDAS_GE_210, + reason="Need pandas-2.1.0+ to match `stack` api", +) @pytest.mark.parametrize("level", [["animal", "hair_length"], [1, 2]]) def test_df_stack_multiindex_column_axis_pd_example(level): columns = pd.MultiIndex.from_tuples( @@ -242,8 +259,16 @@ def test_df_stack_multiindex_column_axis_pd_example(level): df = pd.DataFrame(np.random.randn(4, 4), columns=columns) - expect = df.stack(level=level) - got = cudf.from_pandas(df).stack(level=level) + with expect_warning_if(PANDAS_GE_220): + expect = df.stack(level=level, future_stack=False) + gdf = cudf.from_pandas(df) + with pytest.warns(FutureWarning): + got = gdf.stack(level=level, future_stack=False) + + assert_eq(expect, got) + + expect = df.stack(level=level, future_stack=True) + got = gdf.stack(level=level, future_stack=True) assert_eq(expect, got)