Add future_stack to DataFrame.stack (rapidsai#15015)

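This PR introduces the `future_stack` parameter to the `DataFrame.stack` API. As a consequence, the `dropna` parameter is deprecated: it must be left unspecified when `future_stack=True`, and the previous implementation now emits a `FutureWarning` when `dropna` is passed or when the columns have more than one level. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: rapidsai#15015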
singhmanas1 · Feb 10, 2024 · 0c0c7e6 · 0c0c7e6
1 parent e596480
commit 0c0c7e6
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 13 deletions.
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -6711,7 +6711,7 @@ def to_orc(
         )
 
     @_cudf_nvtx_annotate
-    def stack(self, level=-1, dropna=True):
+    def stack(self, level=-1, dropna=no_default, future_stack=False):
         """Stack the prescribed level(s) from columns to index
 
         Return a reshaped DataFrame or Series having a multi-level
@@ -6843,6 +6843,23 @@ def stack(self, level=-1, dropna=True):
              weight  kg    3.0
         dtype: float64
         """
+        if future_stack:
+            if dropna is not no_default:
+                raise ValueError(
+                    "dropna must be unspecified with future_stack=True as the new "
+                    "implementation does not introduce rows of NA values. This "
+                    "argument will be removed in a future version of cudf."
+                )
+        else:
+            if dropna is not no_default or self._data.nlevels > 1:
+                warnings.warn(
+                    "The previous implementation of stack is deprecated and will be "
+                    "removed in a future version of cudf. Specify future_stack=True "
+                    "to adopt the new implementation and silence this warning.",
+                    FutureWarning,
+                )
+            if dropna is no_default:
+                dropna = True
 
         if isinstance(level, (int, str)):
             level = [level]
@@ -6858,7 +6875,7 @@ def stack(self, level=-1, dropna=True):
 
         level = [level] if not isinstance(level, list) else level
 
-        if len(level) > 1 and not dropna:
+        if not future_stack and len(level) > 1 and not dropna:
             raise NotImplementedError(
                 "When stacking multiple levels, setting `dropna` to False "
                 "will generate new column combination that does not exist "
@@ -6900,7 +6917,9 @@ def stack(self, level=-1, dropna=True):
         # Since `level` may only specify a subset of all levels, `unique()` is
         # required to remove duplicates. In pandas, the order of the keys in
         # the specified levels are always sorted.
-        unique_named_levels = named_levels.unique().sort_values()
+        unique_named_levels = named_levels.unique()
+        if not future_stack:
+            unique_named_levels = unique_named_levels.sort_values()
 
         # Each index from the original dataframe should repeat by the number
         # of unique values in the named_levels
@@ -6949,11 +6968,19 @@ def unnamed_group_generator():
                     # `unique_named_levels` assigns -1 to these key
                     # combinations, representing an all-null column that
                     # is used in the subsequent libcudf call.
-                    yield grpdf.reindex(
-                        unique_named_levels, axis=0, fill_value=-1
-                    ).sort_index().values
+                    if future_stack:
+                        yield grpdf.reindex(
+                            unique_named_levels, axis=0, fill_value=-1
+                        ).values
+                    else:
+                        yield grpdf.reindex(
+                            unique_named_levels, axis=0, fill_value=-1
+                        ).sort_index().values
             else:
-                yield column_idx_df.sort_index().values
+                if future_stack:
+                    yield column_idx_df.values
+                else:
+                    yield column_idx_df.sort_index().values
 
         column_indices = list(unnamed_group_generator())
 
@@ -7004,6 +7031,10 @@ def unnamed_group_generator():
                         [
                             stacked[i]
                             for i in unnamed_level_values.argsort().argsort()
+                        ]
+                        if not future_stack
+                        else [
+                            stacked[i] for i in unnamed_level_values.argsort()
                         ],
                     )
                 ),
@@ -7013,7 +7044,7 @@ def unnamed_group_generator():
 
             result = DataFrame._from_data(data, index=new_index)
 
-        if dropna:
+        if not future_stack and dropna:
             return result.dropna(how="all")
         else:
             return result

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
@@ -1120,7 +1120,7 @@ def unstack(df, level, fill_value=None):
                     "Calling unstack() on single index dataframe"
                     " with different column datatype is not supported."
                 )
-        res = df.T.stack(dropna=False)
+        res = df.T.stack(future_stack=False)
         # Result's index is a multiindex
         res.index.names = (
             tuple(df._data.to_pandas_index().names) + df.index.names

diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
@@ -9,12 +9,14 @@
 
 import cudf
 from cudf import melt as cudf_melt
+from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.testing._utils import (
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
     assert_eq,
+    expect_warning_if,
 )
 
 pytest_xfail = pytest.mark.xfail
@@ -153,6 +155,10 @@ def test_df_stack_reset_index():
     assert_eq(expected, actual)
 
 
+@pytest.mark.skipif(
+    not PANDAS_GE_210,
+    reason="Need pandas-2.1.0+ to match `stack` api",
+)
 @pytest.mark.parametrize(
     "columns",
     [
@@ -206,8 +212,15 @@ def test_df_stack_multiindex_column_axis(columns, index, level, dropna):
     )
     gdf = cudf.from_pandas(pdf)
 
-    got = gdf.stack(level=level, dropna=dropna)
-    expect = pdf.stack(level=level, dropna=dropna)
+    with pytest.warns(FutureWarning):
+        got = gdf.stack(level=level, dropna=dropna, future_stack=False)
+    with expect_warning_if(PANDAS_GE_220):
+        expect = pdf.stack(level=level, dropna=dropna, future_stack=False)
+
+    assert_eq(expect, got, check_dtype=False)
+
+    got = gdf.stack(level=level, future_stack=True)
+    expect = pdf.stack(level=level, future_stack=True)
 
     assert_eq(expect, got, check_dtype=False)
 
@@ -228,6 +241,10 @@ def test_df_stack_mixed_dtypes():
     assert_eq(expect, got, check_dtype=False)
 
 
+@pytest.mark.skipif(
+    not PANDAS_GE_210,
+    reason="Need pandas-2.1.0+ to match `stack` api",
+)
 @pytest.mark.parametrize("level", [["animal", "hair_length"], [1, 2]])
 def test_df_stack_multiindex_column_axis_pd_example(level):
     columns = pd.MultiIndex.from_tuples(
@@ -242,8 +259,16 @@ def test_df_stack_multiindex_column_axis_pd_example(level):
 
     df = pd.DataFrame(np.random.randn(4, 4), columns=columns)
 
-    expect = df.stack(level=level)
-    got = cudf.from_pandas(df).stack(level=level)
+    with expect_warning_if(PANDAS_GE_220):
+        expect = df.stack(level=level, future_stack=False)
+    gdf = cudf.from_pandas(df)
+    with pytest.warns(FutureWarning):
+        got = gdf.stack(level=level, future_stack=False)
+
+    assert_eq(expect, got)
+
+    expect = df.stack(level=level, future_stack=True)
+    got = gdf.stack(level=level, future_stack=True)
 
     assert_eq(expect, got)