From f932bf9c62f73aabee2ac094180036399ce88dcf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 29 Aug 2024 15:28:37 -1000 Subject: [PATCH] Fix Series.to_frame(name=None) setting a None name (#16698) In pandas 2.0, `to_frame(name=None)` allowed the resulting column name to be `None` (see https://github.com/pandas-dev/pandas/pull/45523). Judging by the current default of `cudf.Series.to_frame`, cudf did not yet reflect this behavior. Additionally, this change adds a `SingleColumnFrame._to_frame` helper to more easily share the logic between `Series.to_frame` and `Index.to_frame`. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16698 --- python/cudf/cudf/core/_base_index.py | 58 -------------------- python/cudf/cudf/core/index.py | 57 +++++++++++++++++++ python/cudf/cudf/core/series.py | 12 +--- python/cudf/cudf/core/single_column_frame.py | 11 ++++ python/cudf/cudf/tests/test_series.py | 7 +++ 5 files changed, 77 insertions(+), 68 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a224e0ce0d0..ff114474aa4 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -798,64 +798,6 @@ def fillna(self, value, downcast=None): return super().fillna(value=value) - def to_frame(self, index=True, name=no_default): - """Create a DataFrame with a column containing this Index - - Parameters - ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index - name : object, defaults to index.name - The passed name should substitute for the index name (if it has - one). - - Returns - ------- - DataFrame - DataFrame containing the original Index data. - - See Also - -------- - Index.to_series : Convert an Index to a Series. - Series.to_frame : Convert Series to DataFrame. 
- - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal') - >>> idx.to_frame() - animal - animal - Ant Ant - Bear Bear - Cow Cow - - By default, the original Index is reused. To enforce a new Index: - - >>> idx.to_frame(index=False) - animal - 0 Ant - 1 Bear - 2 Cow - - To override the name of the resulting column, specify `name`: - - >>> idx.to_frame(index=False, name='zoo') - zoo - 0 Ant - 1 Bear - 2 Cow - """ - - if name is no_default: - col_name = 0 if self.name is None else self.name - else: - col_name = name - - return cudf.DataFrame( - {col_name: self._values}, index=self if index else None - ) - def to_arrow(self): """Convert to a suitable Arrow object.""" raise NotImplementedError diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 66d03682de4..b2bd20c4982 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -529,6 +529,11 @@ def to_pandas( name=self.name, ) + def to_frame( + self, index: bool = True, name: Hashable = no_default + ) -> cudf.DataFrame: + return self._as_int_index().to_frame(index=index, name=name) + @property def is_unique(self) -> bool: return True @@ -1646,6 +1651,58 @@ def to_pandas( result.name = self.name return result + def to_frame( + self, index: bool = True, name: Hashable = no_default + ) -> cudf.DataFrame: + """Create a DataFrame with a column containing this Index + + Parameters + ---------- + index : boolean, default True + Set the index of the returned DataFrame as the original Index + name : object, defaults to index.name + The passed name should substitute for the index name (if it has + one). + + Returns + ------- + DataFrame + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. 
+ + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + return self._to_frame(name=name, index=self if index else None) + def append(self, other): if is_list_like(other): to_concat = [self] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 837c6872258..aadbd80f4b4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1160,7 +1160,7 @@ def reset_index( ) @_performance_tracking - def to_frame(self, name=None): + def to_frame(self, name: abc.Hashable = no_default) -> cudf.DataFrame: """Convert Series into a DataFrame Parameters @@ -1192,15 +1192,7 @@ def to_frame(self, name=None): 13 15 d """ # noqa: E501 - - if name is not None: - col = name - elif self.name is None: - col = 0 - else: - col = self.name - - return cudf.DataFrame({col: self._column}, index=self.index) + return self._to_frame(name=name, index=self.index) @_performance_tracking def memory_usage(self, index=True, deep=False): diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 55dda34a576..0e66f383ca0 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -158,6 +158,17 @@ def to_arrow(self) -> pa.Array: """ return self._column.to_arrow() + def _to_frame( + self, name: Hashable, index: cudf.Index | None + ) -> cudf.DataFrame: + """Helper function for Series.to_frame, Index.to_frame""" + if name is no_default: + col_name = 0 if self.name is None else self.name + else: + col_name = name + ca = ColumnAccessor({col_name: 
self._column}, verify=False) + return cudf.DataFrame._from_data(ca, index=index) + @property # type: ignore @_performance_tracking def is_unique(self) -> bool: diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8d673e23ab2..a24002dc38e 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2557,6 +2557,13 @@ def test_series_arrow_list_types_roundtrip(): cudf.from_pandas(pdf) +@pytest.mark.parametrize("base_name", [None, "a"]) +def test_series_to_frame_none_name(base_name): + result = cudf.Series(range(1), name=base_name).to_frame(name=None) + expected = pd.Series(range(1), name=base_name).to_frame(name=None) + assert_eq(result, expected) + + @pytest.mark.parametrize("klass", [cudf.Index, cudf.Series]) @pytest.mark.parametrize( "data", [pa.array([float("nan")]), pa.chunked_array([[float("nan")]])]