From bdcb19276226871762b76409f2f0df5a26163da7 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 21 May 2024 21:40:52 +0530 Subject: [PATCH 01/66] DOC: add RT03 for pandas.interval_range (#58800) --- ci/code_checks.sh | 1 - pandas/core/indexes/interval.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ec46e043c8294..a93e23a0f5022 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -441,7 +441,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.UnsupportedFunctionCall SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ - -i "pandas.interval_range RT03" \ -i "pandas.io.formats.style.Styler.apply RT03" \ -i "pandas.io.formats.style.Styler.apply_index RT03" \ -i "pandas.io.formats.style.Styler.background_gradient RT03" \ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 92c399da21ce6..359cdf880937b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1138,6 +1138,7 @@ def interval_range( Returns ------- IntervalIndex + Object with a fixed frequency. See Also -------- From b9912740f4c788e5f0b63d28a3bfdb4f8b8d34e7 Mon Sep 17 00:00:00 2001 From: renanffernando <64710719+renanffernando@users.noreply.github.com> Date: Tue, 21 May 2024 12:13:01 -0400 Subject: [PATCH 02/66] BUG: unstack with sort=False fails when used with the level parameter… (#56357) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * BUG: unstack with sort=False fails when used with the level parameter (#54987) Assign new codes to labels when sort=False. This is done so that the data appears to be already sorted, fixing the bug.
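A minimal reproduction sketch, adapted from the regression test added in this patch (the variable names here are illustrative, not part of the change itself):

    import numpy as np
    import pandas as pd

    index = pd.MultiIndex.from_tuples(
        [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")]
    )
    ser = pd.Series(np.arange(1.0, 5.0), index=index)
    # Unstacking a level of a non-lexsorted MultiIndex with sort=False
    # previously produced misaligned values; factorizing the codes first
    # makes the labels behave as if already sorted.
    result = ser.unstack(level=0, sort=False)
    # Columns now appear in order of first occurrence: ["two", "one"]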
* Minor refactor and cleanup * Cleanup & remove test * whatsnew * Revert test removal --------- Co-authored-by: richard Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/reshape/reshape.py | 20 +++++++++++++------- pandas/tests/frame/test_stack_unstack.py | 15 +++++++++++++++ pandas/tests/reshape/test_pivot.py | 3 +-- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 731406394ed46..a15da861cfbec 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -477,7 +477,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) -- +- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 7cf2e360a1d01..5426c72a356d6 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -168,6 +168,9 @@ def _indexer_and_to_sort( v = self.level codes = list(self.index.codes) + if not self.sort: + # Create new codes considering that labels are already sorted + codes = [factorize(code)[0] for code in codes] levs = list(self.index.levels) to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) @@ -186,12 +189,9 @@ def sorted_labels(self) -> list[np.ndarray]: return to_sort def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: - if self.sort: - indexer, _ = self._indexer_and_to_sort - - sorted_values = algos.take_nd(values, indexer, axis=0) - return sorted_values - return values + indexer, _ = self._indexer_and_to_sort + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values def _make_selectors(self) -> None: new_levels = self.new_index_levels @@ -394,7 +394,13 @@ def _repeater(self) -> np.ndarray: @cache_readonly def new_index(self) -> MultiIndex | Index: # Does not depend on values or value_columns - result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] + if self.sort: + labels = self.sorted_labels[:-1] + else: + v = self.level + codes = list(self.index.codes) + labels = codes[:v] + codes[v + 1 :] + result_codes = [lab.take(self.compressor) for lab in labels] # construct the new index if len(self.new_index_levels) == 1: diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 03db284d892e3..a3a1da6e57cb0 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1321,6 +1321,21 @@ def test_unstack_sort_false(frame_or_series, dtype): [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")] ) obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype) + + result = obj.unstack(level=0, sort=False) + + if frame_or_series is DataFrame: + expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")]) + else: + expected_columns = ["two", "one"] + expected = DataFrame( + [[1.0, 3.0], [2.0, 4.0]], + index=MultiIndex.from_tuples([("z", "b"), ("y", "a")]), + columns=expected_columns, + dtype=dtype, + ) + tm.assert_frame_equal(result, expected) + result = obj.unstack(level=-1, sort=False) if frame_or_series is DataFrame: diff --git 
a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 97f06b0e379f4..4a13c1f5e1167 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2705,7 +2705,7 @@ def test_pivot_table_with_margins_and_numeric_column_names(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("m", [1, 10]) - def test_unstack_shares_memory(self, m): + def test_unstack_copy(self, m): # GH#56633 levels = np.arange(m) index = MultiIndex.from_product([levels] * 2) @@ -2713,6 +2713,5 @@ def test_unstack_shares_memory(self, m): df = DataFrame(values, index, np.arange(100)) df_orig = df.copy() result = df.unstack(sort=False) - assert np.shares_memory(df._values, result._values) is (m == 1) result.iloc[0, 0] = -1 tm.assert_frame_equal(df, df_orig) From 2aa155ae1cf019e902441463d1af873eaa3c803b Mon Sep 17 00:00:00 2001 From: ananiavito <48645073+ananiavito@users.noreply.github.com> Date: Tue, 21 May 2024 18:15:38 +0200 Subject: [PATCH 03/66] DOC: fix typos in User Guide (#58801) * DOC: Fix typo in merging.rst - fix typo in the User Guide * DOC: Fix typo in timeseries.rst * DOC: Fix typo in style.ipynb --- doc/source/user_guide/style.ipynb | 2 +- doc/source/user_guide/timeseries.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 43da43a983429..04ba3e5be8ff7 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -211,7 +211,7 @@ "source": [ "## Styler Object and HTML \n", "\n", - "The [Styler][styler] was originally constructed to support the wide array of HTML formatting options. Its HTML output creates an HTML `<table>` and leverages CSS styling language to manipulate many parameters including colors, fonts, borders, background, etc. See [here][w3schools] for more information on styling HTML tables. This allows a lot of flexibility out of the box, and even enables web developers to integrate DataFrames into their exiting user interface designs.\n", + "The [Styler][styler] was originally constructed to support the wide array of HTML formatting options. Its HTML output creates an HTML `<table>
` and leverages CSS styling language to manipulate many parameters including colors, fonts, borders, background, etc. See [here][w3schools] for more information on styling HTML tables. This allows a lot of flexibility out of the box, and even enables web developers to integrate DataFrames into their existing user interface designs.\n", "\n", "Below we demonstrate the default output, which looks very similar to the standard DataFrame HTML representation. But the HTML here has already attached some CSS classes to each cell, even if we haven't yet created any styles. We can view these by calling the [.to_html()][tohtml] method, which returns the raw HTML as string, which is useful for further processing or adding to a file - read on in [More about CSS and HTML](#More-About-CSS-and-HTML). This section will also provide a walkthrough for how to convert this default output to represent a DataFrame output that is more communicative. For example how we can build `s`:\n", "\n", diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 37413722de96f..b31249e1cf7c1 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -326,7 +326,7 @@ which can be specified. These are computed from the starting point specified by .. note:: The ``unit`` parameter does not use the same strings as the ``format`` parameter - that was discussed :ref:`above`). The + that was discussed :ref:`above`. The available units are listed on the documentation for :func:`pandas.to_datetime`. Constructing a :class:`Timestamp` or :class:`DatetimeIndex` with an epoch timestamp From 7868a5809f9598c33bc97e217fa13a94a2fd27bb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 May 2024 10:59:01 -1000 Subject: [PATCH 04/66] REF: Use more lazy iterators (#58808) --- pandas/core/apply.py | 2 +- pandas/core/generic.py | 12 +++++++++--- pandas/io/excel/_base.py | 19 +++++++++--------- pandas/io/excel/_odfreader.py | 36 ++++++++++++++++------------------- pandas/io/excel/_xlrd.py | 11 ++++------- pandas/io/sql.py | 7 ++++--- 6 files changed, 43 insertions(+), 44 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 32e8aea7ea8ab..25836e967e948 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -664,7 +664,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs): # people may aggregate on a non-callable attribute # but don't let them think they can pass args to it assert len(args) == 0 - assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0 + assert not any(kwarg == "axis" for kwarg in kwargs) return f elif hasattr(np, func) and hasattr(obj, "__array__"): # in particular exclude Window diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0e91aa23fcdb4..ca60ca9b48a14 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1750,11 +1750,15 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike if `key` matches multiple labels """ axis = self._get_axis_number(axis) - other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + first_other_axes = next( + (ax for ax in range(self._AXIS_LEN) if ax != axis), None + ) if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) - values = self.xs(key, axis=other_axes[0])._values + if first_other_axes is None: + raise ValueError("axis matched all axes") + values = self.xs(key, axis=first_other_axes)._values elif 
self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values else: @@ -1762,7 +1766,9 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike # Check for duplicates if values.ndim > 1: - if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex): + if first_other_axes is not None and isinstance( + self._get_axis(first_other_axes), MultiIndex + ): multi_message = ( "\n" "For a multi-index, the label must be a " diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index dd06c597c1857..1eb22d4ee9de7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -857,24 +857,23 @@ def _parse_sheet( # a row containing just the index name(s) has_index_names = False if is_list_header and not is_len_one_list_header and index_col is not None: - index_col_list: Sequence[int] + index_col_set: set[int] if isinstance(index_col, int): - index_col_list = [index_col] + index_col_set = {index_col} else: assert isinstance(index_col, Sequence) - index_col_list = index_col + index_col_set = set(index_col) # We have to handle mi without names. If any of the entries in the data # columns are not empty, this is a regular row assert isinstance(header, Sequence) if len(header) < len(data): potential_index_names = data[len(header)] - potential_data = [ - x + has_index_names = all( + x == "" or x is None for i, x in enumerate(potential_index_names) - if not control_row[i] and i not in index_col_list - ] - has_index_names = all(x == "" or x is None for x in potential_data) + if not control_row[i] and i not in index_col_set + ) if is_list_like(index_col): # Forward fill values for MultiIndex index. @@ -1457,9 +1456,9 @@ def inspect_excel_format( with zipfile.ZipFile(stream) as zf: # Workaround for some third party files that use forward slashes and # lower case names. 
- component_names = [ + component_names = { name.replace("\\", "/").lower() for name in zf.namelist() - ] + } if "xl/workbook.xml" in component_names: return "xlsx" diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 69b514da32857..f79417d11080d 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -122,29 +122,25 @@ def get_sheet_data( table: list[list[Scalar | NaTType]] = [] for sheet_row in sheet_rows: - sheet_cells = [ - x - for x in sheet_row.childNodes - if hasattr(x, "qname") and x.qname in cell_names - ] empty_cells = 0 table_row: list[Scalar | NaTType] = [] - for sheet_cell in sheet_cells: - if sheet_cell.qname == table_cell_name: - value = self._get_cell_value(sheet_cell) - else: - value = self.empty_value - - column_repeat = self._get_column_repeat(sheet_cell) - - # Queue up empty values, writing only if content succeeds them - if value == self.empty_value: - empty_cells += column_repeat - else: - table_row.extend([self.empty_value] * empty_cells) - empty_cells = 0 - table_row.extend([value] * column_repeat) + for sheet_cell in sheet_row.childNodes: + if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names: + if sheet_cell.qname == table_cell_name: + value = self._get_cell_value(sheet_cell) + else: + value = self.empty_value + + column_repeat = self._get_column_repeat(sheet_cell) + + # Queue up empty values, writing only if content succeeds them + if value == self.empty_value: + empty_cells += column_repeat + else: + table_row.extend([self.empty_value] * empty_cells) + empty_cells = 0 + table_row.extend([value] * column_repeat) if max_row_len < len(table_row): max_row_len = len(table_row) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index a444970792e6e..5d39a840336eb 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data = [] - nrows = sheet.nrows if file_rows_needed is not None: nrows = min(nrows, file_rows_needed) - for i in range(nrows): - row = [ + return [ + [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) ] - data.append(row) - - return data + for i in range(nrows) + ] diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c0007c5e7d78c..874320f08fb75 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -157,6 +157,7 @@ def _convert_arrays_to_dataframe( dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame: content = lib.to_object_array_tuples(data) + idx_len = content.shape[0] arrays = convert_object_array( list(content.T), dtype=None, @@ -177,9 +178,9 @@ def _convert_arrays_to_dataframe( result_arrays.append(ArrowExtensionArray(pa_array)) arrays = result_arrays # type: ignore[assignment] if arrays: - df = DataFrame(dict(zip(range(len(columns)), arrays))) - df.columns = columns - return df + return DataFrame._from_arrays( + arrays, columns=columns, index=range(idx_len), verify_integrity=False + ) else: return DataFrame(columns=columns) From b5b2d380b5ff15e1f9911e8905fac6cd975e17e5 Mon Sep 17 00:00:00 2001 From: r0rshark Date: Wed, 22 May 2024 23:00:45 +0200 Subject: [PATCH 05/66] DOC: Document Remote Code Execution risk for Dataframe.query and computation.eval (#58697) --- pandas/core/computation/eval.py | 2 ++ pandas/core/frame.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index c949cfd1bc657..fee08c6199eef 100644 --- 
a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -193,6 +193,8 @@ def eval( corresponding bitwise operators. :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects are supported and behave as they would with plain ol' Python evaluation. + `eval` can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. Parameters ---------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c875ec78891d6..01ac5a2be3d79 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4472,6 +4472,9 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No """ Query the columns of a DataFrame with a boolean expression. + This method can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. + Parameters ---------- expr : str From 3b48b17e52f3f3837b9ba8551c932f44633b5ff8 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Thu, 23 May 2024 19:03:46 +0300 Subject: [PATCH 06/66] DOC: Instruct to run git bisect when triaging a regression report (#58813) * DOC: Instruct to run git bisect when triaging a regression report * Fix typo * Suggest to add regression label --- doc/source/development/maintaining.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 4f4a1cd811b61..fbcf017d608ce 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -84,7 +84,7 @@ Here's a typical workflow for triaging a newly opened issue. example. See https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports for a good explanation. If the example is not reproducible, or if it's *clearly* not minimal, feel free to ask the reporter if they can provide - and example or simplify the provided one. Do acknowledge that writing + an example or simplify the provided one. Do acknowledge that writing minimal reproducible examples is hard work. If the reporter is struggling, you can try to write one yourself and we'll edit the original post to include it. @@ -93,6 +93,9 @@ Here's a typical workflow for triaging a newly opened issue. If a reproducible example is provided, but you see a simplification, edit the original post with your simpler reproducible example. + If this is a regression report, post the result of a ``git bisect`` run. + More info on this can be found in the :ref:`maintaining.regressions` section. + Ensure the issue exists on the main branch and that it has the "Needs Triage" tag until all steps have been completed. Add a comment to the issue once you have verified it exists on the main branch, so others know it has been confirmed. @@ -125,7 +128,10 @@ Here's a typical workflow for triaging a newly opened issue. If the issue is clearly defined and the fix seems relatively straightforward, label the issue as "Good first issue". - Once you have completed the above, make sure to remove the "needs triage" label. + If the issue is a regression report, add the "Regression" label and the next patch + release milestone. + + Once you have completed the above, make sure to remove the "Needs Triage" label. .. 
_maintaining.regressions: From b162331554d7c7f6fd46ddde1ff3908f2dc8bcce Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Sat, 25 May 2024 15:41:01 -0500 Subject: [PATCH 07/66] BUG: Let `melt` name multiple variable columns for labels from a `MultiIndex` (#58088) * Let melt name variable columns for a multiindex * Respond to comments * Check is_iterator --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/melt.py | 21 +++++++++++++++++---- pandas/tests/reshape/test_melt.py | 20 ++++++++++++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a15da861cfbec..6a6abcf2d48fe 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -440,6 +440,7 @@ Missing MultiIndex ^^^^^^^^^^ - :func:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`) +- :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`) - I/O diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index b4720306094e9..294de2cf2fe1d 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -5,7 +5,10 @@ import numpy as np -from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.common import ( + is_iterator, + is_list_like, +) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import notna @@ -64,9 +67,10 @@ def melt( value_vars : scalar, tuple, list, or ndarray, optional Column(s) to unpivot. If not specified, uses all columns that are not set as `id_vars`. - var_name : scalar, default None + var_name : scalar, tuple, list, or ndarray, optional Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. + ``frame.columns.name`` or 'variable'. Must be a scalar if columns are a + MultiIndex. value_name : scalar, default 'value' Name to use for the 'value' column, can't be an existing column label. col_level : scalar, optional @@ -217,7 +221,16 @@ def melt( frame.columns.name if frame.columns.name is not None else "variable" ] elif is_list_like(var_name): - raise ValueError(f"{var_name=} must be a scalar.") + if isinstance(frame.columns, MultiIndex): + if is_iterator(var_name): + var_name = list(var_name) + if len(var_name) > len(frame.columns): + raise ValueError( + f"{var_name=} has {len(var_name)} items, " + f"but the dataframe columns only have {len(frame.columns)} levels." 
+ ) + else: + raise ValueError(f"{var_name=} must be a scalar.") else: var_name = [var_name] diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index f224a45ca3279..49200face66c5 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -533,6 +533,26 @@ def test_melt_non_scalar_var_name_raises(self): with pytest.raises(ValueError, match=r".* must be a scalar."): df.melt(id_vars=["a"], var_name=[1, 2]) + def test_melt_multiindex_columns_var_name(self): + # GH 58033 + df = DataFrame({("A", "a"): [1], ("A", "b"): [2]}) + + expected = DataFrame( + [("A", "a", 1), ("A", "b", 2)], columns=["first", "second", "value"] + ) + + tm.assert_frame_equal(df.melt(var_name=["first", "second"]), expected) + tm.assert_frame_equal(df.melt(var_name=["first"]), expected[["first", "value"]]) + + def test_melt_multiindex_columns_var_name_too_many(self): + # GH 58033 + df = DataFrame({("A", "a"): [1], ("A", "b"): [2]}) + + with pytest.raises( + ValueError, match="but the dataframe columns only have 2 levels" + ): + df.melt(var_name=["first", "second", "third"]) + class TestLreshape: def test_pairs(self): From a5646621e3ca36b44a4f5623fe7aecda2ea21d47 Mon Sep 17 00:00:00 2001 From: Sergey B Kirpichev Date: Thu, 30 May 2024 23:20:17 +0300 Subject: [PATCH 08/66] CLN: use isnan() instead of the Py_IS_NAN macro (#58850) --- .../include/pandas/vendored/klib/khash_python.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 811fdd139de2c..8d4c382241d39 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -156,7 +156,7 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) // NaN-floats should be in the same equivalency class, see GH 22119 static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { - return (Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b))) || + return (isnan(PyFloat_AS_DOUBLE(a)) && isnan(PyFloat_AS_DOUBLE(b))) || (PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b)); } @@ -164,12 +164,12 @@ static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { // PyObject_RichCompareBool for complexobjects has a different behavior // needs to be replaced static inline int complexobject_cmp(PyComplexObject *a, PyComplexObject *b) { - return (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && - Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag)) || - (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && + return (isnan(a->cval.real) && isnan(b->cval.real) && isnan(a->cval.imag) && + isnan(b->cval.imag)) || + (isnan(a->cval.real) && isnan(b->cval.real) && a->cval.imag == b->cval.imag) || - (a->cval.real == b->cval.real && Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag)) || + (a->cval.real == b->cval.real && isnan(a->cval.imag) && + isnan(b->cval.imag)) || (a->cval.real == b->cval.real && a->cval.imag == b->cval.imag); } @@ -223,7 +223,7 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { static inline Py_hash_t _Pandas_HashDouble(double val) { // Since Python3.10, nan is no longer has hash 0 - if (Py_IS_NAN(val)) { + if (isnan(val)) { return 0; } #if PY_VERSION_HEX < 0x030A0000 From 5e972376fd5e4ab033f9922b546495d8efc9fda5 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Fri, 31 May 2024 13:18:47 -0400 Subject: [PATCH 09/66] DOC: Add note about deprecated offset aliases 
(#58861) DOC: Add note about alias deprecations The PRs #55792 (Y), #52064 (Q), and #55553 (M) deprecated the single letter version of the aliases in favour of the -end version of them. Add a note to the offset table about deprecations. --- doc/source/user_guide/timeseries.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index b31249e1cf7c1..ab3f5b314ed83 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1273,6 +1273,10 @@ frequencies. We will refer to these aliases as *offset aliases*. are deprecated in favour of the aliases ``h``, ``bh``, ``cbh``, ``min``, ``s``, ``ms``, ``us``, and ``ns``. + Aliases ``Y``, ``M``, and ``Q`` are deprecated in favour of the aliases + ``YE``, ``ME``, ``QE``. + + .. note:: When using the offset aliases above, it should be noted that functions From 0347e435e12ab4736cc6cc872dbdd560b453590c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:40:14 +0530 Subject: [PATCH 10/66] DOC: fix PR01 for pandas.MultiIndex (#58833) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a93e23a0f5022..95ac6d457431e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -76,7 +76,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.min RT03" \ -i "pandas.DataFrame.plot PR02,SA01" \ -i "pandas.Grouper PR02" \ - -i "pandas.MultiIndex PR01" \ -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3927619a567bf..1868081b0b0dc 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -209,8 +209,12 @@ class MultiIndex(Index): level). names : optional sequence of objects Names for each of the index levels. (name is accepted for compat). + dtype : Numpy dtype or pandas type, optional + Data type for the MultiIndex. copy : bool, default False Copy the meta-data. + name : Label + Kept for compatibility with 1-dimensional Index. Should not be used. verify_integrity : bool, default True Check that the levels/codes are consistent and valid. From 36e2fb78bb69ff4c4156f93ce096746576e6484b Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:41:01 +0530 Subject: [PATCH 11/66] DOC: fix SA01 for pandas.MultiIndex.dtypes (#58834) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 95ac6d457431e..fde287a8e060c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -79,7 +79,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ - -i "pandas.MultiIndex.dtypes SA01" \ -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1868081b0b0dc..2568cd9bd18b5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -775,6 +775,11 @@ def dtypes(self) -> Series: """ Return the dtypes as a Series for the underlying MultiIndex. 
+ See Also + -------- + Index.dtype : Return the dtype object of the underlying data. + Series.dtypes : Return the data type of the underlying Series. + Examples -------- >>> idx = pd.MultiIndex.from_product( From 5df0581da65865877f1d91a6b6eb7eed4f2bc071 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:42:11 +0530 Subject: [PATCH 12/66] DOC: fix SA01 for pandas.MultiIndex.levels (#58835) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fde287a8e060c..8f4efdffc090c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -82,7 +82,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ - -i "pandas.MultiIndex.levels SA01" \ -i "pandas.MultiIndex.levshape SA01" \ -i "pandas.MultiIndex.names SA01" \ -i "pandas.MultiIndex.nlevels SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2568cd9bd18b5..69d4627f96044 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -835,6 +835,12 @@ def levels(self) -> FrozenList: it filters out all rows of the level C, MultiIndex.levels will still return A, B, C. + See Also + -------- + MultiIndex.codes : The codes of the levels in the MultiIndex. + MultiIndex.get_level_values : Return vector of label values for requested + level. + Examples -------- >>> index = pd.MultiIndex.from_product( From 35219f159655e4aa5ab66a183340dca7a343ccf1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:43:15 +0530 Subject: [PATCH 13/66] DOC: fix SA01 for pandas.MultiIndex.set_codes (#58836) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8f4efdffc090c..b2fc55d5fe72c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -87,7 +87,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.nlevels SA01" \ -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \ -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ - -i "pandas.MultiIndex.set_codes SA01" \ -i "pandas.MultiIndex.set_levels RT03,SA01" \ -i "pandas.MultiIndex.sortlevel PR07,SA01" \ -i "pandas.MultiIndex.to_frame RT03" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 69d4627f96044..43f63a733fbaf 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1149,6 +1149,12 @@ def set_codes( new index (of same type and class...etc) or None The same type as the caller or None if ``inplace=True``. + See Also + -------- + MultiIndex.set_levels : Set new levels on MultiIndex. + MultiIndex.codes : Get the codes of the levels in the MultiIndex. + MultiIndex.levels : Get the levels of the MultiIndex. 
+ Examples -------- >>> idx = pd.MultiIndex.from_tuples( From 07e8e54813e15bbe0466d68b810f14cbe3a2e05c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:43:54 +0530 Subject: [PATCH 14/66] DOC: fix SA01 for pandas.MultiIndex.truncate (#58837) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b2fc55d5fe72c..6614611e52224 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -90,7 +90,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.set_levels RT03,SA01" \ -i "pandas.MultiIndex.sortlevel PR07,SA01" \ -i "pandas.MultiIndex.to_frame RT03" \ - -i "pandas.MultiIndex.truncate SA01" \ -i "pandas.NA SA01" \ -i "pandas.NaT SA01" \ -i "pandas.NamedAgg SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 43f63a733fbaf..8b11f8087db94 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3611,6 +3611,11 @@ def truncate(self, before=None, after=None) -> MultiIndex: MultiIndex The truncated MultiIndex. + See Also + -------- + DataFrame.truncate : Truncate a DataFrame before and after some index values. + Series.truncate : Truncate a Series before and after some index values. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([["a", "b", "c"], ["x", "y", "z"]]) From fa937aa2050b1084b47509de9a2e7e4cad1e0f9c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:44:39 +0530 Subject: [PATCH 15/66] DOC: fix SA01 for pandas.Period.is_leap_year (#58839) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/period.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6614611e52224..594da4f2bd1e3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.asfreq SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.freqstr SA01" \ - -i "pandas.Period.is_leap_year SA01" \ -i "pandas.Period.month SA01" \ -i "pandas.Period.now SA01" \ -i "pandas.Period.ordinal GL08" \ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 838b5b9f4595f..4ca10850ab839 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2443,6 +2443,12 @@ cdef class _Period(PeriodMixin): """ Return True if the period's year is in a leap year. + See Also + -------- + Timestamp.is_leap_year : Check if the year in a Timestamp is a leap year. + DatetimeIndex.is_leap_year : Boolean indicator if the date belongs to a + leap year. 
+ Examples -------- >>> period = pd.Period('2022-01', 'M') From 8dc5a3f94d49b9ffbf5c4ed5ac8ba9942c876e17 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:45:14 +0530 Subject: [PATCH 16/66] DOC: fix SA01 for pandas.Period (#58838) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/period.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 594da4f2bd1e3..d4bebcf18e800 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -93,7 +93,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.NA SA01" \ -i "pandas.NaT SA01" \ -i "pandas.NamedAgg SA01" \ - -i "pandas.Period SA01" \ -i "pandas.Period.asfreq SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.freqstr SA01" \ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 4ca10850ab839..ddde4c68820f6 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2699,6 +2699,12 @@ class Period(_Period): second : int, default 0 Second value of the period. + See Also + -------- + Timestamp : Pandas replacement for python datetime.datetime object. + date_range : Return a fixed frequency DatetimeIndex. + timedelta_range : Generates a fixed frequency range of timedeltas. + Examples -------- >>> period = pd.Period('2012-1-1', freq='D') From 79f4c5353dc10799355b82f97aeab99f6e5d288f Mon Sep 17 00:00:00 2001 From: Anish Karthik <89824626+anishfish2@users.noreply.github.com> Date: Fri, 31 May 2024 13:16:16 -0500 Subject: [PATCH 17/66] DOC: Fix docstring error SA01 for pandas.Dataframe.plot, pandas.Series.plot, pandas.core.groupby.DataFrameGroupBy.plot, pandas.core.groupby.SeriesGroupBy.plot (#58842) * Updated See Also for pandas.Dataframe.plot - S01 * Updating pandas.Series.plot to pass SA01 docstring test * Updating pandas.core.groupby.DataFrameGroupBy.plot to pass SA01 docstring test * Updating pandas.core.groupby.SeriesGroupBy.plot to pass SA01 docstring test --- ci/code_checks.sh | 8 ++++---- pandas/plotting/_core.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d4bebcf18e800..dee40075bcd74 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -74,7 +74,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.mean RT03,SA01" \ -i "pandas.DataFrame.median RT03,SA01" \ -i "pandas.DataFrame.min RT03" \ - -i "pandas.DataFrame.plot PR02,SA01" \ + -i "pandas.DataFrame.plot PR02" \ -i "pandas.Grouper PR02" \ -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ @@ -165,7 +165,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.lt SA01" \ -i "pandas.Series.ne SA01" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.plot PR02,SA01" \ + -i "pandas.Series.plot PR02" \ -i "pandas.Series.pop RT03,SA01" \ -i "pandas.Series.prod RT03" \ -i "pandas.Series.product RT03" \ @@ -360,7 +360,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.plot PR02,SA01" \ + -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.prod SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \ @@ -378,7 +378,7 @@ if [[ -z "$CHECK" || "$CHECK" == 
"docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.min SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.plot PR02,SA01" \ + -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.prod SA01" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \ diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index ea5daf02b7252..c83985917591c 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -791,6 +791,21 @@ class PlotAccessor(PandasObject): If the backend is not the default matplotlib one, the return value will be the object returned by the backend. + See Also + -------- + matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. + DataFrame.hist : Make a histogram. + DataFrame.boxplot : Make a box plot. + DataFrame.plot.scatter : Make a scatter plot with varying marker + point size and color. + DataFrame.plot.hexbin : Make a hexagonal binning plot of + two variables. + DataFrame.plot.kde : Make Kernel Density Estimate plot using + Gaussian kernels. + DataFrame.plot.area : Make a stacked area plot. + DataFrame.plot.bar : Make a bar plot. + DataFrame.plot.barh : Make a horizontal bar plot. + Notes ----- - See matplotlib documentation online for more on this subject From 454b0af783c15cf28b930f8e1f9c67f0e9c0ba10 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:46:52 +0530 Subject: [PATCH 18/66] DOC: fix SA01 for pandas.Period.quarter (#58841) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/period.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index dee40075bcd74..b9b3ca24b4162 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -99,7 +99,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.month SA01" \ -i "pandas.Period.now SA01" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.Period.quarter SA01" \ -i "pandas.Period.strftime PR01,SA01" \ -i "pandas.Period.to_timestamp SA01" \ -i "pandas.Period.year SA01" \ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index ddde4c68820f6..023a0f52e320f 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2329,6 +2329,12 @@ cdef class _Period(PeriodMixin): """ Return the quarter this Period falls on. + See Also + -------- + Timestamp.quarter : Return the quarter of the Timestamp. + Period.year : Return the year of the period. + Period.month : Return the month of the period. + Examples -------- >>> period = pd.Period('2022-04', 'M') From 8593554b128392f5ad2422f574e990dba451fea9 Mon Sep 17 00:00:00 2001 From: James Yuill <130482796+jamesyuill@users.noreply.github.com> Date: Fri, 31 May 2024 19:20:32 +0100 Subject: [PATCH 19/66] corrects-spelling-mistake-in-merging.rst (#58874) Changed 'mactches' to 'matches' --- doc/source/user_guide/merging.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index f3b849dc6de45..cfd2f40aa93a3 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -974,7 +974,7 @@ with optional filling of missing data with ``fill_method``. 
:func:`merge_asof` --------------------- -:func:`merge_asof` is similar to an ordered left-join except that mactches are on the +:func:`merge_asof` is similar to an ordered left-join except that matches are on the nearest key rather than equal keys. For each row in the ``left`` :class:`DataFrame`, the last row in the ``right`` :class:`DataFrame` are selected where the ``on`` key is less than the left's key. Both :class:`DataFrame` must be sorted by the key. From 528d176227d6be4e97bd6d0454d1428473540adb Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 31 May 2024 20:21:43 +0200 Subject: [PATCH 20/66] DOC: Fix reference to api.typing.NaType (#58871) --- doc/source/user_guide/missing_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 5149bd30dbbef..29f3fea899336 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -32,7 +32,7 @@ use :class:`api.typing.NaTType`. :class:`NA` for :class:`StringDtype`, :class:`Int64Dtype` (and other bit widths), :class:`Float64Dtype` (and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`. These types will maintain the original data type of the data. -For typing applications, use :class:`api.types.NAType`. +For typing applications, use :class:`api.typing.NAType`. .. ipython:: python From f95c3a0d5dee3bce14513fa9a343f6000648efb7 Mon Sep 17 00:00:00 2001 From: santhoshbethi <45058712+santhoshbethi@users.noreply.github.com> Date: Fri, 31 May 2024 14:33:18 -0400 Subject: [PATCH 21/66] upadting doc string (#58830) * upadting doc string * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Empty-Commit * updating doc string :msg --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/tests/arithmetic/common.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index d7a8b0510b50f..0730729e2fd94 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -20,13 +20,16 @@ def assert_cannot_add(left, right, msg="cannot add"): """ - Helper to assert that left and right cannot be added. + Helper function to assert that two objects cannot be added. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str, default "cannot add" + The error message expected in the TypeError. """ with pytest.raises(TypeError, match=msg): left + right @@ -36,13 +39,17 @@ def assert_cannot_add(left, right, msg="cannot add"): def assert_invalid_addsub_type(left, right, msg=None): """ - Helper to assert that left and right can be neither added nor subtracted. + Helper function to assert that two objects can + neither be added nor subtracted. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str or None, default None + The error message expected in the TypeError. """ with pytest.raises(TypeError, match=msg): left + right From a2a78d33f5f9d730225717233624dc8f41bd4e2f Mon Sep 17 00:00:00 2001 From: mutricyl <118692416+mutricyl@users.noreply.github.com> Date: Fri, 31 May 2024 20:38:47 +0200 Subject: [PATCH 22/66] updating df.query and df.eval docstrings. resolves #16283 (#58749) * updating df.query and df.eval docstrings. 
resolves #16283 * typo * adding 1 example * changing wording following example added * updating 'C C' to 'C&C' for eval * updating 'C C' to 'C&C' for query --------- Co-authored-by: Laurent Mutricy --- pandas/core/frame.py | 109 +++++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 36 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 01ac5a2be3d79..912b4353acacf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4577,36 +4577,44 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No Examples -------- >>> df = pd.DataFrame( - ... {"A": range(1, 6), "B": range(10, 0, -2), "C C": range(10, 5, -1)} + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} ... ) >>> df - A B C C + A B C&C 0 1 10 10 1 2 8 9 2 3 6 8 3 4 4 7 4 5 2 6 >>> df.query("A > B") - A B C C + A B C&C 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] - A B C C + A B C&C 4 5 2 6 For columns with spaces in their name, you can use backtick quoting. - >>> df.query("B == `C C`") - A B C C + >>> df.query("B == `C&C`") + A B C&C 0 1 10 10 The previous expression is equivalent to - >>> df[df.B == df["C C"]] - A B C C + >>> df[df.B == df["C&C"]] + A B C&C 0 1 10 10 + + Using local variable: + + >>> local_var = 2 + >>> df.query("A <= @local_var") + A B C&C + 0 1 10 10 + 1 2 8 9 """ inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): @@ -4647,6 +4655,13 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: ---------- expr : str The expression string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable + names by surrounding them with backticks `````. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, @@ -4678,14 +4693,16 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Examples -------- - >>> df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)}) + >>> df = pd.DataFrame( + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} + ... ) >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 >>> df.eval("A + B") 0 11 1 10 @@ -4697,35 +4714,55 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Assignment is allowed though by default the original DataFrame is not modified. - >>> df.eval("C = A + B") - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 + >>> df.eval("D = A + B") + A B C&C D + 0 1 10 10 11 + 1 2 8 9 10 + 2 3 6 8 9 + 3 4 4 7 8 + 4 5 2 6 7 >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 Multiple columns can be assigned to using multi-line expressions: >>> df.eval( ... ''' - ... C = A + B - ... D = A - B + ... D = A + B + ... E = A - B ... ''' ... ) - A B C D - 0 1 10 11 -9 - 1 2 8 10 -6 - 2 3 6 9 -3 - 3 4 4 8 0 - 4 5 2 7 3 + A B C&C D E + 0 1 10 10 11 -9 + 1 2 8 9 10 -6 + 2 3 6 8 9 -3 + 3 4 4 7 8 0 + 4 5 2 6 7 3 + + For columns with spaces in their name, you can use backtick quoting. 
+ + >>> df.eval("B * `C&C`") + 0 100 + 1 72 + 2 48 + 3 28 + 4 12 + + Local variables shall be explicitly referenced using ``@`` + character in front of the name: + + >>> local_var = 2 + >>> df.eval("@local_var * A") + 0 2 + 1 4 + 2 6 + 3 8 + 4 10 """ from pandas.core.computation.eval import eval as _eval From 2ea036f0491417786a662c320e504d7eb9aba83c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 31 May 2024 11:40:04 -0700 Subject: [PATCH 23/66] ENH/WIP: resolution inference in pd.to_datetime, DatetimeIndex (#55901) * ENH: read_stata return non-nano * GH ref * move whatsnew * remove outdated whatsnew * ENH: read_stata return non-nano * avoid Series.view * dont go through Series * TST: dt64 units * BUG: cut with non-nano * BUG: round with non-nanosecond raising OverflowError * woops * BUG: cut with non-nano * TST: parametrize tests over dt64 unit * xfail non-nano * revert * BUG: mixed-type mixed-timezone/awareness * commit so i can unstash something else i hope * ENH: infer resolution in to_datetime, DatetimeIndex * revert commented-out * revert commented-out * revert commented-out * remove commented-out * remove comment * revert unnecessary * revert unnecessary * fix window tests * Fix resample tests * restore comment * revert unnecessary * remove no-longer necessary * revert no-longer-necessary * revert no-longer-necessary * update tests * revert no-longer-necessary * update tests * revert bits * update tests * cleanup * revert * revert * parametrize over unit * update tests * update tests * revert no-longer-needed * revert no-longer-necessary * revert no-longer-necessary * revert no-longer-necessary * revert no-longer-necessary * Revert no-longer-necessary * update test * update test * simplify * update tests * update tests * update tests * revert no-longer-necessary * post-merge fixup * revert no-longer-necessary * update tests * update test * update tests * update tests * remove commented-out * revert no-longer-necessary * as_unit->astype * cleanup * merge fixup * revert bit * revert no-longer-necessary, xfail * update multithread test * update tests * update doctest * update tests * update doctests * update tests * update db tests * troubleshoot db tests * update test * troubleshoot sql tests * update test * update tests * mypy fixup * Update test * kludge test * update test * update for min-version tests * fix adbc check * troubleshoot minimum version deps * troubleshoot * troubleshoot * troubleshoot * whatsnew * update abdc-driver-postgresql minimum version * update doctest * fix doc example * troubleshoot test_api_custom_dateparsing_error * troubleshoot * troubleshoot * troubleshoot * troubleshoot * troubleshoot * troubleshoot * update exp instead of object cast * revert accidental * simplify test --- doc/source/whatsnew/v3.0.0.rst | 63 ++++++ pandas/_libs/lib.pyx | 10 +- pandas/_libs/tslib.pyx | 17 +- pandas/_libs/tslibs/strptime.pyx | 6 +- pandas/core/algorithms.py | 8 +- pandas/core/arrays/datetimelike.py | 12 +- pandas/core/arrays/datetimes.py | 35 ++-- pandas/core/base.py | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/dtypes/missing.py | 4 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 10 +- pandas/core/groupby/generic.py | 4 +- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/datetimes.py | 5 +- pandas/core/series.py | 5 +- pandas/core/tools/datetimes.py | 16 +- pandas/tests/arithmetic/test_period.py | 7 +- .../tests/arrays/categorical/test_missing.py | 2 +- pandas/tests/arrays/test_array.py | 10 +- 
pandas/tests/base/test_constructors.py | 6 +- pandas/tests/base/test_conversion.py | 4 +- pandas/tests/dtypes/test_inference.py | 6 +- pandas/tests/extension/test_arrow.py | 2 +- .../frame/constructors/test_from_records.py | 2 +- pandas/tests/frame/indexing/test_setitem.py | 3 +- .../tests/frame/methods/test_combine_first.py | 25 ++- .../tests/frame/methods/test_infer_objects.py | 2 +- pandas/tests/frame/methods/test_map.py | 7 +- pandas/tests/frame/methods/test_to_csv.py | 41 +++- pandas/tests/frame/test_constructors.py | 61 ++++-- pandas/tests/groupby/test_apply.py | 4 +- pandas/tests/groupby/test_groupby.py | 2 +- .../indexes/datetimes/test_constructors.py | 62 +++--- .../indexes/datetimes/test_date_range.py | 4 +- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexes/test_index_new.py | 11 +- pandas/tests/indexing/test_coercion.py | 20 +- pandas/tests/indexing/test_loc.py | 7 +- pandas/tests/indexing/test_partial.py | 2 +- pandas/tests/interchange/test_impl.py | 3 +- pandas/tests/io/excel/test_readers.py | 9 +- pandas/tests/io/excel/test_writers.py | 16 +- pandas/tests/io/json/test_pandas.py | 17 +- .../io/parser/common/test_common_basic.py | 9 +- pandas/tests/io/parser/common/test_index.py | 3 +- pandas/tests/io/parser/test_multi_thread.py | 9 +- pandas/tests/io/parser/test_parse_dates.py | 58 ++++-- pandas/tests/io/parser/test_read_fwf.py | 4 +- pandas/tests/io/parser/test_skiprows.py | 8 +- .../io/parser/usecols/test_parse_dates.py | 2 +- pandas/tests/io/pytables/test_store.py | 12 +- pandas/tests/io/test_fsspec.py | 16 +- pandas/tests/io/test_gcs.py | 6 +- pandas/tests/io/test_html.py | 14 +- pandas/tests/io/test_orc.py | 2 + pandas/tests/io/test_parquet.py | 44 +++- pandas/tests/io/test_sql.py | 52 +++-- pandas/tests/io/test_stata.py | 23 +- pandas/tests/resample/test_base.py | 3 +- pandas/tests/resample/test_time_grouper.py | 12 +- .../reshape/concat/test_append_common.py | 4 +- pandas/tests/reshape/concat/test_datetimes.py | 4 +- pandas/tests/reshape/test_cut.py | 52 +++-- pandas/tests/reshape/test_qcut.py | 4 +- pandas/tests/scalar/test_nat.py | 6 +- .../series/methods/test_combine_first.py | 2 +- pandas/tests/series/methods/test_fillna.py | 2 +- pandas/tests/series/methods/test_to_csv.py | 5 +- pandas/tests/series/test_constructors.py | 34 ++- pandas/tests/test_algos.py | 1 + pandas/tests/tools/test_to_datetime.py | 197 +++++++++++------- pandas/tests/tools/test_to_timedelta.py | 2 +- pandas/tests/tseries/holiday/test_calendar.py | 4 +- pandas/tests/tslibs/test_array_to_datetime.py | 50 +++-- pandas/tests/util/test_hashing.py | 16 +- 77 files changed, 745 insertions(+), 457 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6a6abcf2d48fe..865996bdf8892 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -124,6 +124,69 @@ notable_bug_fix2 Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_300.api_breaking.datetime_resolution_inference: + +Datetime resolution inference +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Converting a sequence of strings, ``datetime`` objects, or ``np.datetime64`` objects to +a ``datetime64`` dtype now performs inference on the appropriate resolution (AKA unit) for the output dtype. This affects :class:`Series`, :class:`DataFrame`, :class:`Index`, :class:`DatetimeIndex`, and :func:`to_datetime`. + +Previously, these would always give nanosecond resolution: + +.. 
code-block:: ipython + + In [1]: dt = pd.Timestamp("2024-03-22 11:36").to_pydatetime() + In [2]: pd.to_datetime([dt]).dtype + Out[2]: dtype('>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) - array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + array(['2016-01-01T00:00:00'], dtype='datetime64[s]') >>> pd.unique( ... pd.Series( ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) @@ -365,7 +366,8 @@ def unique(values): ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) DatetimeIndex(['2016-01-01 00:00:00-05:00'], diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 925858a20ce41..673001337767b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1849,11 +1849,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _floor_example = """>>> rng.floor('h') @@ -1876,11 +1876,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _ceil_example = """>>> rng.ceil('h') @@ -1903,11 +1903,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.ceil("h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.ceil("h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b075e3d299ed0..bbbf7a9b4a63a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -218,7 +218,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] ... ) ['2023-01-01 00:00:00', '2023-01-02 00:00:00'] - Length: 2, dtype: datetime64[ns] + Length: 2, dtype: datetime64[s] """ _typ = "datetimearray" @@ -613,7 +613,7 @@ def tz(self) -> tzinfo | None: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.tz datetime.timezone.utc @@ -1047,7 +1047,7 @@ def tz_localize( 4 2018-10-28 02:30:00+01:00 5 2018-10-28 03:00:00+01:00 6 2018-10-28 03:30:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] In some cases, inferring the DST is impossible. 
In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly @@ -1059,14 +1059,14 @@ def tz_localize( 0 2018-10-28 01:20:00+02:00 1 2018-10-28 02:36:00+02:00 2 2018-10-28 03:46:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) + ... '2015-03-29 03:30:00'], dtype="M8[ns]")) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') 0 2015-03-29 03:00:00+02:00 1 2015-03-29 03:30:00+02:00 @@ -1427,7 +1427,7 @@ def time(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.time 0 10:00:00 1 11:00:00 @@ -1470,7 +1470,7 @@ def timetz(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.timetz 0 10:00:00+00:00 1 11:00:00+00:00 @@ -1512,7 +1512,7 @@ def date(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.date 0 2020-01-01 1 2020-02-01 @@ -1861,7 +1861,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.dayofyear 0 1 1 32 @@ -1897,7 +1897,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-04-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.quarter 0 1 1 2 @@ -1933,7 +1933,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.daysinmonth 0 31 1 29 @@ -2372,9 +2372,9 @@ def _sequence_to_dt64( data, copy = maybe_convert_dtype(data, copy, tz=tz) data_dtype = getattr(data, "dtype", None) - if out_unit is None: - out_unit = "ns" - out_dtype = np.dtype(f"M8[{out_unit}]") + out_dtype = DT64NS_DTYPE + if out_unit is not None: + out_dtype = np.dtype(f"M8[{out_unit}]") if data_dtype == object or is_string_dtype(data_dtype): # TODO: We do not have tests specific to string-dtypes, @@ -2400,7 +2400,7 @@ def _sequence_to_dt64( dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, - out_unit=out_unit or "ns", + out_unit=out_unit, ) copy = False if tz and inferred_tz: @@ -2508,7 +2508,7 @@ def objects_to_datetime64( utc: bool = False, errors: DateTimeErrorChoices = "raise", allow_object: bool = False, - out_unit: str = "ns", + out_unit: str | None = None, ) -> tuple[np.ndarray, tzinfo | None]: """ Convert data to array of timestamps. @@ -2524,7 +2524,8 @@ def objects_to_datetime64( allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. - out_unit : str, default "ns" + out_unit : str or None, default None + None indicates we should do resolution inference. 
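Callers that need a fixed resolution rather than inference can still pin the unit with an explicit dtype; only the default changed. A quick sketch at the public-API level (the private ``objects_to_datetime64`` helper above is not meant to be called directly):

>>> import pandas as pd
>>> pd.DatetimeIndex(["2016-01-01"]).dtype  # unit inferred from string precision
dtype('<M8[s]')
>>> pd.DatetimeIndex(["2016-01-01"], dtype="M8[ns]").dtype  # unit pinned explicitly
dtype('<M8[ns]')

This is the same pattern many of the test updates below use when they add ``dtype="M8[ns]"`` to keep a nanosecond expectation.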
Returns ------- diff --git a/pandas/core/base.py b/pandas/core/base.py index 5cdbde8c64c47..b784dc8b03292 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1334,7 +1334,7 @@ def factorize( 0 2000-03-11 1 2000-03-12 2 2000-03-13 - dtype: datetime64[ns] + dtype: datetime64[s] >>> ser.searchsorted('3/14/2000') 3 diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 08adb580ff08f..662b8c5791e51 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1193,7 +1193,7 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, - dtype_if_all_nat=np.dtype("M8[ns]"), + dtype_if_all_nat=np.dtype("M8[s]"), ) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 45814ca77b70f..5213be8b69016 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -205,7 +205,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): by providing an empty index. As follows, >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype - dtype(' bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.isna(index) array([False, False, True, False]) @@ -362,7 +362,7 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.notna(index) array([ True, True, False, True]) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 912b4353acacf..97a4e414608b8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13286,7 +13286,7 @@ def to_period( >>> idx DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.to_period("M") PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ca60ca9b48a14..22eecdc95934f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3209,7 +3209,7 @@ class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' Dimensions: (date: 2, animal: 2) Coordinates: - * date (date) datetime64[ns] 2018-01-01 2018-01-02 + * date (date) datetime64[s] 2018-01-01 2018-01-02 * animal (animal) object 'falcon' 'parrot' Data variables: speed (date, animal) int64 350 18 361 15 @@ -6194,7 +6194,7 @@ def dtypes(self): >>> df.dtypes float float64 int int64 - datetime datetime64[ns] + datetime datetime64[s] string object dtype: object """ @@ -10653,10 +10653,10 @@ def tz_localize( dates forward or backward with a timedelta object or `'shift_forward'` or `'shift_backward'`. - >>> s = pd.Series( - ... range(2), - ... index=pd.DatetimeIndex(["2015-03-29 02:30:00", "2015-03-29 03:30:00"]), + >>> dti = pd.DatetimeIndex( + ... ["2015-03-29 02:30:00", "2015-03-29 03:30:00"], dtype="M8[ns]" ... 
) + >>> s = pd.Series(range(2), index=dti) >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_forward") 2015-03-29 03:00:00+02:00 0 2015-03-29 03:30:00+02:00 1 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a20577e8d3df9..0c4f22f736d4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1206,7 +1206,7 @@ def idxmin(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmin() a 2023-01-01 b 2023-02-01 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmin", skipna=skipna) @@ -1259,7 +1259,7 @@ def idxmax(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmax() a 2023-01-15 b 2023-02-15 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmax", skipna=skipna) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a3fb8bc851df..56030a15dc143 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2634,7 +2634,7 @@ def isna(self) -> npt.NDArray[np.bool_]: ... ) >>> idx DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.isna() array([False, True, True, True]) """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 78f04f57029b1..930bc7a95bd14 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -242,7 +242,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> idx DatetimeIndex(['2020-01-01 10:00:00+00:00', '2020-02-01 11:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) """ _typ = "datetimeindex" @@ -473,7 +473,8 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex: Examples -------- >>> idx = pd.DatetimeIndex( - ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"] + ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"], + ... dtype="M8[ns]", ... ) >>> idx DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'], diff --git a/pandas/core/series.py b/pandas/core/series.py index c49eef49f7393..f67c0753fa9df 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2060,14 +2060,14 @@ def unique(self) -> ArrayLike: >>> pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique() ['2016-01-01 00:00:00'] - Length: 1, dtype: datetime64[ns] + Length: 1, dtype: datetime64[s] >>> pd.Series( ... [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)] ... ).unique() ['2016-01-01 00:00:00-05:00'] - Length: 1, dtype: datetime64[ns, US/Eastern] + Length: 1, dtype: datetime64[s, US/Eastern] An Categorical will return categories in the order of appearance and with the same dtype. @@ -3175,6 +3175,7 @@ def combine_first(self, other) -> Series: other = other.reindex(keep_other) if this.dtype.kind == "M" and other.dtype.kind != "M": + # TODO: try to match resos? 
other = to_datetime(other) combined = concat([this, other]) combined = combined.reindex(new_index) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b01cdb335ec46..c116ef015ae16 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -29,6 +29,7 @@ timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -524,6 +525,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: utc=utc, errors=errors, unit_for_numerics=unit, + creso=NpyDatetimeUnit.NPY_FR_ns.value, ) result = DatetimeIndex(arr, name=name) @@ -873,7 +875,7 @@ def to_datetime( >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 - dtype: datetime64[ns] + dtype: datetime64[s] Using a unix epoch time @@ -903,7 +905,7 @@ def to_datetime( Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. - >>> pd.to_datetime("13000101", format="%Y%m%d", errors="coerce") + >>> pd.to_datetime("invalid for Ymd", format="%Y%m%d", errors="coerce") NaT .. _to_datetime_tz_examples: @@ -916,14 +918,14 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00:00", "2018-10-26 13:00:15"]) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) - Timezone-aware inputs *with constant time offset* are converted to timezone-aware :class:`DatetimeIndex`: >>> pd.to_datetime(["2018-10-26 12:00 -0500", "2018-10-26 13:00 -0500"]) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], - dtype='datetime64[ns, UTC-05:00]', freq=None) + dtype='datetime64[s, UTC-05:00]', freq=None) - However, timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris) @@ -965,21 +967,21 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00", "2018-10-26 13:00"], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Timezone-aware inputs are *converted* to UTC (the output represents the exact same datetime, but viewed from the UTC time offset `+00:00`). 
>>> pd.to_datetime(["2018-10-26 12:00 -0530", "2018-10-26 12:00 -0500"], utc=True) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Inputs can contain both string or datetime, the above rules still apply >>> pd.to_datetime(["2018-10-26 12:00", datetime(2020, 1, 1, 18)], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[us, UTC]', freq=None) """ if exact is not lib.no_default and format in {"mixed", "ISO8601"}: raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 18f1993c198df..539df9d61a7b2 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1361,7 +1361,12 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - msg = "cannot add PeriodArray and DatetimeArray" + if box_with_array is pd.DataFrame: + # TODO: before implementing resolution-inference we got the same + # message with DataFrame and non-DataFrame. Why did that change? + msg = "cannot add PeriodArray and Timestamp" + else: + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 9d4b78ce9944e..e3cb9664e19f2 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -121,7 +121,7 @@ def test_compare_categorical_with_missing(self, a1, a2, categories): @pytest.mark.parametrize( "na_value, dtype", [ - (pd.NaT, "datetime64[ns]"), + (pd.NaT, "datetime64[s]"), (None, "float64"), (np.nan, "float64"), (pd.NA, "float64"), diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 857509e18fa8e..97d57163ed079 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -125,7 +125,7 @@ def test_dt64_array(dtype_unit): ( pd.DatetimeIndex(["2000", "2001"]), None, - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( ["2000", "2001"], @@ -301,11 +301,11 @@ def test_array_copy(): # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[us]"), ), ( np.array([1, 2], dtype="M8[ns]"), @@ -321,7 +321,7 @@ def test_array_copy(): ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="s") ), ), ( @@ -330,7 +330,7 @@ def test_array_copy(): datetime.datetime(2001, 1, 1, tzinfo=cet), ], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="us") ), ), # timedelta diff --git a/pandas/tests/base/test_constructors.py 
b/pandas/tests/base/test_constructors.py index f3ac60f672ee1..c4b02423f8cf0 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -146,10 +146,12 @@ def test_constructor_datetime_outofbound( # No dtype specified (dtype inference) # datetime64[non-ns] raise error, other cases result in object dtype # and preserve original data - if a.dtype.kind == "M": + result = constructor(a) + if a.dtype.kind == "M" or isinstance(a[0], np.datetime64): # Can't fit in nanosecond bounds -> get the nearest supported unit - result = constructor(a) assert result.dtype == "M8[s]" + elif isinstance(a[0], datetime): + assert result.dtype == "M8[us]", result.dtype else: result = constructor(a) if using_infer_string and "object-string" in request.node.callspec.id: diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 6c0df49b0a93a..dd6bf3c7521f8 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -412,7 +412,7 @@ def test_to_numpy_dtype(as_series): [Timestamp("2000"), Timestamp("2000"), pd.NaT], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) @@ -454,7 +454,7 @@ def test_to_numpy_na_value_numpy_dtype( [(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index f4282c9c7ac3a..db18cd4aef14e 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -830,7 +830,11 @@ def test_maybe_convert_objects_datetime_overflow_safe(self, dtype): out = lib.maybe_convert_objects(arr, convert_non_numeric=True) # no OutOfBoundsDatetime/OutOfBoundsTimedeltas - tm.assert_numpy_array_equal(out, arr) + if dtype == "datetime64[ns]": + expected = np.array(["2363-10-04"], dtype="M8[us]") + else: + expected = arr + tm.assert_numpy_array_equal(out, expected) def test_maybe_convert_objects_mixed_datetimes(self): ts = Timestamp("now") diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d31fe6085c3a..5926d23b44dd0 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3445,7 +3445,7 @@ def test_arrow_floor_division_large_divisor(dtype): def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] - result = pd.Series(string_dates, dtype="timestamp[ns][pyarrow]") + result = pd.Series(string_dates, dtype="timestamp[s][pyarrow]") expected = pd.Series( ArrowExtensionArray(pa.array(pd.to_datetime(string_dates), from_pandas=True)) ) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 66fc234e79b4d..35e143fcedf7b 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -39,7 +39,7 @@ def test_from_records_with_datetimes(self): expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] - dtypes = [("EXPIRY", " None: ) with tm.assert_produces_warning(FutureWarning, match=msg): result = date_range("2010-01-01", periods=2, freq="m") - expected = DatetimeIndex(["2010-01-31", 
"2010-02-28"], freq="ME") + expected = DatetimeIndex( + ["2010-01-31", "2010-02-28"], dtype="M8[ns]", freq="ME" + ) tm.assert_index_equal(result, expected) def test_date_range_bday(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2e94961b673f8..bd38e6c2ff333 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -186,7 +186,7 @@ def test_constructor_int_dtype_nan(self): "klass,dtype,na_val", [ (Index, np.float64, np.nan), - (DatetimeIndex, "datetime64[ns]", pd.NaT), + (DatetimeIndex, "datetime64[s]", pd.NaT), ], ) def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index b544ebac43ece..4a31ae88a757a 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -61,16 +61,16 @@ def test_infer_nat(self, val): values = [NaT, val] idx = Index(values) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(values[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() @pytest.mark.parametrize("na_value", [None, np.nan]) @pytest.mark.parametrize("vtype", [list, tuple, iter]) @@ -138,6 +138,9 @@ def test_constructor_infer_nat_dt_like( ) expected = klass([NaT, NaT]) + if dtype[0] == "d": + # we infer all-NaT as second resolution + expected = expected.astype("M8[ns]") assert expected.dtype == dtype data = [ctor] data.insert(pos, nulls_fixture) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index d4bc0341e732e..84cd0d3b08b7b 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -598,7 +598,7 @@ def test_fillna_complex128(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01"), "datetime64[s]"), (pd.Timestamp("2012-01-01", tz="US/Eastern"), object), (1, object), ("x", object), @@ -615,7 +615,7 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04"), ] ) - assert obj.dtype == "datetime64[ns]" + assert obj.dtype == "datetime64[s]" exp = klass( [ @@ -630,10 +630,10 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[s, US/Eastern]"), (pd.Timestamp("2012-01-01"), object), # pre-2.0 with a mismatched tz we would get object result - (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[s, US/Eastern]"), (1, object), ("x", object), ], @@ -650,7 +650,7 @@ def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04", tz=tz), ] ) - assert obj.dtype == "datetime64[ns, US/Eastern]" + assert obj.dtype == "datetime64[s, US/Eastern]" if 
getattr(fill_val, "tz", None) is None: fv = fill_val @@ -830,6 +830,7 @@ def replacer(self, how, from_key, to_key): def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = obj.astype(from_key) assert obj.dtype == from_key if from_key.startswith("datetime") and to_key.startswith("datetime"): @@ -850,7 +851,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) @@ -867,7 +867,7 @@ def test_replace_series_datetime_tz( self, how, to_key, from_key, replacer, using_infer_string ): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -891,7 +891,7 @@ def test_replace_series_datetime_tz( ) def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -900,8 +900,8 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) - else: - assert exp.dtype == to_key + elif to_key == from_key: + exp = exp.dt.as_unit("ns") result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 01dab14c7e528..16f3e0fd0c229 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -711,7 +711,7 @@ def test_loc_modify_datetime(self): {"date": [1485264372711, 1485265925110, 1540215845888, 1540282121025]} ) - df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True) + df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True).dt.as_unit("ms") df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"] df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"] @@ -865,6 +865,7 @@ def test_loc_setitem_frame_multiples(self): "val": Series([0, 1, 0, 1, 2], dtype=np.int64), } ) + expected["date"] = expected["date"].astype("M8[ns]") rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs @@ -1814,7 +1815,7 @@ def test_loc_getitem_datetime_string_with_datetimeindex(self): result = df.loc[["2010-01-01", "2010-01-05"], ["a", "b"]] expected = DataFrame( {"a": [0, 4], "b": [0, 4]}, - index=DatetimeIndex(["2010-01-01", "2010-01-05"]), + index=DatetimeIndex(["2010-01-01", "2010-01-05"]).as_unit("ns"), ) tm.assert_frame_equal(result, expected) @@ -2082,7 +2083,7 @@ def test_setitem_with_expansion(self): expected = Series([v[0].tz_convert("UTC"), df.loc[1, "time"]], name="time") tm.assert_series_equal(df2.time, expected) - v = df.loc[df.new_col == "new", "time"] + Timedelta("1s") + v = df.loc[df.new_col == "new", "time"] + Timedelta("1s").as_unit("s") df.loc[df.new_col == "new", "time"] = v tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index b0a041ed5b69c..4d232d5ed1312 
100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -580,7 +580,7 @@ def test_partial_set_invalid(self): ], ), ( - date_range(start="2000", periods=20, freq="D"), + date_range(start="2000", periods=20, freq="D", unit="s"), ["2000-01-04", "2000-01-08", "2000-01-12"], [ Timestamp("2000-01-04"), diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 60e05c2c65124..64eca6ac643ca 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -603,7 +603,8 @@ def test_empty_dataframe(): ), ( pd.Series( - [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)] + [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)], + dtype="M8[ns]", ), (DtypeKind.DATETIME, 64, "tsn:", "="), (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f0a72ba6163fa..6d6c3ad6b77a7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -141,10 +141,13 @@ def df_ref(datapath): def get_exp_unit(read_ext: str, engine: str | None) -> str: - return "ns" + unit = "us" + if (read_ext == ".ods") ^ (engine == "calamine"): + unit = "s" + return unit -def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None: +def adjust_expected(expected: DataFrame, read_ext: str, engine: str | None) -> None: expected.index.name = None unit = get_exp_unit(read_ext, engine) # error: "Index" has no attribute "as_unit" @@ -1117,7 +1120,6 @@ def test_read_excel_multiindex_blank_after_name( mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) unit = get_exp_unit(read_ext, engine) - expected = DataFrame( [ [1, 2.5, pd.Timestamp("2015-01-01"), True], @@ -1675,6 +1677,7 @@ def test_read_datetime_multiindex(self, request, engine, read_ext): actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) unit = get_exp_unit(read_ext, engine) + dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]") expected_column_index = MultiIndex.from_arrays( [dti[:1], dti[1:]], diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 859152db84b7d..744fe20e4995d 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -37,7 +37,9 @@ def get_exp_unit(path: str) -> str: - return "ns" + if path.endswith(".ods"): + return "s" + return "us" @pytest.fixture @@ -293,12 +295,15 @@ def test_read_excel_parse_dates(self, tmp_excel): tm.assert_frame_equal(df2, res) res = pd.read_excel(tmp_excel, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) + expected = df[:] + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(res, expected) res = pd.read_excel( tmp_excel, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 ) - tm.assert_frame_equal(df, res) + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(expected, res) def test_multiindex_interval_datetimes(self, tmp_excel): # GH 30986 @@ -547,6 +552,7 @@ def test_sheets(self, frame, tmp_excel): columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=5, freq="B"), ) + index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -695,7 +701,6 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # # Excel output format strings unit = 
get_exp_unit(tmp_excel) - df = DataFrame( [ [date(2014, 1, 31), date(1999, 9, 24)], @@ -732,6 +737,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): with ExcelFile(filename2) as reader2: rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) + # TODO: why do we get different units? + rs2 = rs2.astype(f"M8[{unit}]") + tm.assert_frame_equal(rs1, rs2) # Since the reader returns a datetime object for dates, diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c4065ea01988f..b53957a7e77d1 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -133,7 +133,13 @@ def test_frame_non_unique_index_raises(self, orient): [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], ], ) - def test_frame_non_unique_columns(self, orient, data): + def test_frame_non_unique_columns(self, orient, data, request): + if isinstance(data[0][0], Timestamp) and orient == "split": + mark = pytest.mark.xfail( + reason="GH#55827 non-nanosecond dt64 fails to round-trip" + ) + request.applymarker(mark) + df = DataFrame(data, index=[1, 2], columns=["x", "x"]) expected_warning = None @@ -141,7 +147,7 @@ def test_frame_non_unique_columns(self, orient, data): "The default 'epoch' date format is deprecated and will be removed " "in a future version, please use 'iso' date format instead." ) - if df.iloc[:, 0].dtype == "datetime64[ns]": + if df.iloc[:, 0].dtype == "datetime64[s]": expected_warning = FutureWarning with tm.assert_produces_warning(expected_warning, match=msg): @@ -150,7 +156,7 @@ def test_frame_non_unique_columns(self, orient, data): ) if orient == "values": expected = DataFrame(data) - if expected.iloc[:, 0].dtype == "datetime64[ns]": + if expected.iloc[:, 0].dtype == "datetime64[s]": # orient == "values" by default will write Timestamp objects out # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need @@ -856,6 +862,10 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): data.append("a") ser = Series(data, index=data) + if not as_object: + ser = ser.astype("M8[ns]") + if isinstance(ser.index, DatetimeIndex): + ser.index = ser.index.as_unit("ns") expected_warning = None if date_format == "epoch": @@ -897,6 +907,7 @@ def test_convert_dates_infer(self, infer_word): expected = DataFrame( [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] ) + expected[infer_word] = expected[infer_word].astype("M8[ns]") result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index df76b46cc6a7b..b665cfba8bdc0 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -40,9 +40,7 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") + expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -64,6 +62,7 @@ def test_read_csv_local(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) @@ -144,9 +143,6 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers 
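The parser changes in this stretch encode a single behavior: ``parse_dates`` now yields second-resolution datetimes for plain date strings, which is why the per-engine ``as_unit("ns")`` workarounds are being deleted. Roughly (a sketch, not lifted from the test files):

>>> from io import StringIO
>>> import pandas as pd
>>> df = pd.read_csv(StringIO("date,x\n2000-01-03,1\n"), parse_dates=["date"])
>>> df["date"].dtype
dtype('<M8[s]')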
result = parser.read_csv(csv1, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -168,6 +164,7 @@ def test_read_csv_dataframe(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 2fcc80f58ae30..4cfc12cdc46aa 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -260,7 +260,8 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): datetime(2000, 1, 5), datetime(2000, 1, 6), datetime(2000, 1, 7), - ] + ], + dtype="M8[s]", ), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 649a1324686a7..348c19ac0f0c6 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -152,7 +152,8 @@ def test_multi_thread_path_multipart_read_csv(all_parsers): with tm.ensure_clean(file_name) as path: df.to_csv(path) - final_dataframe = _generate_multi_thread_dataframe( - parser, path, num_rows, num_tasks - ) - tm.assert_frame_equal(df, final_dataframe) + result = _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks) + + expected = df[:] + expected["date"] = expected["date"].astype("M8[s]") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 3bb3d793606e1..e9c6c0f5e32d7 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -62,6 +62,7 @@ def test_date_col_as_index_col(all_parsers): datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 22, 0), ], + dtype="M8[s]", name="X1", ) expected = DataFrame( @@ -90,7 +91,7 @@ def test_nat_parse(all_parsers): df = DataFrame( { "A": np.arange(10, dtype="float64"), - "B": Timestamp("20010101").as_unit("ns"), + "B": Timestamp("20010101"), } ) df.iloc[3:6, :] = np.nan @@ -126,7 +127,7 @@ def test_parse_dates_string(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip - index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) + index = date_range("1/1/2009", periods=3, name="date", unit="s")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index @@ -143,6 +144,8 @@ def test_parse_dates_column_list(all_parsers, parse_dates): expected = DataFrame( {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} ) + expected["a"] = expected["a"].astype("M8[s]") + expected["c"] = expected["c"].astype("M8[s]") expected = expected.set_index(["a", "b"]) result = parser.read_csv( @@ -166,9 +169,10 @@ def test_multi_index_parse_dates(all_parsers, index_col): 20090103,three,c,4,5 """ parser = all_parsers + dti = date_range("2009-01-01", periods=3, freq="D", unit="s") index = MultiIndex.from_product( [ - (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + dti, ("one", "two", "three"), ], names=["index1", "index2"], @@ -209,9 +213,6 @@ def test_parse_tz_aware(all_parsers): data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) - # TODO: make unit check more 
specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) @@ -302,6 +303,7 @@ def test_parse_dates_empty_string(all_parsers): expected = DataFrame( [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] ) + expected["Date"] = expected["Date"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -312,18 +314,22 @@ def test_parse_dates_empty_string(all_parsers): ( "a\n04.15.2016", {"parse_dates": ["a"]}, - DataFrame([datetime(2016, 4, 15)], columns=["a"]), + DataFrame([datetime(2016, 4, 15)], columns=["a"], dtype="M8[s]"), ), ( "a\n04.15.2016", {"parse_dates": True, "index_col": 0}, - DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), + DataFrame( + index=DatetimeIndex(["2016-04-15"], dtype="M8[s]", name="a"), columns=[] + ), ), ( "a,b\n04.15.2016,09.16.2013", {"parse_dates": ["a", "b"]}, DataFrame( - [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], + dtype="M8[s]", + columns=["a", "b"], ), ), ( @@ -331,7 +337,13 @@ def test_parse_dates_empty_string(all_parsers): {"parse_dates": True, "index_col": [0, 1]}, DataFrame( index=MultiIndex.from_tuples( - [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + [ + ( + Timestamp(2016, 4, 15).as_unit("s"), + Timestamp(2013, 9, 16).as_unit("s"), + ) + ], + names=["a", "b"], ), columns=[], ), @@ -399,6 +411,7 @@ def test_parse_timezone(all_parsers): end="2018-01-04 09:05:00", freq="1min", tz=timezone(timedelta(minutes=540)), + unit="s", )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} @@ -437,7 +450,7 @@ def test_parse_delimited_date_swap_no_warning( all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") if parser.engine == "pyarrow": if not dayfirst: # "CSV parse error: Empty CSV file or block" @@ -470,7 +483,7 @@ def test_parse_delimited_date_swap_with_warning( all_parsers, date_string, dayfirst, expected ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." @@ -555,9 +568,7 @@ def test_date_parser_multiindex_columns(all_parsers): 1,2 2019-12-31,6""" result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) - expected = DataFrame( - {("a", "1"): Timestamp("2019-12-31").as_unit("ns"), ("b", "2"): [6]} - ) + expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]}) tm.assert_frame_equal(result, expected) @@ -591,6 +602,7 @@ def test_date_parser_usecols_thousands(all_parsers): thousands="-", ) expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) + expected["C"] = expected["C"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -600,7 +612,7 @@ def test_dayfirst_warnings(): # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. 
" @@ -661,7 +673,7 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): # GH47880 initial_value = f"date\n{date_string}" expected = DatetimeIndex( - ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-01-31"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " @@ -716,7 +728,8 @@ def test_replace_nans_before_parsing_dates(all_parsers): pd.NaT, Timestamp("2017-09-09"), ] - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -731,6 +744,7 @@ def test_parse_dates_and_string_dtype(all_parsers): result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) expected["a"] = expected["a"].astype("string") + expected["b"] = expected["b"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -750,7 +764,7 @@ def test_parse_dot_separated_dates(all_parsers): else: expected_index = DatetimeIndex( ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], - dtype="datetime64[ns]", + dtype="datetime64[ms]", name="a", ) warn = UserWarning @@ -783,7 +797,8 @@ def test_parse_dates_dict_format(all_parsers): { "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -816,9 +831,6 @@ def test_parse_dates_arrow_engine(all_parsers): 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 0a9f6bd83e0d9..45d630c545565 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -298,7 +298,8 @@ def test_fwf_regression(): "2009-06-13 20:40:00", "2009-06-13 20:50:00", "2009-06-13 21:00:00", - ] + ], + dtype="M8[us]", ), columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], ) @@ -311,6 +312,7 @@ def test_fwf_regression(): parse_dates=True, date_format="%Y%j%H%M%S", ) + expected.index = expected.index.astype("M8[s]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 17a806d05fe28..99642ee4befc6 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -42,7 +42,9 @@ def test_skip_rows_bug(all_parsers, skiprows): StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( @@ -85,7 +87,9 @@ def test_skip_rows_blank(all_parsers): StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 0cf3fe894c916..cc54f2487aa60 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -70,7 +70,7 @@ def 
test_usecols_with_parse_dates3(all_parsers): parse_dates = [0] cols = { - "a": Timestamp("2016-09-21").as_unit("ns"), + "a": Timestamp("2016-09-21"), "b": [1], "c": [1], "d": [2], diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 471f7b8958ee4..3ce30e313cc30 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -613,10 +613,14 @@ def test_store_index_name(setup_path): @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz): # GH #13492 - idx = DatetimeIndex( - [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], - name="cols\u05d2", - ).tz_localize(tz) + idx = ( + DatetimeIndex( + [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], + name="cols\u05d2", + ) + .tz_localize(tz) + .as_unit(unit) + ) idx1 = ( DatetimeIndex( [dt.date(2010, 1, 1), dt.date(2010, 1, 2)], diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index f6fb032b9d51a..c609ae999d47d 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -72,7 +72,9 @@ def test_read_csv(cleared_fs, df1): w.write(text) df2 = read_csv("memory://test/test.csv", parse_dates=["dt"]) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_reasonable_error(monkeypatch, cleared_fs): @@ -95,7 +97,9 @@ def test_to_csv(cleared_fs, df1): df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_to_excel(cleared_fs, df1): @@ -106,7 +110,9 @@ def test_to_excel(cleared_fs, df1): df2 = read_excel(path, parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) @pytest.mark.parametrize("binary_mode", [False, True]) @@ -128,7 +134,9 @@ def test_to_csv_fsspec_object(cleared_fs, binary_mode, df1): ) assert not fsspec_object.closed - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_csv_options(fsspectest): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4b2be41d0c9f9..17b89c9f31616 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -107,7 +107,11 @@ def from_uri(path): df1.to_markdown(path) df2 = df1 - tm.assert_frame_equal(df1, df2) + expected = df1[:] + if format in ["csv", "excel"]: + expected["dt"] = expected["dt"].dt.as_unit("s") + + tm.assert_frame_equal(df2, expected) def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 594c1d02b94cc..dfc9b4156ecab 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1044,11 +1044,15 @@ def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) - expected = df.to_html() - res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) - tm.assert_frame_equal(df, res[0]) - res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) - tm.assert_frame_equal(df, res[0]) + + expected = df[:] 
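Several of the IO round-trip tests in this area reconcile resolutions with ``Series.dt.as_unit`` / ``DatetimeIndex.as_unit`` instead of rebuilding the expected frame; casting from a coarser to a finer unit is lossless, so the comparison stays meaningful (illustrative):

>>> import pandas as pd
>>> s = pd.Series(pd.to_datetime(["2001-01-01"]))  # inferred as seconds
>>> s.dt.as_unit("ns").dtype  # upcast when a test still wants nanoseconds
dtype('<M8[ns]')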
+ expected["date"] = expected["date"].dt.as_unit("s") + + str_df = df.to_html() + res = flavor_read_html(StringIO(str_df), parse_dates=[1], index_col=0) + tm.assert_frame_equal(expected, res[0]) + res = flavor_read_html(StringIO(str_df), parse_dates=["date"], index_col=0) + tm.assert_frame_equal(expected, res[0]) def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index de6d46492e916..c7d9300c0a638 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -321,6 +321,8 @@ def test_orc_dtype_backend_pyarrow(): ], } ) + # FIXME: without casting to ns we do not round-trip correctly + df["datetime_with_nat"] = df["datetime_with_nat"].astype("M8[ns]") bytes_data = df.copy().to_orc() result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2860b3a6483af..35275f3c23bef 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -670,6 +670,7 @@ def test_read_empty_array(self, pa, dtype): class TestParquetPyArrow(Base): + @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") def test_basic(self, pa, df_full): df = df_full pytest.importorskip("pyarrow", "11.0.0") @@ -706,6 +707,14 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None + if pa_version_under11p0: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ns]" + ) + else: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ms]" + ) tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): @@ -961,7 +970,11 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - check_round_trip(df, pa, check_dtype=False) + + expected = df[:] + if pa_version_under11p0: + expected.index = expected.index.as_unit("ns") + check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 @@ -988,13 +1001,14 @@ def test_read_dtype_backend_pyarrow_config(self, pa, df_full): if pa_version_under13p0: # pyarrow infers datetimes as us instead of ns expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]") - expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( - "timestamp[us][pyarrow]" - ) expected["datetime_tz"] = expected["datetime_tz"].astype( pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels")) ) + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "timestamp[ms][pyarrow]" + ) + check_round_trip( df, engine=pa, @@ -1018,6 +1032,7 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + @pytest.mark.xfail(reason="pa.pandas_compat passes 'datetime64' to .astype") def test_columns_dtypes_not_invalid(self, pa): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -1107,9 +1122,11 @@ def test_infer_string_large_string_type(self, tmp_path, pa): # df.to_parquet(tmp_path / "test.parquet") # result = read_parquet(tmp_path / "test.parquet") # assert result["strings"].dtype == "string" + # FIXME: don't leave commented-out class TestParquetFastParquet(Base): + 
@pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") def test_basic(self, fp, df_full): df = df_full @@ -1254,6 +1271,25 @@ def test_error_on_using_partition_cols_and_partition_on( partition_cols=partition_cols, ) + def test_empty_dataframe(self, fp): + # GH #27339 + df = pd.DataFrame() + expected = df.copy() + check_round_trip(df, fp, expected=expected) + + @pytest.mark.xfail( + reason="fastparquet passed mismatched values/dtype to DatetimeArray " + "constructor, see https://github.com/dask/fastparquet/issues/891" + ) + def test_timezone_aware_index(self, fp, timezone_aware_date_list): + idx = 5 * [timezone_aware_date_list] + + df = pd.DataFrame(index=idx, data={"index_as_col": idx}) + + expected = df.copy() + expected.index.name = "index" + check_round_trip(df, fp, expected=expected) + def test_close_file_handle_on_read_error(self): with tm.ensure_clean("test.parquet") as path: pathlib.Path(path).write_bytes(b"breakit") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6058f34d25ad3..df821fb740af8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -19,10 +19,7 @@ import pytest from pandas._libs import lib -from pandas.compat import ( - pa_version_under13p0, - pa_version_under14p1, -) +from pandas.compat import pa_version_under14p1 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -368,7 +365,7 @@ def create_and_load_postgres_datetz(conn): Timestamp("2000-01-01 08:00:00", tz="UTC"), Timestamp("2000-06-01 07:00:00", tz="UTC"), ] - return Series(expected_data, name="DateColWithTz") + return Series(expected_data, name="DateColWithTz").astype("M8[us, UTC]") def check_iris_frame(frame: DataFrame): @@ -1824,7 +1821,7 @@ def test_api_custom_dateparsing_error( pytest.mark.xfail(reason="failing combination of arguments") ) - expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) + expected = types_data_frame.astype({"DateCol": "datetime64[s]"}) result = read_sql( text, @@ -1847,10 +1844,12 @@ def test_api_custom_dateparsing_error( } ) - if not pa_version_under13p0: - # TODO: is this astype safe? 
- expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") - + if conn_name == "postgresql_adbc_types" and pa_version_under14p1: + expected["DateCol"] = expected["DateCol"].astype("datetime64[ns]") + elif "postgres" in conn_name or "mysql" in conn_name: + expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") + else: + expected["DateCol"] = expected["DateCol"].astype("datetime64[s]") tm.assert_frame_equal(result, expected) @@ -2835,7 +2834,9 @@ def test_datetime_with_timezone_table(conn, request): conn = request.getfixturevalue(conn) expected = create_and_load_postgres_datetz(conn) result = sql.read_sql_table("datetz", conn) - tm.assert_frame_equal(result, expected.to_frame()) + + exp_frame = expected.to_frame() + tm.assert_frame_equal(result, exp_frame) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2847,7 +2848,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): # For dbs that support timestamps with timezones, should get back UTC # otherwise naive data should be returned expected = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific", unit="us")} ) assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3 @@ -2865,7 +2866,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): if "sqlite" in conn_name: # read_sql_query does not return datetime type like read_sql_table assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) + result["A"] = to_datetime(result["A"]).dt.as_unit("us") tm.assert_frame_equal(result, expected) @@ -2876,7 +2877,9 @@ def test_out_of_bounds_datetime(conn, request): data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1 result = sql.read_sql_table("test_datetime_obb", conn) - expected = DataFrame([pd.NaT], columns=["date"]) + expected = DataFrame( + np.array([datetime(9999, 1, 1)], dtype="M8[us]"), columns=["date"] + ) tm.assert_frame_equal(result, expected) @@ -2885,7 +2888,7 @@ def test_naive_datetimeindex_roundtrip(conn, request): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC conn = request.getfixturevalue(conn) - dates = date_range("2018-01-01", periods=5, freq="6h")._with_freq(None) + dates = date_range("2018-01-01", periods=5, freq="6h", unit="us")._with_freq(None) expected = DataFrame({"nums": range(5)}, index=dates) assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 result = sql.read_sql_table("foo_table", conn, index_col="info_date") @@ -2937,7 +2940,10 @@ def test_datetime(conn, request): # with read_table -> type information from schema used result = sql.read_sql_table("test_datetime", conn) result = result.drop("index", axis=1) - tm.assert_frame_equal(result, df) + + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) @@ -2945,9 +2951,7 @@ def test_datetime(conn, request): if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2962,16 +2966,17 @@ def test_datetime_NaT(conn, request): # with read_table -> type 
information from schema used result = sql.read_sql_table("test_datetime", conn) - tm.assert_frame_equal(result, df) + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"], errors="coerce") - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -3963,6 +3968,7 @@ def test_self_join_date_columns(postgresql_psycopg2_engine): expected = DataFrame( [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 ) + expected["created_dt"] = expected["created_dt"].astype("M8[us, UTC]") tm.assert_frame_equal(result, expected) # Cleanup diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 2f981953a6237..d5134a3e3afd0 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -181,9 +181,7 @@ def test_read_dta2(self, datapath): expected["monthly_date"] = expected["monthly_date"].astype("M8[s]") expected["quarterly_date"] = expected["quarterly_date"].astype("M8[s]") expected["half_yearly_date"] = expected["half_yearly_date"].astype("M8[s]") - expected["yearly_date"] = ( - expected["yearly_date"].astype("Period[s]").array.view("M8[s]") - ) + expected["yearly_date"] = expected["yearly_date"].astype("M8[s]") path1 = datapath("io", "data", "stata", "stata2_114.dta") path2 = datapath("io", "data", "stata", "stata2_115.dta") @@ -206,9 +204,9 @@ def test_read_dta2(self, datapath): # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) @pytest.mark.parametrize( "file", ["stata3_113", "stata3_114", "stata3_115", "stata3_117"] @@ -952,8 +950,8 @@ def test_big_dates(self, datapath, temp_file): parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta")) parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta")) - tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True) - tm.assert_frame_equal(expected, parsed_117, check_datetimelike_compat=True) + tm.assert_frame_equal(expected, parsed_115) + tm.assert_frame_equal(expected, parsed_117) date_conversion = {c: c[-2:] for c in columns} # {c : c[-2:] for c in columns} @@ -965,7 +963,6 @@ def test_big_dates(self, datapath, temp_file): tm.assert_frame_equal( written_and_read_again.set_index("index"), expected.set_index(expected.index.astype(np.int32)), - check_datetimelike_compat=True, ) def test_dtype_conversion(self, datapath): @@ -1252,7 +1249,9 @@ def test_read_chunks_117( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize @@ -1344,7 +1343,9 @@ def 
test_read_chunks_115( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 3428abacd509e..f4ea6b1d3f3de 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -557,7 +557,8 @@ def test_first_last_skipna(any_real_nullable_dtype, skipna, how): method = getattr(rs, how) result = method(skipna=skipna) - gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + ts = pd.to_datetime("2020-01-31").as_unit("ns") + gb = df.groupby(df.shape[0] * [ts]) expected = getattr(gb, how)(skipna=skipna) expected.index.freq = "ME" tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 5f5a54c4d92a3..2646106b9b97c 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -421,11 +421,13 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df) ) volume = [50, 50, 60] - week_starting = [ - Timestamp("2018-01-07"), - Timestamp("2018-01-18 01:00:00"), - Timestamp("2018-01-14"), - ] + week_starting = pd.DatetimeIndex( + [ + Timestamp("2018-01-07"), + Timestamp("2018-01-18 01:00:00"), + Timestamp("2018-01-14"), + ] + ).as_unit("ns") expected_ind = pd.MultiIndex.from_arrays( [volume, week_starting], names=["volume", "week_starting"], diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index c831cb8293943..afafe8f6ab264 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -19,12 +19,12 @@ "float64": [1.1, np.nan, 3.3], "category": Categorical(["X", "Y", "Z"]), "object": ["a", "b", "c"], - "datetime64[ns]": [ + "datetime64[s]": [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timestamp("2011-01-03"), ], - "datetime64[ns, US/Eastern]": [ + "datetime64[s, US/Eastern]": [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), pd.Timestamp("2011-01-03", tz="US/Eastern"), diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 3e046b2df72d8..89a3c3c5ed8bc 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -213,7 +213,7 @@ def test_concat_NaT_dataframes(self, tz): @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) - @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")]) + @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101").as_unit("ns")]) def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item): # GH 12396 @@ -358,7 +358,7 @@ def test_concat_tz_series_tzlocal(self): result = concat([Series(x), Series(y)], ignore_index=True) tm.assert_series_equal(result, Series(x + y)) - assert result.dtype == "datetime64[ns, tzlocal()]" + assert result.dtype == "datetime64[s, tzlocal()]" def test_concat_tz_series_with_datetimelike(self): # see gh-12620: tz and timedelta diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 340c5c449aea7..d8bb4fba1e1fe 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py 
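The dtype changes running through these test updates all encode one behavior: pandas now keeps the resolution it infers from the input instead of forcing everything to nanoseconds. A minimal sketch of the inference the new assertions rely on (assuming a pandas 3.0 development build):

import pandas as pd
from datetime import datetime

# whole-second strings/Timestamps infer a second ("s") unit
ser = pd.Series([pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")])
assert ser.dtype == "datetime64[s]"

# Python datetime objects carry microsecond precision, so "us" is inferred
ser2 = pd.Series([datetime(2011, 1, 1), datetime(2011, 1, 2)])
assert ser2.dtype == "datetime64[us]"

# nanosecond resolution is still available on request
assert ser.dt.as_unit("ns").dtype == "datetime64[ns]"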
@@ -1,3 +1,5 @@ +from datetime import datetime + import numpy as np import pytest @@ -445,10 +447,16 @@ def test_datetime_bin(conv): Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CategoricalDtype(ordered=True)) + ) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) + + if type(bins[0]) is datetime: + # The bins have microsecond dtype -> so does result + expected = expected.astype("interval[datetime64[us]]") + + expected = expected.astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -461,10 +469,6 @@ def test_datetime_cut(unit, box): data = box(data) result, _ = cut(data, 3, retbins=True) - if box is list: - # We don't (yet) do inference on these, so get nanos - unit = "ns" - if unit == "s": # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425 # for why we round to 8 seconds instead of 7 @@ -531,24 +535,26 @@ def test_datetime_tz_cut(bins, box): bins = box(bins) result = cut(ser, bins) - expected = Series( - IntervalIndex( - [ - Interval( - Timestamp("2012-12-31 23:57:07.200000", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz), - ), - ] - ) - ).astype(CategoricalDtype(ordered=True)) + ii = IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + if isinstance(bins, int): + # the dtype is inferred from ser, which has nanosecond unit + ii = ii.astype("interval[datetime64[ns, US/Eastern]]") + expected = Series(ii).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 53af673e0f7b0..5f769db7f8acf 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -271,8 +271,10 @@ def test_datetime_tz_qcut(bins): ], ], ) -def test_date_like_qcut_bins(arg, expected_bins): +def test_date_like_qcut_bins(arg, expected_bins, unit): # see gh-19891 + arg = arg.as_unit(unit) + expected_bins = expected_bins.as_unit(unit) ser = Series(arg) result, result_bins = qcut(ser, 2, retbins=True) tm.assert_index_equal(result_bins, expected_bins) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index e352e2601cef3..131be7a77f2e5 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -439,8 +439,10 @@ def test_nat_rfloordiv_timedelta(val, expected): @pytest.mark.parametrize( "value", [ - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - DatetimeIndex(["2011-01-01", "2011-01-02"], tz="US/Eastern", name="x"), + DatetimeIndex(["2011-01-01", "2011-01-02"], dtype="M8[ns]", name="x"), + DatetimeIndex( + ["2011-01-01", "2011-01-02"], dtype="M8[ns, US/Eastern]", name="x" + ), DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"], dtype="M8[ns]"), DatetimeArray._from_sequence( ["2011-01-01", "2011-01-02"], dtype=DatetimeTZDtype(tz="US/Pacific") diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 0f2f533c8feff..293919173c2d5 100644 --- 
a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -78,7 +78,7 @@ def test_combine_first_dt64(self, unit): s1 = Series([np.nan, "2011"]) rs = s0.combine_first(s1) - xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") + xp = Series([datetime(2010, 1, 1), "2011"], dtype=f"datetime64[{unit}]") tm.assert_series_equal(rs, xp) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 592dba253532d..c10bb8278a3d1 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -411,7 +411,7 @@ def test_datetime64_tz_fillna(self, tz, unit): Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00", tz=tz), - ] + ], ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index f7dec02ab0e5b..488d0cb9fe9da 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -31,7 +31,9 @@ def test_from_csv(self, datetime_series, string_series, temp_file): path = temp_file datetime_series.to_csv(path, header=False) ts = self.read_csv(path, parse_dates=True) - tm.assert_series_equal(datetime_series, ts, check_names=False) + expected = datetime_series.copy() + expected.index = expected.index.as_unit("s") + tm.assert_series_equal(expected, ts, check_names=False) assert ts.name is None assert ts.index.name is None @@ -57,6 +59,7 @@ def test_from_csv(self, datetime_series, string_series, temp_file): series = self.read_csv(path, sep="|", parse_dates=True) check_series = Series({datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0}) + check_series.index = check_series.index.as_unit("s") tm.assert_series_equal(check_series, series) series = self.read_csv(path, sep="|", parse_dates=False) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3f9d5bbe806bb..00c614cf72c20 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -752,7 +752,7 @@ def test_constructor_pass_nan_nat(self): tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([NaT, NaT]) - assert exp.dtype == "datetime64[ns]" + assert exp.dtype == "datetime64[s]" tm.assert_series_equal(Series([NaT, NaT]), exp) tm.assert_series_equal(Series(np.array([NaT, NaT])), exp) @@ -934,7 +934,7 @@ def test_constructor_datetimes_with_nulls(self): np.array([None, None, datetime.now(), None]), ]: result = Series(arr) - assert result.dtype == "M8[ns]" + assert result.dtype == "M8[us]" def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype="M8[ns]", index=range(5)) @@ -962,15 +962,15 @@ def test_constructor_dtype_datetime64_10(self): dates = [np.datetime64(x) for x in pydates] ser = Series(dates) - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" ser.iloc[0] = np.nan - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") + result = Series(Series(dates).astype(np.int64) / 1000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") @@ -1084,16 +1084,16 @@ def test_constructor_dtype_datetime64_4(self): def test_constructor_dtype_datetime64_3(self): # if we 
passed a NaT it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) - assert ser.dtype == "object" + assert ser.dtype == "M8[us]" assert ser[2] is NaT assert "NaT" in str(ser) def test_constructor_dtype_datetime64_2(self): # if we passed a nan it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert ser.dtype == "object" - assert ser[2] is np.nan - assert "NaN" in str(ser) + assert ser.dtype == "M8[us]" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_with_datetime_tz(self): # 8260 @@ -1155,7 +1155,7 @@ def test_constructor_with_datetime_tz4(self): Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ] ) - assert ser.dtype == "datetime64[ns, US/Pacific]" + assert ser.dtype == "datetime64[s, US/Pacific]" assert lib.infer_dtype(ser, skipna=True) == "datetime64" def test_constructor_with_datetime_tz3(self): @@ -1215,7 +1215,7 @@ def test_construction_to_datetimelike_unit(self, arr_dtype, kind, unit): def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype="datetime64[ns, CET]") - expected = Series(Timestamp(arg)).dt.tz_localize("CET") + expected = Series([Timestamp(arg)], dtype="M8[ns]").dt.tz_localize("CET") tm.assert_series_equal(result, expected) def test_constructor_datetime64_bigendian(self): @@ -1356,14 +1356,8 @@ def test_constructor_dict_order(self): expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) - def test_constructor_dict_extension(self, ea_scalar_and_dtype, request): + def test_constructor_dict_extension(self, ea_scalar_and_dtype): ea_scalar, ea_dtype = ea_scalar_and_dtype - if isinstance(ea_scalar, Timestamp): - mark = pytest.mark.xfail( - reason="Construction from dict goes through " - "maybe_convert_objects which casts to nano" - ) - request.applymarker(mark) d = {"a": ea_scalar} result = Series(d, index=["a"]) expected = Series(ea_scalar, index=["a"], dtype=ea_dtype) @@ -1408,7 +1402,9 @@ def create_data(constructor): result_Timestamp = Series(data_Timestamp) tm.assert_series_equal(result_datetime64, expected) - tm.assert_series_equal(result_datetime, expected) + tm.assert_series_equal( + result_datetime, expected.set_axis(expected.index.as_unit("us")) + ) tm.assert_series_equal(result_Timestamp, expected) def test_constructor_dict_tuple_indexer(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6da6ad27f853f..134ebededd163 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1264,6 +1264,7 @@ def test_value_counts_datetime_outofbounds(self, dtype): ], dtype=dtype, ) + res = ser.value_counts() exp_index = Index( diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b05c30fa50fbe..cbbd018720bad 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -117,7 +117,9 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): ser = Series([19801222, 19801222] + [19810105] * 5, dtype="float") # with NaT expected = Series( - [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5 + [Timestamp("19801222"), Timestamp("19801222")] + + [Timestamp("19810105")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan @@ -143,19 +145,32 @@ def test_to_datetime_format_YYYYMM_with_nat(self, cache): # Explicit cast to float to explicit cast when setting np.nan ser = Series([198012, 198012] + [198101] * 5, dtype="float") expected = Series( - 
[Timestamp("19801201"), Timestamp("19801201")] + [Timestamp("19810101")] * 5 + [Timestamp("19801201"), Timestamp("19801201")] + + [Timestamp("19810101")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan result = to_datetime(ser, format="%Y%m", cache=cache) tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_oob_for_ns(self, cache): + # coercion + # GH 7930, GH 14487 + ser = Series([20121231, 20141231, 99991231]) + result = to_datetime(ser, format="%Y%m%d", errors="raise", cache=cache) + expected = Series( + np.array(["2012-12-31", "2014-12-31", "9999-12-31"], dtype="M8[s]"), + dtype="M8[s]", + ) + tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_coercion(self, cache): # coercion # GH 7930 - ser = Series([20121231, 20141231, 99991231]) + ser = Series([20121231, 20141231, 999999999999999999999999999991231]) result = to_datetime(ser, format="%Y%m%d", errors="coerce", cache=cache) - expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") + expected = Series(["20121231", "20141231", "NaT"], dtype="M8[s]") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -532,7 +547,8 @@ def test_to_datetime_overflow(self): res = to_datetime(arg, errors="coerce") assert res is NaT res = to_datetime([arg], errors="coerce") - tm.assert_index_equal(res, Index([NaT])) + exp = Index([NaT], dtype="M8[s]") + tm.assert_index_equal(res, exp) def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior @@ -563,7 +579,7 @@ def test_to_datetime_mixed_date_and_string(self, format): # https://github.com/pandas-dev/pandas/issues/50108 d1 = date(2020, 1, 2) res = to_datetime(["2020-01-01", d1], format=format) - expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[ns]") + expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[s]") tm.assert_index_equal(res, expected) @pytest.mark.parametrize( @@ -579,7 +595,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, with utc", ), @@ -588,7 +604,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - ), + ).as_unit("us"), id="all tz-aware, without utc", ), pytest.param( @@ -596,7 +612,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, mixed offsets, with utc", ), @@ -605,7 +621,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="tz-aware string, naive pydatetime, with utc", ), @@ -625,6 +641,8 @@ def test_to_datetime_mixed_datetime_and_string_with_format( ts1 = constructor(args[0]) ts2 = args[1] result = to_datetime([ts1, ts2], format=fmt, utc=utc) + if constructor is Timestamp: + expected = expected.as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -696,7 +714,7 @@ def 
test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%m-%d %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="ISO8601, UTC", ), @@ -704,7 +722,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%d-%m %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="non-ISO8601, UTC", ), @@ -965,7 +983,7 @@ def test_to_datetime_now(self): # See GH#18666 with tm.set_timezone("US/Eastern"): # GH#18705 - now = Timestamp("now").as_unit("ns") + now = Timestamp("now") pdnow = to_datetime("now") pdnow2 = to_datetime(["now"])[0] @@ -988,12 +1006,12 @@ def test_to_datetime_today(self, tz): # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. with tm.set_timezone(tz): - nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + nptoday = np.datetime64("today").astype("datetime64[us]").astype(np.int64) pdtoday = to_datetime("today") pdtoday2 = to_datetime(["today"])[0] - tstoday = Timestamp("today").as_unit("ns") - tstoday2 = Timestamp.today().as_unit("ns") + tstoday = Timestamp("today") + tstoday2 = Timestamp.today() # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -1030,7 +1048,7 @@ def test_to_datetime_now_with_format(self, format, expected_ds, string, attribut # https://github.com/pandas-dev/pandas/issues/50359 result = to_datetime(["2020-01-03 00:00:00Z", string], format=format, utc=True) expected = DatetimeIndex( - [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[ns, UTC]" + [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[s, UTC]" ) assert (expected - result).max().total_seconds() < 1 @@ -1091,11 +1109,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing result = to_datetime(dts, cache=cache) - if cache: - # FIXME: behavior should not depend on cache - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") - else: - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]") + expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") tm.assert_index_equal(result, expected) @@ -1106,14 +1120,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): to_datetime(dts_with_oob, errors="raise") result = to_datetime(dts_with_oob, errors="coerce", cache=cache) - if not cache: - # FIXME: shouldn't depend on cache! 
- expected = DatetimeIndex( - [Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30 - + [NaT], - ) - else: - expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) + expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) tm.assert_index_equal(result, expected) def test_to_datetime_tz(self, cache): @@ -1126,7 +1133,7 @@ def test_to_datetime_tz(self, cache): result = to_datetime(arr, cache=cache) expected = DatetimeIndex( ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_to_datetime_tz_mixed(self, cache): @@ -1145,7 +1152,7 @@ def test_to_datetime_tz_mixed(self, cache): result = to_datetime(arr, cache=cache, errors="coerce") expected = DatetimeIndex( - ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[ns, US/Pacific]" + ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[s, US/Pacific]" ) tm.assert_index_equal(result, expected) @@ -1177,7 +1184,7 @@ def test_to_datetime_tz_pytz(self, cache): result = to_datetime(arr, utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1264,7 +1271,7 @@ def test_to_datetime_tz_psycopg2(self, request, cache): result = to_datetime(arr, errors="coerce", utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1273,15 +1280,15 @@ def test_to_datetime_tz_psycopg2(self, request, cache): i = DatetimeIndex( ["2000-01-01 08:00:00"], tz=psycopg2_tz.FixedOffsetTimezone(offset=-300, name=None), - ) - assert is_datetime64_ns_dtype(i) + ).as_unit("us") + assert not is_datetime64_ns_dtype(i) # tz coercion result = to_datetime(i, errors="coerce", cache=cache) tm.assert_index_equal(result, i) result = to_datetime(i, errors="coerce", utc=True, cache=cache) - expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]") + expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[us, UTC]") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) @@ -1351,16 +1358,20 @@ def test_datetime_invalid_scalar(self, value, format): def test_datetime_outofbounds_scalar(self, value, format): # GH24763 res = to_datetime(value, errors="coerce", format=format) - assert res is NaT + if format is None: + assert isinstance(res, Timestamp) + assert res == Timestamp(value) + else: + assert res is NaT if format is not None: msg = r'^time data ".*" doesn\'t match format ".*", at position 0.' 
with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: - msg = "^Out of bounds .*, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(value, errors="raise", format=format) + res = to_datetime(value, errors="raise", format=format) + assert isinstance(res, Timestamp) + assert res == Timestamp(value) @pytest.mark.parametrize( ("values"), [(["a"]), (["00:01:99"]), (["a", "b", "99:00:00"])] @@ -1433,15 +1444,17 @@ def test_to_datetime_cache_scalar(self): assert result == expected @pytest.mark.parametrize( - "datetimelikes,expected_values", + "datetimelikes,expected_values,exp_unit", ( ( (None, np.nan) + (NaT,) * start_caching_at, (NaT,) * (start_caching_at + 2), + "s", ), ( (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + "s", ), ( (None,) @@ -1449,11 +1462,12 @@ def test_to_datetime_cache_scalar(self): + ("2012 July 26", Timestamp("2012-07-26")), (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + "s", ), ), ) def test_convert_object_to_datetime_with_cache( - self, datetimelikes, expected_values + self, datetimelikes, expected_values, exp_unit ): # GH#39882 ser = Series( @@ -1463,7 +1477,7 @@ def test_convert_object_to_datetime_with_cache( result_series = to_datetime(ser, errors="coerce") expected_series = Series( expected_values, - dtype="datetime64[ns]", + dtype=f"datetime64[{exp_unit}]", ) tm.assert_series_equal(result_series, expected_series) @@ -1484,7 +1498,7 @@ def test_convert_object_to_datetime_with_cache( ) def test_to_datetime_converts_null_like_to_nat(self, cache, input): # GH35888 - expected = Series([NaT] * len(input), dtype="M8[ns]") + expected = Series([NaT] * len(input), dtype="M8[s]") result = to_datetime(input, cache=cache) tm.assert_series_equal(result, expected) @@ -1535,7 +1549,17 @@ def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): # https://github.com/pandas-dev/pandas/issues/50255 ts_strings = [string_arg, outofbounds] result = to_datetime(ts_strings, errors="coerce", format=format) - expected = DatetimeIndex([datetime(2018, 3, 1), NaT]) + if isinstance(outofbounds, str) and ( + format.startswith("%B") ^ outofbounds.startswith("J") + ): + # the strings don't match the given format, so they raise and we coerce + expected = DatetimeIndex([datetime(2018, 3, 1), NaT], dtype="M8[s]") + elif isinstance(outofbounds, datetime): + expected = DatetimeIndex( + [datetime(2018, 3, 1), outofbounds], dtype="M8[us]" + ) + else: + expected = DatetimeIndex([datetime(2018, 3, 1), outofbounds], dtype="M8[s]") tm.assert_index_equal(result, expected) def test_to_datetime_malformed_no_raise(self): @@ -1546,7 +1570,9 @@ def test_to_datetime_malformed_no_raise(self): UserWarning, match="Could not infer format", raise_on_extra_warnings=False ): result = to_datetime(ts_strings, errors="coerce") - tm.assert_index_equal(result, Index([NaT, NaT])) + # TODO: should Index get "s" by default here? 
+ exp = Index([NaT, NaT], dtype="M8[s]") + tm.assert_index_equal(result, exp) def test_to_datetime_malformed_raise(self): # GH 48633 @@ -1594,7 +1620,7 @@ def test_iso_8601_strings_with_different_offsets_utc(self): result = to_datetime(ts_strings, utc=True) expected = DatetimeIndex( [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_mixed_offsets_with_native_datetime_utc_false_raises(self): @@ -1620,7 +1646,7 @@ def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) expected = DatetimeIndex( [datetime(2018, 3, 1, 12, tzinfo=timezone(timedelta(minutes=240)))] * 2 - ) + ).as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1641,9 +1667,11 @@ def test_timestamp_utc_true(self, ts, expected): @pytest.mark.parametrize("dt_str", ["00010101", "13000101", "30000101", "99990101"]) def test_to_datetime_with_format_out_of_bounds(self, dt_str): # GH 9107 - msg = "Out of bounds nanosecond timestamp" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(dt_str, format="%Y%m%d") + res = to_datetime(dt_str, format="%Y%m%d") + dtobj = datetime.strptime(dt_str, "%Y%m%d") + expected = Timestamp(dtobj).as_unit("s") + assert res == expected + assert res.unit == expected.unit def test_to_datetime_utc(self): arr = np.array([parse("2012-06-13T01:39:00Z")], dtype=object) @@ -1726,7 +1754,7 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # In 3.0, the string "1.5" is parsed as it would be without unit, # which fails. With errors="coerce" this becomes NaT. res = to_datetime(["1.5"], unit=unit, errors="coerce") - expected = to_datetime([NaT]) + expected = to_datetime([NaT]).as_unit("ns") tm.assert_index_equal(res, expected) # round floats are OK @@ -2149,7 +2177,7 @@ def test_dataframe_utc_true(self): df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = to_datetime(df, utc=True) expected = Series( - np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[s]") ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -2361,7 +2389,9 @@ def test_to_datetime_with_space_in_series(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) - expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) + expected_coerce = Series( + [datetime(2006, 10, 18), datetime(2008, 10, 18), NaT] + ).dt.as_unit("s") tm.assert_series_equal(result_coerce, expected_coerce) @td.skip_if_not_us_locale @@ -2473,7 +2503,7 @@ def test_string_na_nat_conversion(self, cache): strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) - expected = np.empty(4, dtype="M8[ns]") + expected = np.empty(4, dtype="M8[s]") for i, val in enumerate(strings): if isna(val): expected[i] = iNaT @@ -2518,7 +2548,7 @@ def test_string_na_nat_conversion_with_name(self, cache): result = to_datetime(series, cache=cache) dresult = to_datetime(dseries, cache=cache) - expected = Series(np.empty(5, dtype="M8[ns]"), index=idx) + expected = Series(np.empty(5, dtype="M8[s]"), index=idx) for i in range(5): x = series.iloc[i] if isna(x): @@ -2558,7 +2588,7 @@ def test_dayfirst(self, cache): arr = ["10/02/2014", "11/02/2014", "12/02/2014"] expected = DatetimeIndex( [datetime(2014, 2, 10), datetime(2014, 2, 11),
datetime(2014, 2, 12)] - ) + ).as_unit("s") idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) idx3 = to_datetime(arr, dayfirst=True, cache=cache) @@ -2582,7 +2612,7 @@ def test_dayfirst_warnings_valid_input(self): # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None ) # A. dayfirst arg correct, no warning @@ -2687,7 +2717,7 @@ def test_to_datetime_consistent_format(self, cache): ser = Series(np.array(data)) result = to_datetime(ser, cache=cache) expected = Series( - ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[ns]" + ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2699,9 +2729,7 @@ def test_to_datetime_series_with_nans(self, cache): ) ) result = to_datetime(ser, cache=cache) - expected = Series( - ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[ns]" - ) + expected = Series(["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[s]") tm.assert_series_equal(result, expected) def test_to_datetime_series_start_with_nans(self, cache): @@ -2720,7 +2748,7 @@ def test_to_datetime_series_start_with_nans(self, cache): result = to_datetime(ser, cache=cache) expected = Series( - [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[ns]" + [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2734,6 +2762,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): result = to_datetime(ser) tz = timezone(timedelta(minutes=offset)) expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)]) + expected = expected.dt.as_unit("s") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -2890,9 +2919,16 @@ def test_parsers(self, date_str, expected, cache): # https://github.com/dateutil/dateutil/issues/217 yearfirst = True - result1, _ = parsing.parse_datetime_string_with_reso( + result1, reso_attrname = parsing.parse_datetime_string_with_reso( date_str, yearfirst=yearfirst ) + + reso = { + "nanosecond": "ns", + "microsecond": "us", + "millisecond": "ms", + "second": "s", + }.get(reso_attrname, "s") result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below @@ -2907,7 +2943,7 @@ def test_parsers(self, date_str, expected, cache): for res in [result1, result2]: assert res == expected for res in [result3, result4, result6, result8, result9]: - exp = DatetimeIndex([Timestamp(expected)]) + exp = DatetimeIndex([Timestamp(expected)]).as_unit(reso) tm.assert_index_equal(res, exp) # these really need to have yearfirst, but we don't support @@ -2921,7 +2957,7 @@ def test_na_values_with_cache( self, cache, unique_nulls_fixture, unique_nulls_fixture2 ): # GH22305 - expected = Index([NaT, NaT], dtype="datetime64[ns]") + expected = Index([NaT, NaT], dtype="datetime64[s]") result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache) tm.assert_index_equal(result, expected) @@ -3197,9 +3233,16 @@ def test_incorrect_value_exception(self): ) def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 - msg = r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime("2417-10-10 00:00:00", format=format) + if format 
is None: + res = to_datetime("2417-10-10 00:00:00.00", format=format) + assert isinstance(res, Timestamp) + assert res.year == 2417 + assert res.month == 10 + assert res.day == 10 + else: + msg = "unconverted data remains when parsing with format.*, at position 0" + with pytest.raises(ValueError, match=msg): + to_datetime("2417-10-10 00:00:00.00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str", @@ -3331,7 +3374,7 @@ def test_empty_string_datetime(errors, args, format): # coerce empty string to pd.NaT result = to_datetime(td, format=format, errors=errors) - expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[ns]") + expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[s]") tm.assert_series_equal(expected, result) @@ -3371,14 +3414,12 @@ def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length): ) result1 = to_datetime(ser, errors="coerce", utc=True) - expected1 = Series( - [NaT] + ([Timestamp("1991-10-20 00:00:00+00:00")] * series_length) - ) - + expected1 = Series([Timestamp(x) for x in ser]) + assert expected1.dtype == "M8[us, UTC]" tm.assert_series_equal(result1, expected1) - with pytest.raises(OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp"): - to_datetime(ser, errors="raise", utc=True) + result3 = to_datetime(ser, errors="raise", utc=True) + tm.assert_series_equal(result3, expected1) def test_to_datetime_format_f_parse_nanos(): @@ -3463,7 +3504,7 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): # GH 50887 vals = ["2020-01-01 00:00+00:00", ""] result = to_datetime(vals, format="mixed") - expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[ns, UTC]") + expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[s, UTC]") tm.assert_index_equal(result, expected) # Check that a couple of other similar paths work the same way diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index ba000a0439dd1..894f49b2fa140 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -29,7 +29,7 @@ def test_to_timedelta_dt64_raises(self): # supported GH#29794 msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" - ser = Series([pd.NaT]) + ser = Series([pd.NaT], dtype="M8[ns]") with pytest.raises(TypeError, match=msg): to_timedelta(ser) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index 99829857e6836..90e2e117852a2 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -57,10 +57,10 @@ def __init__(self, name=None, rules=None) -> None: jan2 = TestCalendar(rules=[Holiday("jan2", year=2015, month=1, day=2)]) # Getting holidays for Jan 1 should not alter results for Jan 2. 
- expected = DatetimeIndex(["01-Jan-2015"]).as_unit("ns") + expected = DatetimeIndex(["01-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan1.holidays(), expected) - expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("ns") + expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan2.holidays(), expected2) diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 35b72c9bb2887..3c55ae2c6f904 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -15,7 +15,6 @@ tslib, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas import Timestamp import pandas._testing as tm @@ -156,7 +155,7 @@ def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(expected, dtype="M8[ns]") + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -174,6 +173,8 @@ def test_parsing_timezone_offsets(dt_string, expected_tz): # to the same datetime after the timezone offset is added. arr = np.array(["01-01-2013 00:00:00"], dtype=object) expected, _ = tslib.array_to_datetime(arr) + if "000000000" in dt_string: + expected = expected.astype("M8[ns]") arr = np.array([dt_string], dtype=object) result, result_tz = tslib.array_to_datetime(arr) @@ -206,38 +207,35 @@ def test_parsing_different_timezone_offsets(): @pytest.mark.parametrize( - "invalid_date", + "invalid_date,exp_unit", [ - date(1000, 1, 1), - datetime(1000, 1, 1), - "1000-01-01", - "Jan 1, 1000", - np.datetime64("1000-01-01"), + (date(1000, 1, 1), "s"), + (datetime(1000, 1, 1), "us"), + ("1000-01-01", "s"), + ("Jan 1, 1000", "s"), + (np.datetime64("1000-01-01"), "s"), ], ) @pytest.mark.parametrize("errors", ["coerce", "raise"]) -def test_coerce_outside_ns_bounds(invalid_date, errors): +def test_coerce_outside_ns_bounds(invalid_date, exp_unit, errors): arr = np.array([invalid_date], dtype="object") - kwargs = {"values": arr, "errors": errors} - if errors == "raise": - msg = "^Out of bounds nanosecond timestamp: .*, at position 0$" + result, _ = tslib.array_to_datetime(arr, errors=errors) + out_reso = np.datetime_data(result.dtype)[0] + assert out_reso == exp_unit + ts = Timestamp(invalid_date) + assert ts.unit == exp_unit - with pytest.raises(OutOfBoundsDatetime, match=msg): - tslib.array_to_datetime(**kwargs) - else: # coerce. 
- result, _ = tslib.array_to_datetime(**kwargs) - expected = np.array([iNaT], dtype="M8[ns]") - - tm.assert_numpy_array_equal(result, expected) + expected = np.array([ts._value], dtype=f"M8[{exp_unit}]") + tm.assert_numpy_array_equal(result, expected) def test_coerce_outside_ns_bounds_one_valid(): arr = np.array(["1/1/1000", "1/1/2000"], dtype=object) result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = [iNaT, "2000-01-01T00:00:00.000000000"] - expected = np.array(expected, dtype="M8[ns]") + expected = ["1000-01-01T00:00:00.000000000", "2000-01-01T00:00:00.000000000"] + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -247,7 +245,7 @@ def test_coerce_of_invalid_datetimes(): # With coercing, the invalid dates becomes iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] - tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[ns]")) + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) def test_to_datetime_barely_out_of_bounds(): @@ -292,5 +296,5 @@ def test_datetime_subclass(klass): arr = np.array([klass(2000, 1, 1)], dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(["2000-01-01T00:00:00.000000000"], dtype="M8[ns]") + expected = np.array(["2000-01-01T00:00:00.000000"], dtype="M8[us]") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index a54e0071aa006..e654534ccd453 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -260,14 +260,14 @@ def test_categorical_consistency(s1, categorize): tm.assert_series_equal(h1, h3) -def test_categorical_with_nan_consistency(): - c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B") - ) - expected = hash_array(c, categorize=False) - - c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")]) - result = hash_array(c, categorize=False) +def test_categorical_with_nan_consistency(unit): + dti = pd.date_range("2012-01-01", periods=5, name="B", unit=unit) + cat = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], categories=dti) + expected = hash_array(cat, categorize=False) + + ts = pd.Timestamp("2012-01-01").as_unit(unit) + cat2 = pd.Categorical.from_codes([-1, 0], categories=[ts]) + result = hash_array(cat2, categorize=False) assert result[0] in expected assert result[1] in expected From bce10b46fe72e7d517eb2e874e8d4f60f5fced69 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Fri, 31 May 2024 18:41:47 +0000 Subject: [PATCH 24/66] PERF: more realistic np datetime c benchmark (#58165) * make input range for np_datetime.c benchmark more realistic * fix random numbers * fix import * add new benchmark --- asv_bench/benchmarks/tslibs/fields.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py index 3a2baec54109a..fe31879e67a67 100644 --- a/asv_bench/benchmarks/tslibs/fields.py +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -19,10 +19,15 @@ class TimeGetTimedeltaField: def setup(self, size, field): arr = np.random.randint(0, 10, size=size, dtype="i8")
self.i8data = arr + arr = np.random.randint(-86400 * 1_000_000_000, 0, size=size, dtype="i8") + self.i8data_negative = arr def time_get_timedelta_field(self, size, field): get_timedelta_field(self.i8data, field) + def time_get_timedelta_field_negative_td(self, size, field): + get_timedelta_field(self.i8data_negative, field) + class TimeGetDateField: params = [ @@ -72,3 +77,6 @@ def setup(self, size, side, period, freqstr, month_kw): def time_get_start_end_field(self, size, side, period, freqstr, month_kw): get_start_end_field(self.i8data, self.attrname, freqstr, month_kw=month_kw) + + +from ..pandas_vb_common import setup # noqa: F401 isort:skip From 6c55f5e7f145ecb5ee1e44f91f8167e59b285050 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 11:55:32 -0700 Subject: [PATCH 25/66] Bump pypa/cibuildwheel from 2.18.0 to 2.18.1 (#58840) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.18.0 to 2.18.1. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.18.0...v2.18.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ab0201ca623aa..d7a98671c42bc 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -140,7 +140,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.18.0 + uses: pypa/cibuildwheel@v2.18.1 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 9ada6084409b8db42b7542142b7ae817c2a87246 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:55:20 +0530 Subject: [PATCH 26/66] DOC: fix PR07,SA01 for pandas.util.hash_array (#58877) --- ci/code_checks.sh | 1 - pandas/core/util/hashing.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b9b3ca24b4162..b4ae6976c6916 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -788,7 +788,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \ -i "pandas.unique PR07" \ - -i "pandas.util.hash_array PR07,SA01" \ -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 3b9dd40a92ce8..e120e69dc27cf 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -244,6 +244,7 @@ def hash_array( Parameters ---------- vals : ndarray or ExtensionArray + The input array to hash. encoding : str, default 'utf8' Encoding for data & key when strings. hash_key : str, default _default_hash_key @@ -257,6 +258,11 @@ def hash_array( ndarray[np.uint64, ndim=1] Hashed values, same length as the vals. 
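A small sketch of the behavior the Returns description above promises, i.e. a deterministic, same-length array of hashes (the concrete uint64 values depend on the hash key and are not meaningful on their own):

import numpy as np
import pandas as pd

vals = np.array([1, 2, 3])
h1 = pd.util.hash_array(vals)
h2 = pd.util.hash_array(vals)
assert len(h1) == len(vals)  # one hash per input value
assert (h1 == h2).all()  # deterministic for a fixed hash key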
+ See Also + -------- + util.hash_pandas_object : Return a data hash of the Index/Series/DataFrame. + util.hash_tuples : Hash a MultiIndex / listlike-of-tuples efficiently. + Examples -------- >>> pd.util.hash_array(np.array([1, 2, 3])) From a608dfa4942dd9c16bdaef2bf4bd21b8fa18ddd2 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:55:59 +0530 Subject: [PATCH 27/66] DOC: fix PR02 for pandas.tseries.offsets.YearEnd (#58878) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/offsets.pyx | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b4ae6976c6916..700b0bda1b916 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -779,7 +779,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.YearBegin.nanos GL08" \ -i "pandas.tseries.offsets.YearBegin.normalize GL08" \ -i "pandas.tseries.offsets.YearBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.YearEnd PR02" \ -i "pandas.tseries.offsets.YearEnd.freqstr SA01" \ -i "pandas.tseries.offsets.YearEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.YearEnd.month GL08" \ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 0d681e0c2aae6..f9d63065493c3 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -2579,7 +2579,7 @@ cdef class YearEnd(YearOffset): YearEnd goes to the next date which is the end of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. From 71d067f60c1bf7d9fdddf307ac6c525c11359633 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:56:33 +0530 Subject: [PATCH 28/66] DOC: fix SA01 for pandas.timedelta_range (#58879) --- ci/code_checks.sh | 1 - pandas/core/indexes/timedeltas.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 700b0bda1b916..80248b4fd5944 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -494,7 +494,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.testing.assert_index_equal PR07,SA01" \ -i "pandas.testing.assert_series_equal PR07,SA01" \ - -i "pandas.timedelta_range SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin PR02" \ -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 8af5a56f43c57..29039ffd0217e 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -273,6 +273,11 @@ def timedelta_range( TimedeltaIndex Fixed frequency, with day as the default. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + period_range : Return a fixed frequency PeriodIndex.
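As a quick usage sketch of the function documented here (output shown as produced by a nanosecond-unit build):

>>> pd.timedelta_range(start="1 day", periods=4)
TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq='D')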
+ Notes ----- Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, From 04fccd250f4ab84e2effda647154612076620cc3 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:57:18 +0530 Subject: [PATCH 29/66] DOC: fix PR07,SA01 for pandas.testing.assert_index_equal (#58880) --- ci/code_checks.sh | 1 - pandas/_testing/asserters.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 80248b4fd5944..6feff654a36f5 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -492,7 +492,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.show_versions SA01" \ -i "pandas.test SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ - -i "pandas.testing.assert_index_equal PR07,SA01" \ -i "pandas.testing.assert_series_equal PR07,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin PR02" \ diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 543d7944e4c5d..ecaa7e5507996 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -196,7 +196,9 @@ def assert_index_equal( Parameters ---------- left : Index + The first index to compare. right : Index + The second index to compare. exact : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. If 'equiv', then RangeIndex can be substituted for @@ -219,6 +221,11 @@ def assert_index_equal( Specify object name being compared, internally used to show appropriate assertion message. + See Also + -------- + testing.assert_series_equal : Check that two Series are equal. + testing.assert_frame_equal : Check that two DataFrames are equal. + Examples -------- >>> from pandas import testing as tm From 55f32cddd3280140378dd6c2301fbf8d76129014 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:57:54 +0530 Subject: [PATCH 30/66] DOC: fix SA01 for pandas.test (#58881) --- ci/code_checks.sh | 1 - pandas/util/_tester.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6feff654a36f5..d79a42a6fef6b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -490,7 +490,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.set_option SA01" \ -i "pandas.show_versions SA01" \ - -i "pandas.test SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.testing.assert_series_equal PR07,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 494f306ec807d..c0e9756372f47 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -27,6 +27,10 @@ def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> Non both doctests/regular tests, just append "--doctest-modules"/"--doctest-cython" to extra_args. + See Also + -------- + pytest.main : The main entry point for pytest testing framework. 
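Following the ``extra_args`` note above, a doctest-inclusive run could be requested like this (illustrative invocation only):

>>> pd.test(extra_args=["--doctest-modules"])  # doctest: +SKIP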
+ Examples -------- >>> pd.test() # doctest: +SKIP From 4a9ac5a94c1a460b27be7dea45e188ad1dc42bd5 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:58:28 +0530 Subject: [PATCH 31/66] DOC: fix SA01 for pandas.set_option (#58882) --- ci/code_checks.sh | 1 - pandas/_config/config.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d79a42a6fef6b..5f53705133816 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -488,7 +488,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.read_spss SA01" \ -i "pandas.reset_option SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.set_option SA01" \ -i "pandas.show_versions SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.testing.assert_series_equal PR07,SA01" \ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 1b91a7c3ee636..5bd6535b0343c 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -211,6 +211,14 @@ def set_option(*args) -> None: TypeError if keyword arguments are provided OptionError if no such option exists + See Also + -------- + get_option : Retrieve the value of the specified option. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + option_context : Context manager to temporarily set options in a ``with`` + statement. + Notes ----- For all available options, please view the :ref:`User Guide ` From 979d51e0e8d914bbb739b1d9d557f6b3dabf8c27 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:59:01 +0530 Subject: [PATCH 32/66] DOC: fix SA01 for pandas.read_sas (#58883) * DOC: fix SA01 for pandas.read_sas * DOC: fix SA01 for pandas.read_sas --- ci/code_checks.sh | 1 - pandas/io/sas/sasreader.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5f53705133816..ef079cac0414b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -484,7 +484,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.qcut PR07,SA01" \ -i "pandas.read_feather SA01" \ -i "pandas.read_orc SA01" \ - -i "pandas.read_sas SA01" \ -i "pandas.read_spss SA01" \ -i "pandas.reset_option SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 12d698a4f76a8..6daf4a24781bd 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -124,6 +124,14 @@ def read_sas( DataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader, file format is inferred from file extension. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
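Given the Returns description above, reading a large file in chunks might look roughly like this (file name hypothetical):

>>> with pd.read_sas("sas_data.sas7bdat", chunksize=1000) as reader:  # doctest: +SKIP
...     for chunk in reader:
...         print(chunk.shape)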
+ Examples -------- >>> df = pd.read_sas("sas_data.sas7bdat") # doctest: +SKIP From 84dab8937404e9f671e969338bf97e86b3948830 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:59:36 +0530 Subject: [PATCH 33/66] DOC: fix SA01 for pandas.read_feather (#58884) --- ci/code_checks.sh | 1 - pandas/io/feather_format.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ef079cac0414b..d5b4dd2f59bea 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -482,7 +482,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.plotting.table PR07,RT03,SA01" \ -i "pandas.qcut PR07,SA01" \ - -i "pandas.read_feather SA01" \ -i "pandas.read_orc SA01" \ -i "pandas.read_spss SA01" \ -i "pandas.reset_option SA01" \ diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index b42dbaa579ee7..8132167fbe05c 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -107,6 +107,14 @@ def read_feather( type of object stored in file DataFrame object stored in the file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_sas : Read SAS file into a pandas DataFrame. + Examples -------- >>> df = pd.read_feather("path/to/file.feather") # doctest: +SKIP From b0c4194c2fd63a84adc6e84eb77e2102e96d4b71 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 22:00:08 +0530 Subject: [PATCH 34/66] DOC: fix SA01 for pandas.plotting.plot_params (#58885) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d5b4dd2f59bea..230b4d3daf243 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -478,7 +478,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.autocorrelation_plot RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \ - -i "pandas.plotting.plot_params SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.plotting.table PR07,RT03,SA01" \ -i "pandas.qcut PR07,SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index af7ddf39283c0..d79bb7152e6b4 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -617,6 +617,14 @@ class _Options(dict): the same as the plot function parameters, but is stored in a canonical format that makes it easy to breakdown into groups later. + See Also + -------- + plotting.register_matplotlib_converters : Register pandas formatters and + converters with matplotlib. + plotting.bootstrap_plot : Bootstrap plot on mean, median and mid-range statistics. + plotting.autocorrelation_plot : Autocorrelation plot for time series. + plotting.lag_plot : Lag plot for time series. 
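These options can also be applied as a scoped override through the mapping's ``use`` helper, sketched here (assumes matplotlib is installed):

>>> import pandas as pd
>>> with pd.plotting.plot_params.use("x_compat", True):  # doctest: +SKIP
...     pd.Series(range(10)).plot()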
+ Examples -------- From 1b48cef8749296b043ff7100853361c2429bb08f Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:06:45 +0530 Subject: [PATCH 35/66] DOC: fix PR07 for pandas.unique (#58887) --- ci/code_checks.sh | 1 - pandas/core/algorithms.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 230b4d3daf243..e1181abda0010 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -779,7 +779,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.YearEnd.nanos GL08" \ -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \ - -i "pandas.unique PR07" \ -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b6d157eda8fd3..0d97f8a298fdb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -319,6 +319,8 @@ def unique(values): Parameters ---------- values : 1d array-like + The input array-like object containing values from which to extract + unique values. Returns ------- From 8643b2fb3faf8cbb4ebf47a35cc298744d0c991b Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:07:35 +0530 Subject: [PATCH 36/66] DOC: fix PR07,SA01 for pandas.testing.assert_series_equal (#58888) --- ci/code_checks.sh | 1 - pandas/_testing/asserters.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e1181abda0010..153433160f482 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -487,7 +487,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.show_versions SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ - -i "pandas.testing.assert_series_equal PR07,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin PR02" \ -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \ diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index ecaa7e5507996..430840711122a 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -857,7 +857,9 @@ def assert_series_equal( Parameters ---------- left : Series + First Series to compare. right : Series + Second Series to compare. check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' @@ -908,6 +910,11 @@ def assert_series_equal( .. versionadded:: 1.5.0 + See Also + -------- + testing.assert_index_equal : Check that two Indexes are equal. + testing.assert_frame_equal : Check that two DataFrames are equal. 
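A small sketch of the dtype-relaxed comparison these parameters allow:

import pandas as pd
from pandas import testing as tm

s1 = pd.Series([1, 2, 3])
s2 = pd.Series([1.0, 2.0, 3.0])
# Values match, so disabling the dtype check makes this pass; with the
# default check_dtype=True it would raise an AssertionError.
tm.assert_series_equal(s1, s2, check_dtype=False)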
+ Examples -------- >>> from pandas import testing as tm From 39cc859debb80c223ac68d311604c15f706ad255 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:08:10 +0530 Subject: [PATCH 37/66] DOC: fix SA01 for pandas.show_versions (#58889) --- ci/code_checks.sh | 1 - pandas/util/_print_versions.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 153433160f482..f3e84a6e01331 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -485,7 +485,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.read_spss SA01" \ -i "pandas.reset_option SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.show_versions SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin PR02" \ diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 6cdd96996cea6..c4fec39594407 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -115,6 +115,11 @@ def show_versions(as_json: str | bool = False) -> None: Info will be written to that file in JSON format. * If True, outputs info in JSON format to the console. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + Examples -------- >>> pd.show_versions() # doctest: +SKIP From 23e4c3a925c2323cf36b8c86bc4fa84b0cbdcb85 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:08:37 +0530 Subject: [PATCH 38/66] DOC: fix SA01 for pandas.reset_option (#58890) --- ci/code_checks.sh | 1 - pandas/_config/config.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f3e84a6e01331..22106c35d3fb3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -483,7 +483,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.qcut PR07,SA01" \ -i "pandas.read_orc SA01" \ -i "pandas.read_spss SA01" \ - -i "pandas.reset_option SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 5bd6535b0343c..2fb258cc3e874 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -323,6 +323,12 @@ def reset_option(pat: str) -> None: None No return value. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + describe_option : Print the description for one or more registered options. 
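A minimal round-trip showing how the options functions listed above interact:

import pandas as pd

pd.set_option("display.max_rows", 20)
assert pd.get_option("display.max_rows") == 20
pd.reset_option("display.max_rows")  # restores the default of 60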
+ Notes ----- For all available options, please view the From bef7c39f4214557bf02705ee6b41b88f4cdeed93 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:09:17 +0530 Subject: [PATCH 39/66] DOC: fix PR07,RT03,SA01 for pandas.plotting.table (#58892) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 22106c35d3fb3..c4666bbd3a5fa 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -479,7 +479,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ - -i "pandas.plotting.table PR07,RT03,SA01" \ -i "pandas.qcut PR07,SA01" \ -i "pandas.read_orc SA01" \ -i "pandas.read_spss SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index d79bb7152e6b4..8d36a8767bc66 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -33,6 +33,7 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: Parameters ---------- ax : Matplotlib axes object + The axes on which to draw the table. data : DataFrame or Series Data for table contents. **kwargs @@ -43,6 +44,12 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: Returns ------- matplotlib table object + The created table as a matplotlib Table object. + + See Also + -------- + DataFrame.plot : Make plots of DataFrame using matplotlib. + matplotlib.pyplot.table : Create a table from data in a Matplotlib plot. Examples -------- From 2e93c6336857c12c188cbb7386100593eb885efd Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:09:44 +0530 Subject: [PATCH 40/66] DOC: fix SA01 for pandas.read_orc (#58891) --- ci/code_checks.sh | 1 - pandas/io/orc.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c4666bbd3a5fa..d11287c033675 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -480,7 +480,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.qcut PR07,SA01" \ - -i "pandas.read_orc SA01" \ -i "pandas.read_spss SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ diff --git a/pandas/io/orc.py b/pandas/io/orc.py index d4b4fd90658ad..3bca8ea7ef1df 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -85,6 +85,14 @@ def read_orc( DataFrame DataFrame based on the ORC file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_sas : Load a SAS file into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
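A short round-trip sketch for ``read_orc``, assuming pyarrow is installed; the file name is illustrative:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
df.to_orc("example.orc")  # doctest: +SKIP
restored = pd.read_orc("example.orc")  # doctest: +SKIP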
+ Notes ----- Before using this function you should read the :ref:`user guide about ORC ` From 8c2391192e0af276a37a0c4a869eb24de0c134ff Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:10:15 +0530 Subject: [PATCH 41/66] DOC: fix PR07,RT03,SA01 for pandas.plotting.parallel_coordinates (#58894) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d11287c033675..ea0fc1bb38427 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -477,7 +477,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.autocorrelation_plot RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ - -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.qcut PR07,SA01" \ -i "pandas.read_spss SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 8d36a8767bc66..7b498c7c94178 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -479,6 +479,7 @@ def parallel_coordinates( Parameters ---------- frame : DataFrame + The DataFrame to be plotted. class_column : str Column name containing class names. cols : list, optional @@ -505,6 +506,13 @@ def parallel_coordinates( Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the parallel coordinates plot. + + See Also + -------- + plotting.andrews_curves : Generate a matplotlib plot for visualizing clusters + of multivariate data. + plotting.radviz : Plot a multidimensional dataset in 2D. Examples -------- From e00e142235d943d5c71aa6fad2e0f91b27354035 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:10:48 +0530 Subject: [PATCH 42/66] DOC: fix RT03,SA01 for pandas.plotting.autocorrelation_plot (#58895) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ea0fc1bb38427..5300a6c03b87f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -475,7 +475,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.pivot PR07" \ -i "pandas.pivot_table PR07" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ - -i "pandas.plotting.autocorrelation_plot RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.qcut PR07,SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 7b498c7c94178..d8455f44ef0d1 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -606,6 +606,12 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the autocorrelation plot. + + See Also + -------- + Series.autocorr : Compute the lag-N autocorrelation for a Series. + plotting.lag_plot : Lag plot for time series. 
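A hedged sketch of the kind of series an autocorrelation plot is useful for (assumes matplotlib is installed):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000)
# A noisy sinusoid shows strong autocorrelation at lags near its period.
ser = pd.Series(0.7 * rng.standard_normal(1000) + np.sin(spacing))
ax = pd.plotting.autocorrelation_plot(ser)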
Examples -------- From 93d9561c038daba40fad9c15b5639bc17ee4a1e6 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:11:49 +0530 Subject: [PATCH 43/66] DOC: fix SA01 for pandas.option_context (#58897) --- ci/code_checks.sh | 1 - pandas/_config/config.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5300a6c03b87f..1e615b6df8446 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -470,7 +470,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.merge PR07" \ -i "pandas.merge_asof PR07,RT03" \ -i "pandas.merge_ordered PR07" \ - -i "pandas.option_context SA01" \ -i "pandas.period_range RT03,SA01" \ -i "pandas.pivot PR07" \ -i "pandas.pivot_table PR07" \ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 2fb258cc3e874..55d9e29686259 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -420,6 +420,13 @@ def option_context(*args) -> Generator[None, None, None]: None No return value. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + Notes ----- For all available options, please view the :ref:`User Guide ` From f49b286615a3a67d8bf41b007b18ce6e10f8d0de Mon Sep 17 00:00:00 2001 From: Cristina Yenyxe Gonzalez Garcia Date: Mon, 3 Jun 2024 20:06:41 +0200 Subject: [PATCH 44/66] Removed duplicated description of cumsum and cumprod methods (#58902) --- doc/source/user_guide/missing_data.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 29f3fea899336..69dfb406daa43 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -337,10 +337,8 @@ When taking the product, NA values or empty data will be treated as 1. pd.Series([], dtype="float64").prod() Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` -ignore NA values by default preserve them in the result. This behavior can be changed -with ``skipna`` - -* Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` ignore NA values by default, but preserve them in the resulting arrays. To override this behaviour and include NA values, use ``skipna=False``. +ignore NA values by default, but preserve them in the resulting array. To override +this behaviour and include NA values in the calculation, use ``skipna=False``. .. 
ipython:: python

From ed4b8676f46bb259f8be2a6ba6466602002e0e4a Mon Sep 17 00:00:00 2001
From: Rob <124158982+rob-sil@users.noreply.github.com>
Date: Mon, 3 Jun 2024 11:18:13 -0700
Subject: [PATCH 45/66] BUG: Check `min_periods` before applying the function (#58886)

* Check min_periods before calling the function

* Update whatsnew

---
 doc/source/whatsnew/v3.0.0.rst    |  2 +-
 pandas/core/window/numba_.py      |  4 ++--
 pandas/tests/window/test_numba.py | 15 +++++++++++++++
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 865996bdf8892..872c5b64892e1 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -536,7 +536,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
 - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
 - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
-
+- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_periods`` periods if ``method="table"``. (:issue:`58868`)

 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py
index eb06479fc325e..824cf936b8185 100644
--- a/pandas/core/window/numba_.py
+++ b/pandas/core/window/numba_.py
@@ -227,10 +227,10 @@ def roll_table(
             stop = end[i]
             window = values[start:stop]
             count_nan = np.sum(np.isnan(window), axis=0)
-            sub_result = numba_func(window, *args)
             nan_mask = len(window) - count_nan >= minimum_periods
+            if nan_mask.any():
+                result[i, :] = numba_func(window, *args)
             min_periods_mask[i, :] = nan_mask
-            result[i, :] = sub_result
         result = np.where(min_periods_mask, result, np.nan)
         return result

diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
index 650eb911e410b..3695ab8bf6cd3 100644
--- a/pandas/tests/window/test_numba.py
+++ b/pandas/tests/window/test_numba.py
@@ -67,6 +67,21 @@ def f(x, *args):
         )
         tm.assert_series_equal(result, expected)

+    def test_numba_min_periods(self):
+        # GH 58868
+        def last_row(x):
+            assert len(x) == 3
+            return x[-1]
+
+        df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]])
+
+        result = df.rolling(3, method="table", min_periods=3).apply(
+            last_row, raw=True, engine="numba"
+        )
+
+        expected = DataFrame([[np.nan, np.nan], [np.nan, np.nan], [5, 6], [7, 8]])
+        tm.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize(
         "data",
         [

From a5492ee9ab1459185b0792f3bea6b7acd36d3112 Mon Sep 17 00:00:00 2001
From: cmjcharlton <90400333+cmjcharlton@users.noreply.github.com>
Date: Mon, 3 Jun 2024 19:33:23 +0100
Subject: [PATCH 46/66] ENH: Restore support for reading Stata 104 format dta files, and add support for 103 (#58555)

---
 doc/source/whatsnew/v3.0.0.rst                |   2 +-
 pandas/io/stata.py                            |   7 ++---
 .../tests/io/data/stata/stata-compat-103.dta  | Bin 0 -> 650 bytes
 .../tests/io/data/stata/stata-compat-104.dta  | Bin 0 -> 647 bytes
 .../io/data/stata/stata-compat-be-103.dta     | Bin 0 -> 650 bytes
 .../io/data/stata/stata-compat-be-104.dta     | Bin 0 -> 647 bytes
 pandas/tests/io/data/stata/stata4_103.dta     | Bin 0 -> 780 bytes
 pandas/tests/io/data/stata/stata4_104.dta     | Bin 0
-> 770 bytes pandas/tests/io/test_stata.py | 25 +++++++++++++++++- 9 files changed, 29 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata-compat-103.dta create mode 100644 pandas/tests/io/data/stata/stata-compat-104.dta create mode 100644 pandas/tests/io/data/stata/stata-compat-be-103.dta create mode 100644 pandas/tests/io/data/stata/stata-compat-be-104.dta create mode 100644 pandas/tests/io/data/stata/stata4_103.dta create mode 100644 pandas/tests/io/data/stata/stata4_104.dta diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 872c5b64892e1..e5d65ad82cc95 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -44,8 +44,8 @@ Other enhancements - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) -- .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b87ec94b85bb0..4e7bd160a5a52 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -91,7 +91,7 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " + "versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," "and 119 (Stata 15/16, over 32,767 variables)." 
)
@@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int:

     def _read_old_header(self, first_char: bytes) -> None:
         self._format_version = int(first_char[0])
-        if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]:
+        if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]:
             raise ValueError(_version_error.format(version=self._format_version))
         self._set_encoding()
         self._byteorder = ">" if self._read_int8() == 0x1 else "<"
@@ -1405,7 +1405,8 @@ def _read_old_header(self, first_char: bytes) -> None:

         self._data_label = self._get_data_label()

-        self._time_stamp = self._get_time_stamp()
+        if self._format_version >= 105:
+            self._time_stamp = self._get_time_stamp()

         # descriptors
         if self._format_version >= 111:
diff --git a/pandas/tests/io/data/stata/stata-compat-103.dta b/pandas/tests/io/data/stata/stata-compat-103.dta
new file mode 100644
index 0000000000000000000000000000000000000000..adfeb6c672333b3223c6b4afebda5598baea7b5b
GIT binary patch
[base85 payload omitted]

diff --git a/pandas/tests/io/data/stata/stata-compat-be-103.dta b/pandas/tests/io/data/stata/stata-compat-be-103.dta
new file mode 100644
index 0000000000000000000000000000000000000000..0e2ef231f91c00f79e5e8c274e139f325fe4e6f2
GIT binary patch
[base85 payload omitted; the binary sections for the other new fixtures
(stata-compat-104.dta, stata-compat-be-104.dta, stata4_103.dta,
stata4_104.dta) were garbled in extraction and are omitted as well]

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index d5134a3e3afd0..a27448a342a19 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath):
         # stata doesn't save .category metadata
         tm.assert_frame_equal(parsed, expected)

-    @pytest.mark.parametrize("version", [105, 108])
+    @pytest.mark.parametrize("version", [103, 104, 105, 108])
     def test_readold_dta4(self, version, datapath):
         # This test is the same as test_read_dta4 above except that the columns
         # had to be renamed to match the restrictions in older file format
@@ -2011,6 +2011,18 @@ def test_backward_compat(version, datapath):
     tm.assert_frame_equal(old_dta, expected, check_dtype=False)


+@pytest.mark.parametrize("version", [103, 104])
+def test_backward_compat_nodateconversion(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    data_base = datapath("io", "data", "stata")
+    ref = os.path.join(data_base, "stata-compat-118.dta")
+    old = os.path.join(data_base, f"stata-compat-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    old_dta = read_stata(old, convert_dates=False)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
 @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
 def test_bigendian(version, datapath):
     ref = datapath("io",
"data", "stata", f"stata-compat-{version}.dta") @@ -2020,6 +2032,17 @@ def test_bigendian(version, datapath): tm.assert_frame_equal(big_dta, expected) +@pytest.mark.parametrize("version", [103, 104]) +def test_bigendian_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta") + expected = read_stata(ref, convert_dates=False) + big_dta = read_stata(big, convert_dates=False) + tm.assert_frame_equal(big_dta, expected) + + def test_direct_read(datapath, monkeypatch): file_path = datapath("io", "data", "stata", "stata-compat-118.dta") From 9f71476fc8f7ace8ee7c24dca3c5ac8e279d7c61 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Jun 2024 08:43:03 -1000 Subject: [PATCH 47/66] CLN: Stopped object inference in constructors for pandas objects (#58758) * CLN: Stopped object inference in constructors for pandas objects * Adjust tests --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_testing/__init__.py | 13 +++-------- pandas/core/construction.py | 4 ++-- pandas/core/frame.py | 16 -------------- pandas/core/indexes/base.py | 22 +++++-------------- pandas/core/internals/construction.py | 8 +++---- pandas/core/series.py | 16 -------------- pandas/tests/copy_view/test_constructors.py | 10 ++++----- pandas/tests/frame/test_constructors.py | 17 +++++--------- .../indexes/base_class/test_constructors.py | 16 +++++--------- pandas/tests/indexes/test_base.py | 17 ++++++-------- .../series/accessors/test_dt_accessor.py | 5 ++--- pandas/tests/series/methods/test_equals.py | 6 ++--- pandas/tests/series/test_constructors.py | 21 ++++++------------ .../tseries/frequencies/test_inference.py | 12 ---------- 15 files changed, 48 insertions(+), 136 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e5d65ad82cc95..2707adb06a1d6 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -311,6 +311,7 @@ Removal of prior version deprecations/changes - Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep=r"\s+"`` instead (:issue:`55569`) - Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`) - Stopped automatically casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) +- Stopped performing dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when given a pandas object (:class:`Series`, :class:`Index`, :class:`ExtensionArray`), call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`) - Stopped performing dtype inference when setting a :class:`Index` into a :class:`DataFrame` (:issue:`56102`) - Stopped performing dtype inference with in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`) - Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 12395b42bba19..a757ef6fc1a29 100644 --- a/pandas/_testing/__init__.py 
+++ b/pandas/_testing/__init__.py @@ -10,7 +10,6 @@ ContextManager, cast, ) -import warnings import numpy as np @@ -290,17 +289,11 @@ def box_expected(expected, box_cls, transpose: bool = True): else: expected = pd.array(expected, copy=False) elif box_cls is Index: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Index(expected) + expected = Index(expected) elif box_cls is Series: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected) + expected = Series(expected) elif box_cls is DataFrame: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected).to_frame() + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f01d8822241c9..360e1d5ddd3ff 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -554,7 +554,7 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype - data_was_index = isinstance(data, ABCIndex) + infer_object = not isinstance(data, (ABCIndex, ABCSeries)) # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -607,7 +607,7 @@ def sanitize_array( if dtype is None: subarr = data - if data.dtype == object and not data_was_index: + if data.dtype == object and infer_object: subarr = maybe_infer_to_datetimelike(data) elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97a4e414608b8..703fece35b23a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -728,10 +728,6 @@ def __init__( NDFrame.__init__(self, data) return - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - # GH47215 if isinstance(index, set): raise ValueError("index cannot be a set") @@ -896,18 +892,6 @@ def __init__( NDFrame.__init__(self, mgr) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtypes.iloc[0] != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The DataFrame " - "constructor will keep the original dtype in the future. 
" - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - # ---------------------------------------------------------------------- def __dataframe__( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 56030a15dc143..15c318e5e9caf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -490,8 +490,6 @@ def __new__( if not copy and isinstance(data, (ABCSeries, Index)): refs = data._references - is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray)) - # range if isinstance(data, (range, RangeIndex)): result = RangeIndex(start=data, copy=copy, name=name) @@ -508,7 +506,7 @@ def __new__( elif is_ea_or_datetimelike_dtype(data_dtype): pass - elif isinstance(data, (np.ndarray, Index, ABCSeries)): + elif isinstance(data, (np.ndarray, ABCMultiIndex)): if isinstance(data, ABCMultiIndex): data = data._values @@ -518,7 +516,9 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - + elif isinstance(data, (ABCSeries, Index)): + # GH 56244: Avoid potential inference on object types + pass elif is_scalar(data): raise cls._raise_scalar_data_error(data) elif hasattr(data, "__array__"): @@ -571,19 +571,7 @@ def __new__( klass = cls._dtype_to_subclass(arr.dtype) arr = klass._ensure_array(arr, arr.dtype, copy=False) - result = klass._simple_new(arr, name, refs=refs) - if dtype is None and is_pandas_object and data_dtype == np.object_: - if result.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Index " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - return result # type: ignore[return-value] + return klass._simple_new(arr, name, refs=refs) @classmethod def _ensure_array(cls, data, dtype, copy: bool): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index cea52bf8c91b2..23572975a1112 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -192,6 +192,7 @@ def ndarray_to_mgr( ) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray + infer_object = not isinstance(values, (ABCSeries, Index, ExtensionArray)) if isinstance(values, ABCSeries): if columns is None: @@ -287,15 +288,14 @@ def ndarray_to_mgr( # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(values.dtype): + if dtype is None and infer_object and is_object_dtype(values.dtype): obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): - dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime] block_values = [ - new_block_2d(dvals_list[n], placement=BlockPlacement(n)) - for n in range(len(dvals_list)) + new_block_2d(ensure_block_shape(dval, 2), placement=BlockPlacement(n)) + for n, dval in enumerate(maybe_datetime) ] else: bp = BlockPlacement(slice(len(columns))) diff --git a/pandas/core/series.py b/pandas/core/series.py index f67c0753fa9df..bfaba866c3dfd 100644 --- a/pandas/core/series.py +++ 
b/pandas/core/series.py @@ -389,10 +389,6 @@ def __init__( self.name = name return - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - if isinstance(data, (ExtensionArray, np.ndarray)): if copy is not False: if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): @@ -438,7 +434,6 @@ def __init__( data = data.astype(dtype) refs = data._references - data = data._values copy = False elif isinstance(data, np.ndarray): @@ -512,17 +507,6 @@ def __init__( self.name = name self._set_axis(0, index) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Series " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old behavior.", - FutureWarning, - stacklevel=find_stack_level(), - ) - def _init_dict( self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None ): diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index bc931b53b37d0..eb5177e393936 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -228,12 +228,12 @@ def test_dataframe_from_series_or_index_different_dtype(index_or_series): assert df._mgr._has_no_reference(0) -def test_dataframe_from_series_infer_datetime(): +def test_dataframe_from_series_dont_infer_datetime(): ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - df = DataFrame(ser) - assert not np.shares_memory(get_array(ser), get_array(df, 0)) - assert df._mgr._has_no_reference(0) + df = DataFrame(ser) + assert df.dtypes.iloc[0] == np.dtype(object) + assert np.shares_memory(get_array(ser), get_array(df, 0)) + assert not df._mgr._has_no_reference(0) @pytest.mark.parametrize("index", [None, [0, 1, 2]]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index cbd969e5d90bf..5032932256488 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2702,21 +2702,14 @@ def test_frame_string_inference_block_dim(self): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - idx = Index([Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = DataFrame(idx, columns=["a"]) - assert result.dtypes.iloc[0] != np.object_ - result = DataFrame({"a": idx}) + obj = klass([Timestamp("2019-12-31")], dtype=object) + result = DataFrame(obj, columns=["a"]) assert result.dtypes.iloc[0] == np.object_ - ser = Series([Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = DataFrame(ser, columns=["a"]) - assert result.dtypes.iloc[0] != np.object_ - result = DataFrame({"a": ser}) + result = DataFrame({"a": obj}) assert result.dtypes.iloc[0] == np.object_ def test_dict_keys_returns_rangeindex(self): diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 
e5956f808286d..6036eddce7a01 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -59,18 +59,12 @@ def test_index_string_inference(self): ser = Index(["a", 1]) tm.assert_index_equal(ser, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - idx = Index([pd.Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(idx) - assert result.dtype != np.object_ - - ser = Series([pd.Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(ser) - assert result.dtype != np.object_ + obj = klass([pd.Timestamp("2019-12-31")], dtype=object) + result = Index(obj) + assert result.dtype == np.object_ def test_constructor_not_read_only(self): # GH#57130 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index bd38e6c2ff333..e701a49ea93ad 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -104,16 +104,9 @@ def test_constructor_copy(self, using_infer_string): ) def test_constructor_from_index_dtlike(self, cast_as_obj, index): if cast_as_obj: - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(index.astype(object)) - else: - result = Index(index) - - tm.assert_index_equal(result, index) - - if isinstance(index, DatetimeIndex): - assert result.tz == index.tz - if cast_as_obj: + result = Index(index.astype(object)) + assert result.dtype == np.dtype(object) + if isinstance(index, DatetimeIndex): # GH#23524 check that Index(dti, dtype=object) does not # incorrectly raise ValueError, and that nanoseconds are not # dropped @@ -121,6 +114,10 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index): result = Index(index, dtype=object) assert result.dtype == np.object_ assert list(result) == list(index) + else: + result = Index(index) + + tm.assert_index_equal(result, index) @pytest.mark.parametrize( "index,has_tz", diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 8c60f7beb317d..49ae0a60e6608 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -256,9 +256,8 @@ def test_dt_accessor_limited_display_api(self): tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # Period - idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - ser = Series(idx) + idx = period_range("20130101", periods=5, freq="D", name="xxx") + ser = Series(idx) results = get_dir(ser) tm.assert_almost_equal( results, sorted(set(ok_for_period + ok_for_period_methods)) diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 875ffdd3fe851..b94723b7cbddf 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -82,15 +82,13 @@ def test_equals_matching_nas(): left = Series([np.datetime64("NaT")], dtype=object) right = Series([np.datetime64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert 
Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.timedelta64("NaT")], dtype=object) right = Series([np.timedelta64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.float64("NaN")], dtype=object) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 00c614cf72c20..44a7862c21273 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1318,9 +1318,8 @@ def test_constructor_periodindex(self): pi = period_range("20130101", periods=5, freq="D") s = Series(pi) assert s.dtype == "Period[D]" - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - expected = Series(pi.astype(object)) - tm.assert_series_equal(s, expected) + expected = Series(pi.astype(object)) + assert expected.dtype == object def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} @@ -2137,20 +2136,14 @@ def test_series_string_inference_na_first(self): result = Series([pd.NA, "b"]) tm.assert_series_equal(result, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - ser = Series([Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(None): - # This doesn't do inference - result = Series(ser) + obj = klass([Timestamp("2019-12-31")], dtype=object) + # This doesn't do inference + result = Series(obj) assert result.dtype == np.object_ - idx = Index([Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Series(idx) - assert result.dtype != np.object_ - class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index edfc1973a2bd9..dad5c73b89626 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -23,7 +23,6 @@ date_range, period_range, ) -import pandas._testing as tm from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -202,17 +201,6 @@ def test_infer_freq_custom(base_delta_code_pair, constructor): assert frequencies.infer_freq(index) is None -@pytest.mark.parametrize( - "freq,expected", [("Q", "QE-DEC"), ("Q-NOV", "QE-NOV"), ("Q-OCT", "QE-OCT")] -) -def test_infer_freq_index(freq, expected): - rng = period_range("1959Q2", "2009Q3", freq=freq) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - rng = Index(rng.to_timestamp("D", how="e").astype(object)) - - assert rng.inferred_freq == expected - - @pytest.mark.parametrize( "expected,dates", list( From 199bf2084a6e755e15fdf59ea97341c13ed10f69 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Jun 2024 08:44:57 -1000 Subject: [PATCH 48/66] REF: Remove BlockManager.arrays in favor of BlockManager.blocks usage (#58804) * REF: Remove BlockManager.arrays in favor of BlockManager.blocks usage * Add back arrays * Whitespace --- pandas/_testing/__init__.py | 4 +-- pandas/core/frame.py | 17 ++++----- pandas/core/generic.py | 6 ++-- pandas/core/indexing.py | 4 +-- 
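A hedged illustration of the constructor change from the previous patch: object-dtype pandas inputs now keep object dtype, and calling ``infer_objects`` opts back in to the old inference:

import numpy as np
import pandas as pd

idx = pd.Index([pd.Timestamp("2019-12-31")], dtype=object)
assert pd.Series(idx).dtype == np.object_  # dtype is preserved as-is now
assert pd.Series(idx).infer_objects().dtype.kind == "M"  # datetime64 again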
pandas/core/internals/managers.py | 4 ++- pandas/tests/apply/test_str.py | 2 +- pandas/tests/extension/base/casting.py | 5 +-- pandas/tests/extension/base/constructors.py | 4 +-- pandas/tests/extension/base/getitem.py | 2 +- pandas/tests/extension/base/reshaping.py | 2 +- pandas/tests/frame/indexing/test_setitem.py | 4 +-- pandas/tests/frame/methods/test_cov_corr.py | 2 +- pandas/tests/frame/methods/test_fillna.py | 2 +- pandas/tests/frame/methods/test_shift.py | 6 ++-- pandas/tests/frame/methods/test_values.py | 2 +- pandas/tests/frame/test_constructors.py | 36 +++++++++---------- pandas/tests/groupby/aggregate/test_cython.py | 2 +- pandas/tests/indexing/test_iloc.py | 2 +- pandas/tests/reshape/concat/test_concat.py | 25 +++++++------ pandas/tests/reshape/merge/test_merge.py | 4 +-- pandas/tests/series/indexing/test_setitem.py | 4 +-- 21 files changed, 74 insertions(+), 65 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index a757ef6fc1a29..d35242ada21e9 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -531,8 +531,8 @@ def shares_memory(left, right) -> bool: left._mask, right._mask ) - if isinstance(left, DataFrame) and len(left._mgr.arrays) == 1: - arr = left._mgr.arrays[0] + if isinstance(left, DataFrame) and len(left._mgr.blocks) == 1: + arr = left._mgr.blocks[0].values return shares_memory(arr, right) raise NotImplementedError(type(left), type(right)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 703fece35b23a..c37dfa225de5a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1046,7 +1046,7 @@ def _is_homogeneous_type(self) -> bool: False """ # The "<" part of "<=" here is for empty DataFrame cases - return len({arr.dtype for arr in self._mgr.arrays}) <= 1 + return len({block.values.dtype for block in self._mgr.blocks}) <= 1 @property def _can_fast_transpose(self) -> bool: @@ -5726,7 +5726,6 @@ def shift( periods = cast(int, periods) ncols = len(self.columns) - arrays = self._mgr.arrays if axis == 1 and periods != 0 and ncols > 0 and freq is None: if fill_value is lib.no_default: # We will infer fill_value to match the closest column @@ -5752,12 +5751,12 @@ def shift( result.columns = self.columns.copy() return result - elif len(arrays) > 1 or ( + elif len(self._mgr.blocks) > 1 or ( # If we only have one block and we know that we can't # keep the same dtype (i.e. the _can_hold_element check) # then we can go through the reindex_indexer path # (and avoid casting logic in the Block method). 
- not can_hold_element(arrays[0], fill_value) + not can_hold_element(self._mgr.blocks[0].values, fill_value) ): # GH#35488 we need to watch out for multi-block cases # We only get here with fill_value not-lib.no_default @@ -11453,7 +11452,7 @@ def _get_data() -> DataFrame: if numeric_only: df = _get_data() if axis is None: - dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + dtype = find_common_type([block.values.dtype for block in df._mgr.blocks]) if isinstance(dtype, ExtensionDtype): df = df.astype(dtype) arr = concat_compat(list(df._iter_column_arrays())) @@ -11478,7 +11477,9 @@ def _get_data() -> DataFrame: # kurtosis excluded since groupby does not implement it if df.shape[1] and name != "kurt": - dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + dtype = find_common_type( + [block.values.dtype for block in df._mgr.blocks] + ) if isinstance(dtype, ExtensionDtype): # GH 54341: fastpath for EA-backed axis=1 reductions # This flattens the frame into a single 1D array while keeping @@ -11552,8 +11553,8 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: else: raise NotImplementedError(name) - for arr in self._mgr.arrays: - middle = func(arr, axis=0, skipna=skipna) + for blocks in self._mgr.blocks: + middle = func(blocks.values, axis=0, skipna=skipna) result = ufunc(result, middle) res_ser = self._constructor_sliced(result, index=self.index, copy=False) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 22eecdc95934f..80314c2648f45 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6373,7 +6373,7 @@ def astype( # TODO(EA2D): special case not needed with 2D EAs dtype = pandas_dtype(dtype) if isinstance(dtype, ExtensionDtype) and all( - arr.dtype == dtype for arr in self._mgr.arrays + block.values.dtype == dtype for block in self._mgr.blocks ): return self.copy(deep=False) # GH 18099/22869: columnwise conversion to extension dtype @@ -11148,9 +11148,9 @@ def _logical_func( if ( self.ndim > 1 and axis == 1 - and len(self._mgr.arrays) > 1 + and len(self._mgr.blocks) > 1 # TODO(EA2D): special-case not needed - and all(x.ndim == 2 for x in self._mgr.arrays) + and all(block.values.ndim == 2 for block in self._mgr.blocks) and not kwargs ): # Fastpath avoiding potentially expensive transpose diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e9bd3b389dd75..9140b1dbe9b33 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1804,10 +1804,10 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc") -> None: # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value - if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1: + if not take_split_path and len(self.obj._mgr.blocks) and self.ndim > 1: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value - arr = self.obj._mgr.arrays[0] + arr = self.obj._mgr.blocks[0].values take_split_path = not can_hold_element( arr, extract_array(val, extract_numpy=True) ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7c1bcbec1d3f2..82b88d090f847 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -353,6 +353,8 @@ def arrays(self) -> list[ArrayLike]: Warning! The returned arrays don't handle Copy-on-Write, so this should be used with caution (only in read-mode). 
""" + # TODO: Deprecate, usage in Dask + # https://github.com/dask/dask/blob/484fc3f1136827308db133cd256ba74df7a38d8c/dask/base.py#L1312 return [blk.values for blk in self.blocks] def __repr__(self) -> str: @@ -2068,7 +2070,7 @@ def array(self) -> ArrayLike: """ Quick access to the backing array of the Block. """ - return self.arrays[0] + return self.blocks[0].values # error: Cannot override writeable attribute with read-only property @property diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index e0c5e337fb746..e224b07a1097b 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -287,7 +287,7 @@ def test_transform_groupby_kernel_frame(request, float_frame, op): # same thing, but ensuring we have multiple blocks assert "E" not in float_frame.columns float_frame["E"] = float_frame["A"].copy() - assert len(float_frame._mgr.arrays) > 1 + assert len(float_frame._mgr.blocks) > 1 ones = np.ones(float_frame.shape[0]) gb2 = float_frame.groupby(ones) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 2bfe801c48a77..e924e38ee5030 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -30,8 +30,9 @@ def test_astype_object_frame(self, all_data): blk = result._mgr.blocks[0] assert isinstance(blk, NumpyBlock), type(blk) assert blk.is_object - assert isinstance(result._mgr.arrays[0], np.ndarray) - assert result._mgr.arrays[0].dtype == np.dtype(object) + arr = result._mgr.blocks[0].values + assert isinstance(arr, np.ndarray) + assert arr.dtype == np.dtype(object) # check that we can compare the dtypes comp = result.dtypes == df.dtypes diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index c32a6a6a115ac..639dc874c9fb9 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -69,7 +69,7 @@ def test_dataframe_constructor_from_dict(self, data, from_series): assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) def test_dataframe_from_series(self, data): result = pd.DataFrame(pd.Series(data)) @@ -77,7 +77,7 @@ def test_dataframe_from_series(self, data): assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) def test_series_given_mismatched_index_raises(self, data): msg = r"Length of values \(3\) does not match length of index \(5\)" diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 935edce32a0ab..3fa2f50bf4930 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -450,7 +450,7 @@ def test_loc_len1(self, data): df = pd.DataFrame({"A": data}) res = df.loc[[0], "A"] assert res.ndim == 1 - assert res._mgr.arrays[0].ndim == 1 + assert res._mgr.blocks[0].ndim == 1 if hasattr(res._mgr, "blocks"): assert res._mgr._block.ndim == 1 diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 489cd15644d04..24be94443c5ba 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -29,7 +29,7 @@ def 
test_concat(self, data, in_frame): assert dtype == data.dtype if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) @pytest.mark.parametrize("in_frame", [True, False]) def test_concat_all_na_block(self, data_missing, in_frame): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index fe477f88c81ff..15cdc6566b570 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -340,8 +340,8 @@ def test_setitem_dt64tz(self, timezone_frame): # assert that A & C are not sharing the same base (e.g. they # are copies) # Note: This does not hold with Copy on Write (because of lazy copying) - v1 = df._mgr.arrays[1] - v2 = df._mgr.arrays[2] + v1 = df._mgr.blocks[1].values + v2 = df._mgr.blocks[2].values tm.assert_extension_array_equal(v1, v2) v1base = v1._ndarray.base v2base = v2._ndarray.base diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 4151a1d27d06a..aeaf80f285f9d 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -214,7 +214,7 @@ def test_corr_item_cache(self): df["B"] = range(10)[::-1] ser = df["A"] # populate item_cache - assert len(df._mgr.arrays) == 2 # i.e. 2 blocks + assert len(df._mgr.blocks) == 2 _ = df.corr(numeric_only=True) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 2ef7780e9a6d5..1b852343266aa 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -47,7 +47,7 @@ def test_fillna_on_column_view(self): assert np.isnan(arr[:, 0]).all() # i.e. 
we didn't create a new 49-column block - assert len(df._mgr.arrays) == 1 + assert len(df._mgr.blocks) == 1 assert np.shares_memory(df.values, arr) def test_fillna_datetime(self, datetime_frame): diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 72c1a123eac98..4e490e9e344ba 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -320,7 +320,7 @@ def test_shift_categorical1(self, frame_or_series): def get_cat_values(ndframe): # For Series we could just do ._values; for DataFrame # we may be able to do this if we ever have 2D Categoricals - return ndframe._mgr.arrays[0] + return ndframe._mgr.blocks[0].values cat = get_cat_values(obj) @@ -560,7 +560,7 @@ def test_shift_dt64values_int_fill_deprecated(self): # same thing but not consolidated; pre-2.0 we got different behavior df3 = DataFrame({"A": ser}) df3["B"] = ser - assert len(df3._mgr.arrays) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(1, axis=1, fill_value=0) tm.assert_frame_equal(result, expected) @@ -621,7 +621,7 @@ def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat): # same thing but not consolidated df3 = DataFrame({"A": ser}) df3["B"] = ser - assert len(df3._mgr.arrays) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(-1, axis=1, fill_value="foo") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index dfece3fc7552b..2de2053bb705f 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -256,7 +256,7 @@ def test_private_values_dt64_multiblock(self): df = DataFrame({"A": dta[:4]}, copy=False) df["B"] = dta[4:] - assert len(df._mgr.arrays) == 2 + assert len(df._mgr.blocks) == 2 result = df._values expected = dta.reshape(2, 4).T diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 5032932256488..da0504458cf5d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -180,24 +180,24 @@ def test_datetimelike_values_with_object_dtype(self, kind, frame_or_series): arr = arr[:, 0] obj = frame_or_series(arr, dtype=object) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) # go through a different path in internals.construction obj = frame_or_series(frame_or_series(arr), dtype=object) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) obj = frame_or_series(frame_or_series(arr), dtype=NumpyEADtype(object)) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) if frame_or_series is DataFrame: # other paths through internals.construction sers = [Series(x) for x in arr] obj = frame_or_series(sers, dtype=object) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) def 
test_series_with_name_not_matching_column(self): # GH#9232 @@ -297,7 +297,7 @@ def test_constructor_dtype_nocast_view_dataframe(self): def test_constructor_dtype_nocast_view_2d_array(self): df = DataFrame([[1, 2], [3, 4]], dtype="int64") df2 = DataFrame(df.values, dtype=df[0].dtype) - assert df2._mgr.arrays[0].flags.c_contiguous + assert df2._mgr.blocks[0].values.flags.c_contiguous @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): @@ -2493,9 +2493,9 @@ def get_base(obj): def check_views(c_only: bool = False): # Check that the underlying data behind df["c"] is still `c` # after setting with iloc. Since we don't know which entry in - # df._mgr.arrays corresponds to df["c"], we just check that exactly + # df._mgr.blocks corresponds to df["c"], we just check that exactly # one of these arrays is `c`. GH#38939 - assert sum(x is c for x in df._mgr.arrays) == 1 + assert sum(x.values is c for x in df._mgr.blocks) == 1 if c_only: # If we ever stop consolidating in setitem_with_indexer, # this will become unnecessary. @@ -2503,17 +2503,17 @@ def check_views(c_only: bool = False): assert ( sum( - get_base(x) is a - for x in df._mgr.arrays - if isinstance(x.dtype, np.dtype) + get_base(x.values) is a + for x in df._mgr.blocks + if isinstance(x.values.dtype, np.dtype) ) == 1 ) assert ( sum( - get_base(x) is b - for x in df._mgr.arrays - if isinstance(x.dtype, np.dtype) + get_base(x.values) is b + for x in df._mgr.blocks + if isinstance(x.values.dtype, np.dtype) ) == 1 ) @@ -3045,7 +3045,7 @@ def test_construction_from_ndarray_datetimelike(self): # constructed from 2D ndarray arr = np.arange(0, 12, dtype="datetime64[ns]").reshape(4, 3) df = DataFrame(arr) - assert all(isinstance(arr, DatetimeArray) for arr in df._mgr.arrays) + assert all(isinstance(block.values, DatetimeArray) for block in df._mgr.blocks) def test_construction_from_ndarray_with_eadtype_mismatched_columns(self): arr = np.random.default_rng(2).standard_normal((10, 2)) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index aafd06e8f88cf..bf9e82480785c 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -285,7 +285,7 @@ def test_read_only_buffer_source_agg(agg): "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], } ) - df._mgr.arrays[0].flags.writeable = False + df._mgr.blocks[0].values.flags.writeable = False result = df.groupby(["species"]).agg({"sepal_length": agg}) expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 172aa9878caec..8b90a6c32849d 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -114,7 +114,7 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, index_or_series_or_array if frame_or_series is Series: values = obj.values else: - values = obj._mgr.arrays[0] + values = obj._mgr.blocks[0].values if frame_or_series is Series: obj.iloc[:2] = index_or_series_or_array(arr[2:]) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index f86cc0c69d363..550b424371a95 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -5,6 +5,7 @@ from collections.abc import Iterator from datetime import datetime from decimal import Decimal +import itertools import numpy as np import pytest @@ 
-51,35 +52,39 @@ def test_concat_copy(self): # These are actual copies. result = concat([df, df2, df3], axis=1) - for arr in result._mgr.arrays: - assert arr.base is not None + for block in result._mgr.blocks: + assert block.values.base is not None # These are the same. result = concat([df, df2, df3], axis=1) - for arr in result._mgr.arrays: + for block in result._mgr.blocks: + arr = block.values if arr.dtype.kind == "f": - assert arr.base is df._mgr.arrays[0].base + assert arr.base is df._mgr.blocks[0].values.base elif arr.dtype.kind in ["i", "u"]: - assert arr.base is df2._mgr.arrays[0].base + assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: assert arr.base is not None # Float block was consolidated. df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) result = concat([df, df2, df3, df4], axis=1) - for arr in result._mgr.arrays: + for blocks in result._mgr.blocks: + arr = blocks.values if arr.dtype.kind == "f": # this is a view on some array in either df or df4 assert any( - np.shares_memory(arr, other) - for other in df._mgr.arrays + df4._mgr.arrays + np.shares_memory(arr, block.values) + for block in itertools.chain(df._mgr.blocks, df4._mgr.blocks) ) elif arr.dtype.kind in ["i", "u"]: - assert arr.base is df2._mgr.arrays[0].base + assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: # this is a view on df3 - assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays) + assert any( + np.shares_memory(arr, block.values) for block in df3._mgr.blocks + ) def test_concat_with_group_keys(self): # axis=0 diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5c5c06dea0008..0a5989e3c82e6 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1451,8 +1451,8 @@ def test_merge_readonly(self): ) # make each underlying block array / column array read-only - for arr in data1._mgr.arrays: - arr.flags.writeable = False + for block in data1._mgr.blocks: + block.values.flags.writeable = False data1.merge(data2) # no error diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index b94e6b6f0c6c8..69fba8925784e 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -461,9 +461,9 @@ def test_dt64tz_setitem_does_not_mutate_dti(self): ser = Series(dti) assert ser._values is not dti assert ser._values._ndarray.base is dti._data._ndarray.base - assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base + assert ser._mgr.blocks[0].values._ndarray.base is dti._data._ndarray.base - assert ser._mgr.arrays[0] is not dti + assert ser._mgr.blocks[0].values is not dti ser[::3] = NaT assert ser[0] is NaT From ff550e69292181341dc4c330f8bf5e04ee0c69cc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 3 Jun 2024 13:48:13 -0700 Subject: [PATCH 49/66] [pre-commit.ci] pre-commit autoupdate (#58903) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.3 → v0.4.7](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.3...v0.4.7) - [github.com/codespell-project/codespell: v2.2.6 → v2.3.0](https://github.com/codespell-project/codespell/compare/v2.2.6...v2.3.0) - [github.com/pre-commit/mirrors-clang-format: v18.1.4 → 
v18.1.5](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.4...v18.1.5) * Add codespell ignores --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .pre-commit-config.yaml | 6 +++--- pandas/tests/internals/test_internals.py | 2 +- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a5de902866611..bf88500b10524 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.3 + rev: v0.4.7 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -40,7 +40,7 @@ repos: pass_filenames: true require_serial: false - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell types_or: [python, rst, markdown, cython, c] @@ -92,7 +92,7 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.4 + rev: v18.1.5 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 749e2c4a86b55..60ca47b52b373 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1397,7 +1397,7 @@ def test_make_block_no_pandas_array(block_maker): assert result.dtype.kind in ["i", "u"] assert result.is_extension is False - # new_block no longer taked dtype keyword + # new_block no longer accepts dtype keyword # ndarray, NumpyEADtype result = block_maker( arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim diff --git a/pyproject.toml b/pyproject.toml index 085c054f8241a..e7d7474134c3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -722,5 +722,5 @@ exclude_lines = [ directory = "coverage_html_report" [tool.codespell] -ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext" +ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru" ignore-regex = 'https://([\w/\.])+' From 76c7274985215c487248fa5640e12a9b32a06e8c Mon Sep 17 00:00:00 2001 From: pedrocariellof <105252210+pedrocariellof@users.noreply.github.com> Date: Mon, 3 Jun 2024 18:27:15 -0300 Subject: [PATCH 50/66] DEPR: DataFrameGroupBy.corrwith (#58732) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/conftest.py | 1 + pandas/core/groupby/generic.py | 9 ++++++ pandas/tests/groupby/test_all_methods.py | 27 +++++++++--------- pandas/tests/groupby/test_apply.py | 9 +++++- pandas/tests/groupby/test_categorical.py | 28 ++++++++++++++++--- pandas/tests/groupby/test_groupby_dropna.py | 18 ++++++++++-- pandas/tests/groupby/test_numeric_only.py | 18 ++++++++++-- pandas/tests/groupby/test_raises.py | 8 ++++++ .../tests/groupby/transform/test_transform.py | 17 +++++++++-- 10 files changed, 111 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2707adb06a1d6..f54f859bd43ff 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -263,6 +263,7 @@ starting with 3.0, so it can be safely removed from your code. 
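For orientation, a minimal sketch of how this deprecation surfaces to user code (the frame, column, and key names are invented for the example; migrating to a group-wise ``apply`` is one option, not the only one):

    import pandas as pd

    df = pd.DataFrame(
        {"key": ["a", "a", "b", "b"], "x": [1.0, 2.0, 4.0, 3.0], "y": [2.0, 1.0, 3.0, 4.0]}
    )
    other = pd.Series([1.0, 2.0, 3.0, 4.0])

    # After this patch: emits FutureWarning("DataFrameGroupBy.corrwith is deprecated")
    df.groupby("key").corrwith(other)

    # One unaffected spelling of the same per-group computation
    result = df.groupby("key")[["x", "y"]].apply(lambda g: g.corrwith(other))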
Other Deprecations ^^^^^^^^^^^^^^^^^^ +- Deprecated :meth:`.DataFrameGroupBy.corrwith` (:issue:`57158`) - Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`) - Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`) diff --git a/pandas/conftest.py b/pandas/conftest.py index 0ab51139528ad..163c3890a7f6d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -150,6 +150,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("is_categorical_dtype", "is_categorical_dtype is deprecated"), ("is_sparse", "is_sparse is deprecated"), ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"), + ("DataFrameGroupBy.corrwith", "DataFrameGroupBy.corrwith is deprecated"), ("NDFrame.replace", "Series.replace without 'value'"), ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"), ("Series.idxmin", "The behavior of Series.idxmin"), diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0c4f22f736d4a..945b9f9c14c0b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,6 +21,7 @@ Union, cast, ) +import warnings import numpy as np @@ -32,6 +33,7 @@ Substitution, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_int64, @@ -2726,6 +2728,8 @@ def corrwith( """ Compute pairwise correlation. + .. deprecated:: 3.0.0 + Pairwise correlation is computed between rows or columns of DataFrame with rows or columns of Series or DataFrame.
DataFrames are first aligned along both axes before computing the @@ -2785,6 +2789,11 @@ def corrwith( 2 0.755929 NaN 3 0.576557 NaN """ + warnings.warn( + "DataFrameGroupBy.corrwith is deprecated", + FutureWarning, + stacklevel=find_stack_level(), + ) result = self._op_via_apply( "corrwith", other=other, diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index ad35bec70f668..945c3e421a132 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -25,9 +25,12 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): gb = df.groupby(["a", "b", "c"], group_keys=False) method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) - - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = method(*args).index expected = df.index @@ -42,18 +45,12 @@ def test_duplicate_columns(request, groupby_func, as_index): df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) args = get_groupby_method_args(groupby_func, df) gb = df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = getattr(gb, groupby_func)(*args) + result = getattr(gb, groupby_func)(*args) expected_df = df.set_axis(["a", "b", "c"], axis=1) expected_args = get_groupby_method_args(groupby_func, expected_df) expected_gb = expected_df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - expected = getattr(expected_gb, groupby_func)(*expected_args) + expected = getattr(expected_gb, groupby_func)(*expected_args) if groupby_func not in ("size", "ngroup", "cumcount"): expected = expected.rename(columns={"c": "b"}) tm.assert_equal(result, expected) @@ -74,8 +71,12 @@ def test_dup_labels_output_shape(groupby_func, idx): grp_by = df.groupby([0]) args = get_groupby_method_args(groupby_func, df) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = getattr(grp_by, groupby_func)(*args) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ac853746cf008..75801b9e039f6 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1197,7 +1197,14 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when another method is called before .apply() grp = df.groupby(by="a") args = get_groupby_method_args(reduction_func, df) - _ = getattr(grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + _ = getattr(grp, reduction_func)(*args) result = grp.apply(np.sum, axis=0, include_groups=False) 
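# For illustration only, not part of this diff: downstream code that cannot
# migrate off corrwith right away can silence the new warning locally, using
# the same catch_warnings/filterwarnings pattern a later patch in this series
# applies for make_block in the IO readers. The frame is invented for the example.
import warnings

import pandas as pd

demo = pd.DataFrame({"key": ["a", "a", "b"], "x": [1.0, 2.0, 3.0]})
with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore", "DataFrameGroupBy.corrwith is deprecated", FutureWarning
    )
    demo.groupby("key").corrwith(pd.Series([1.0, 2.0, 3.0]))  # no warning surfaces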
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 2194e5692aa0e..010bd9ee52555 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1473,7 +1473,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun df_grp = df.groupby(["cat_1", "cat_2"], observed=True) args = get_groupby_method_args(reduction_func, df) - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) for cat in unobserved_cats: assert cat not in res.index @@ -1512,7 +1519,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( getattr(df_grp, reduction_func)(*args) return - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1904,8 +1918,14 @@ def test_category_order_reducer( ): getattr(gb, reduction_func)(*args) return - - op_result = getattr(gb, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories else: diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d3b3c945e06de..4749e845a0e59 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -543,7 +543,14 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki return gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) - expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].cat.remove_categories([4]) if index_kind == "multi": expected["x2"] = expected["x2"].cat.remove_categories([4]) @@ -567,7 +574,14 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki if as_index: expected = expected["size"].rename(None) - result = getattr(gb_keepna, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb_keepna, reduction_func)(*args) # size will return a Series, others are DataFrame tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 33cdd1883e1b9..afbc64429e93c 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -256,7 
+256,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): method = getattr(gb, kernel) if has_arg and numeric_only is True: # Cases where b does not appear in the result - result = method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = method(*args, **kwargs) assert "b" not in result.columns elif ( # kernels that work on any dtype and have numeric_only arg @@ -296,7 +303,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" with pytest.raises(exception, match=msg): - method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: with pytest.raises( TypeError, match="got an unexpected keyword argument 'numeric_only'" diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 9301f8d56d9d2..5a8192a9ffe02 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -183,6 +183,8 @@ def test_groupby_raises_string( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -288,6 +290,8 @@ def test_groupby_raises_datetime( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @@ -485,6 +489,8 @@ def test_groupby_raises_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -658,6 +664,8 @@ def test_groupby_raises_category_on_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index d6d545a8c4834..726c57081373c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1104,7 +1104,14 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): return args = get_groupby_method_args(reduction_func, obj) - result = g.transform(func, *args) + if func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = g.transform(func, *args) # this is the *definition* of a transformation 
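# For illustration only, not part of this diff: the property asserted just
# below, shown on a tiny standalone example. A groupby transform keeps the
# input's index, while the corresponding reduction collapses to one label
# per group.
import pandas as pd

demo = pd.Series([1, 2, 3, 4], index=list("wxyz"))
grouped = demo.groupby([0, 0, 1, 1])
assert list(grouped.transform("sum").index) == list("wxyz")  # input-aligned
assert list(grouped.sum().index) == [0, 1]  # one label per group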
tm.assert_index_equal(result.index, obj.index) @@ -1468,8 +1475,12 @@ def test_as_index_no_change(keys, df, groupby_func): args = get_groupby_method_args(groupby_func, df) gb_as_index_true = df.groupby(keys, as_index=True) gb_as_index_false = df.groupby(keys, as_index=False) - warn = FutureWarning if groupby_func == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" with tm.assert_produces_warning(warn, match=msg): result = gb_as_index_true.transform(groupby_func, *args) with tm.assert_produces_warning(warn, match=msg): From 6c321bba6771c815f7503954fe7c69921830a76e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 4 Jun 2024 09:44:18 -0700 Subject: [PATCH 51/66] DEPR: make_block (#57754) --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/internals/api.py | 10 ++++++++ pandas/io/feather_format.py | 14 ++++++++--- pandas/io/parquet.py | 23 ++++++++++++++--- pandas/io/parsers/arrow_parser_wrapper.py | 30 ++++++++++++++--------- pandas/tests/internals/test_api.py | 5 +++- pandas/tests/internals/test_internals.py | 20 ++++++++++----- pandas/tests/io/test_parquet.py | 1 + 8 files changed, 79 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f54f859bd43ff..2d4fafc3c4e1e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -263,6 +263,7 @@ starting with 3.0, so it can be safely removed from your code. Other Deprecations ^^^^^^^^^^^^^^^^^^ +- Deprecated :func:`core.internals.api.make_block`, use public APIs instead (:issue:`56815`) - Deprecated :meth:`.DataFrameGroupBy.corrwith` (:issue:`57158`) - Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`) - Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`) @@ -272,7 +273,6 @@ Other Deprecations - Deprecated behavior of :meth:`Series.dt.to_pytimedelta`, in a future version this will return a :class:`Series` containing python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedelta; this matches the behavior of other :meth:`Series.dt` properties. (:issue:`57463`) - Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`) - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`) -- .. --------------------------------------------------------------------------- .. _whatsnew_300.prior_deprecations: diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 24bfad4791b29..04944db2ebd9c 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -10,6 +10,7 @@ from __future__ import annotations from typing import TYPE_CHECKING +import warnings import numpy as np @@ -87,6 +88,15 @@ def make_block( - Block.make_block_same_class - Block.__init__ """ + warnings.warn( + # GH#56815 + "make_block is deprecated and will be removed in a future version. 
" + "Use pd.api.internals.create_dataframe_from_blocks or " + "(recommended) higher-level public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + if dtype is not None: dtype = pandas_dtype(dtype) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 8132167fbe05c..16d4e1f9ea25d 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Any, ) +import warnings from pandas._config import using_pyarrow_string_dtype @@ -131,9 +132,16 @@ def read_feather( path, "rb", storage_options=storage_options, is_text=False ) as handles: if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): - return feather.read_feather( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + + return feather.read_feather( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 08983ceed44e5..306b144811898 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -10,7 +10,10 @@ Any, Literal, ) -from warnings import catch_warnings +from warnings import ( + catch_warnings, + filterwarnings, +) from pandas._config import using_pyarrow_string_dtype @@ -271,7 +274,13 @@ def read( filters=filters, **kwargs, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + result = pa_table.to_pandas(**to_pandas_kwargs) if pa_table.schema.metadata: if b"PANDAS_ATTRS" in pa_table.schema.metadata: @@ -384,7 +393,15 @@ def read( try: parquet_file = self.api.ParquetFile(path, **parquet_kwargs) - return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + return parquet_file.to_pandas( + columns=columns, filters=filters, **kwargs + ) finally: if handles is not None: handles.close() diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 8b6f7d5750ffe..cffdb28e2c9e4 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -287,17 +287,23 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_pyarrow_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + if dtype_backend == "pyarrow": + frame = table.to_pandas(types_mapper=pd.ArrowDtype) + elif dtype_backend == "numpy_nullable": + # Modify the default mapping to also + # map null to Int64 (to match other engines) + dtype_mapping = _arrow_dtype_mapping() + dtype_mapping[pa.null()] = pd.Int64Dtype() + frame = table.to_pandas(types_mapper=dtype_mapping.get) + elif using_pyarrow_string_dtype(): + frame = 
table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - frame = table.to_pandas() + else: + frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index c189d5248b1f3..591157bbe87fe 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -44,7 +44,10 @@ def test_namespace(): def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") - blk = api.make_block(dti, placement=[0]) + + msg = "make_block is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + blk = api.make_block(dti, placement=[0]) assert blk.shape == (1, 3) assert blk.values.shape == (1, 3) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 60ca47b52b373..fca1ed39c0f9c 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1368,8 +1368,10 @@ def test_validate_ndim(): placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" + depr_msg = "make_block is deprecated" with pytest.raises(ValueError, match=msg): - make_block(values, placement, ndim=2) + with tm.assert_produces_warning(DeprecationWarning, match=depr_msg): + make_block(values, placement, ndim=2) def test_block_shape(): @@ -1384,8 +1386,12 @@ def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.NumpyExtensionArray(np.array([1, 2])) + depr_msg = "make_block is deprecated" + warn = DeprecationWarning if block_maker is make_block else None + # NumpyExtensionArray, no dtype - result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] if block_maker is make_block: @@ -1393,14 +1399,16 @@ def test_make_block_no_pandas_array(block_maker): assert result.is_extension is False # NumpyExtensionArray, NumpyEADtype - result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # new_block no longer accepts dtype keyword # ndarray, NumpyEADtype - result = block_maker( - arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim - ) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 35275f3c23bef..af492b967bc1d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -985,6 +985,7 @@ def test_filter_row_groups(self, pa): result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 + @pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning") def test_read_dtype_backend_pyarrow_config(self, pa, df_full): import pyarrow From cc18753374077fcdc1562e1e1da7e620392b42c5 Mon Sep 17 00:00:00 2001 From: Robin Shindelman <129821483+shindelr@users.noreply.github.com> Date: Tue, 4 Jun 2024 10:49:40 -0700 Subject: [PATCH 52/66] Proofreading/editing 
Getting Started for readability. Attempt 2, this time without extra files. (#58919) Proofreading/editing Getting Started for readability. Attempt 2 --- doc/source/getting_started/index.rst | 38 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/source/getting_started/index.rst index d9cb1de14aded..9f29f7f4f4406 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -134,8 +134,8 @@ to explore, clean, and process your data. In pandas, a data table is called a :c
-pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). Importing data from each of these -data sources is provided by function with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data. +pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). The ability to import data from each of these +data sources is provided by functions with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data. .. image:: ../_static/schemas/02_io_readwrite.svg :align: center @@ -181,7 +181,7 @@ data sources is provided by function with the prefix ``read_*``. Similarly, the
-Selecting or filtering specific rows and/or columns? Filtering the data on a condition? Methods for slicing, selecting, and extracting the +Selecting or filtering specific rows and/or columns? Filtering the data on a particular condition? Methods for slicing, selecting, and extracting the data you need are available in pandas. .. image:: ../_static/schemas/03_subset_columns_rows.svg @@ -228,7 +228,7 @@ data you need are available in pandas.
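A small, self-contained illustration of the selection and filtering described above (the column names are invented for the example):

    import pandas as pd

    df = pd.DataFrame({"name": ["Ann", "Bob", "Cleo"], "age": [22, 35, 58]})
    ages = df["age"]  # select a single column as a Series
    adults = df[df["age"] > 30]  # filter rows on a condition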
-pandas provides plotting your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,...) +pandas provides plotting for your data right out of the box with the power of Matplotlib. Simply pick the plot type (scatter, bar, boxplot,...) corresponding to your data. .. image:: ../_static/schemas/04_plot_overview.svg @@ -275,7 +275,7 @@ corresponding to your data.
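A small sketch of the plotting entry points described above (assumes Matplotlib is installed; the data is invented for the example):

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [3, 1, 4, 2]})
    ax = df.plot()  # defaults to a line plot, one line per column
    ax = df.plot.scatter(x="x", y="y")  # or pick a specific plot type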
-There is no need to loop over all rows of your data table to do calculations. Data manipulations on a column work elementwise. +There's no need to loop over all rows of your data table to do calculations. Column data manipulations work elementwise in pandas. Adding a column to a :class:`DataFrame` based on existing data in other columns is straightforward. .. image:: ../_static/schemas/05_newcolumn_2.svg @@ -322,7 +322,7 @@ Adding a column to a :class:`DataFrame` based on existing data in other columns
-Basic statistics (mean, median, min, max, counts...) are easily calculable. These or custom aggregations can be applied on the entire +Basic statistics (mean, median, min, max, counts...) are easily calculable across data frames. These, or even custom aggregations, can be applied on the entire data set, a sliding window of the data, or grouped by categories. The latter is also known as the split-apply-combine approach. .. image:: ../_static/schemas/06_groupby.svg @@ -369,8 +369,8 @@ data set, a sliding window of the data, or grouped by categories. The latter is
-Change the structure of your data table in multiple ways. You can :func:`~pandas.melt` your data table from wide to long/tidy form or :func:`~pandas.pivot` -from long to wide format. With aggregations built-in, a pivot table is created with a single command. +Change the structure of your data table in a variety of ways. You can use :func:`~pandas.melt` to reshape your data from a wide format to a long and tidy one. Use :func:`~pandas.pivot` +to go from long to wide format. With aggregations built-in, a pivot table can be created with a single command. .. image:: ../_static/schemas/07_melt.svg :align: center @@ -416,7 +416,7 @@ from long to wide format. With aggregations built-in, a pivot table is created w
-Multiple tables can be concatenated both column wise and row wise as database-like join/merge operations are provided to combine multiple tables of data. +Multiple tables can be concatenated column wise or row wise with pandas' database-like join and merge operations. .. image:: ../_static/schemas/08_concat_row.svg :align: center @@ -505,7 +505,7 @@ pandas has great support for time series and has an extensive set of tools for w
-Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it. +Data sets often contain more than just numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it. .. raw:: html @@ -551,9 +551,9 @@ the pandas-equivalent operations compared to software you already know: :class-card: comparison-card :shadow: md - The `R programming language `__ provides the - ``data.frame`` data structure and multiple packages, such as - `tidyverse `__ use and extend ``data.frame`` + The `R programming language `__ provides a + ``data.frame`` data structure as well as packages like + `tidyverse `__ which use and extend ``data.frame`` for convenient data handling functionalities similar to pandas. +++ @@ -572,8 +572,8 @@ the pandas-equivalent operations compared to software you already know: :class-card: comparison-card :shadow: md - Already familiar to ``SELECT``, ``GROUP BY``, ``JOIN``, etc.? - Most of these SQL manipulations do have equivalents in pandas. + Already familiar with ``SELECT``, ``GROUP BY``, ``JOIN``, etc.? + Many SQL manipulations have equivalents in pandas. +++ @@ -631,10 +631,10 @@ the pandas-equivalent operations compared to software you already know: :class-card: comparison-card :shadow: md - The `SAS `__ statistical software suite - also provides the ``data set`` corresponding to the pandas ``DataFrame``. - Also SAS vectorized operations, filtering, string processing operations, - and more have similar functions in pandas. + `SAS `__, the statistical software suite, + uses the ``data set`` structure, which closely corresponds to pandas' ``DataFrame``. + Also, SAS vectorized operations such as filtering or string processing + have similar functions in pandas. +++ From dcb542bfb0befec5d385604cb0175fb7ac45572e Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 4 Jun 2024 23:34:37 +0530 Subject: [PATCH 53/66] DOC: fix RT03,SA01 for pandas.bdate_range (#58917) --- ci/code_checks.sh | 1 - pandas/core/indexes/datetimes.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1e615b6df8446..d29761b938eaa 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -343,7 +343,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.SparseArray PR07,SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ - -i "pandas.bdate_range RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 930bc7a95bd14..c276750314a34 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1070,6 +1070,13 @@ def bdate_range( Returns ------- DatetimeIndex
Notes ----- From f55bec1db9620dd3e2136c2b45b05d7391a94015 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 4 Jun 2024 23:35:38 +0530 Subject: [PATCH 54/66] DOC: fix SA01 for pandas.describe_option (#58914) --- ci/code_checks.sh | 1 - pandas/_config/config.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d29761b938eaa..8145cb62070ca 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -403,7 +403,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.window.rolling.Window.std PR01" \ -i "pandas.core.window.rolling.Window.var PR01" \ -i "pandas.date_range RT03" \ - -i "pandas.describe_option SA01" \ -i "pandas.errors.AbstractMethodError PR01,SA01" \ -i "pandas.errors.AttributeConflictWarning SA01" \ -i "pandas.errors.CSSWarning SA01" \ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 55d9e29686259..95c549a8ff0e8 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -279,6 +279,12 @@ def describe_option(pat: str = "", _print_desc: bool = True) -> str | None: str If the description(s) as a string if ``_print_desc=False``. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. + Notes ----- For all available options, please view the From da5408750750ad30f270ae1295f1746efa5d9c80 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 4 Jun 2024 23:36:56 +0530 Subject: [PATCH 55/66] DOC: fix PR07 for pandas.pivot_table (#58896) * DOC: fix PR07 for pandas.pivot_table * DOC: remove redundant comments Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/core/reshape/pivot.py | 169 ++++++++++++++++++++++++++++++++++- 2 files changed, 165 insertions(+), 5 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8145cb62070ca..039700f306e03 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -470,7 +470,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.merge_ordered PR07" \ -i "pandas.period_range RT03,SA01" \ -i "pandas.pivot PR07" \ - -i "pandas.pivot_table PR07" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e0126d439a79c..86da19f13bacf 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -54,10 +54,6 @@ from pandas import DataFrame -# Note: We need to make sure `frame` is imported before `pivot`, otherwise -# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot_table"], indents=1) def pivot_table( data: DataFrame, values=None, @@ -71,6 +67,171 @@ def pivot_table( observed: bool = True, sort: bool = True, ) -> DataFrame: + """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects + (hierarchical indexes) on the index and columns of the result DataFrame. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + values : list-like or scalar, optional + Column or columns to aggregate. + index : column, Grouper, array, or list of the previous + Keys to group by on the pivot table index. 
If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + columns : column, Grouper, array, or list of the previous + Keys to group by on the pivot table column. If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + aggfunc : function, list of functions, dict, default "mean" + If a list of functions is passed, the resulting pivot table will have + hierarchical columns whose top level are the function names + (inferred from the function objects themselves). + If a dict is passed, the key is the column to aggregate and the value is + a function or a list of functions. If ``margins=True``, aggfunc will be + used to calculate the partial aggregates. + fill_value : scalar, default None + Value to replace missing values with (in the resulting pivot table, + after aggregation). + margins : bool, default False + If ``margins=True``, special ``All`` columns and rows + will be added with partial group aggregates across the categories + on the rows and columns. + dropna : bool, default True + Do not include columns whose entries are all NaN. If True, + rows with a NaN value in any column will be omitted before + computing margins. + margins_name : str, default 'All' + Name of the row / column that will contain the totals + when margins is True. + observed : bool, default True + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionchanged:: 3.0.0 + + The default value is now ``True``. + + sort : bool, default True + Specifies if the result should be sorted. + + .. versionadded:: 1.3.0 + + Returns + ------- + DataFrame + An Excel style pivot table. + + See Also + -------- + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.melt : Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + ... "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + ... "C": [ + ... "small", + ... "large", + ... "large", + ... "small", + ... "small", + ... "large", + ... "small", + ... "small", + ... "large", + ... ], + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + ... } + ... ) + >>> df + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 + + This first example aggregates values by taking the sum. + + >>> table = pd.pivot_table( + ... df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum" + ... ) + >>> table + C large small + A B + bar one 4.0 5.0 + two 7.0 6.0 + foo one 4.0 1.0 + two NaN 6.0 + + We can also fill missing values using the `fill_value` parameter. + + >>> table = pd.pivot_table( + ... df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum", fill_value=0 + ...
) + >>> table + C large small + A B + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pd.pivot_table( + ... df, values=["D", "E"], index=["A", "C"], aggfunc={"D": "mean", "E": "mean"} + ... ) + >>> table + D E + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. + + >>> table = pd.pivot_table( + ... df, + ... values=["D", "E"], + ... index=["A", "C"], + ... aggfunc={"D": "mean", "E": ["min", "max", "mean"]}, + ... ) + >>> table + D E + mean max mean min + A C + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 + """ index = _convert_by(index) columns = _convert_by(columns) From f97c3c4022fcdc884d550420ce7442a52ed00a89 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:08:08 -1000 Subject: [PATCH 56/66] CI: Update CircleCI configs (#58908) * CI: Update CircleCI configs * Fix workflow * add mamba path * Use conda env create * Remove workaround? --- .circleci/config.yml | 72 ++++++++++++------- .circleci/setup_env.sh | 60 ---------------- ...e-310-arm64.yaml => circle-311-arm64.yaml} | 2 +- 3 files changed, 46 insertions(+), 88 deletions(-) delete mode 100755 .circleci/setup_env.sh rename ci/deps/{circle-310-arm64.yaml => circle-311-arm64.yaml} (98%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6f134c9a7a7bd..463667446ed42 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,46 +1,64 @@ version: 2.1 jobs: - test-arm: + test-linux-arm: machine: image: default resource_class: arm.large environment: - ENV_FILE: ci/deps/circle-310-arm64.yaml + ENV_FILE: ci/deps/circle-311-arm64.yaml PYTEST_WORKERS: auto PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" PYTEST_TARGET: "pandas" PANDAS_CI: "1" steps: - checkout - - run: .circleci/setup_env.sh - - run: > - PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD - ci/run_tests.sh - linux-musl: + - run: + name: Install Environment and Run Tests + shell: /bin/bash -exuo pipefail + command: | + MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-aarch64.sh" + wget -q $MAMBA_URL -O minimamba.sh + chmod +x minimamba.sh + MAMBA_DIR="$HOME/miniconda3" + rm -rf $MAMBA_DIR + ./minimamba.sh -b -p $MAMBA_DIR + export PATH=$MAMBA_DIR/bin:$PATH + conda info -a + conda env create -q -n pandas-dev -f $ENV_FILE + conda list -n pandas-dev + source activate pandas-dev + if pip show pandas 1>/dev/null; then + pip uninstall -y pandas + fi + python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror" + PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH + ci/run_tests.sh + test-linux-musl: docker: - image: quay.io/pypa/musllinux_1_1_aarch64 resource_class: arm.large steps: # Install pkgs first to have git in the image # (needed for checkout) - - run: | - apk update - apk add git - apk add musl-locales + - run: + name: Install System Packages + command: | + apk update + apk add git + apk add musl-locales - checkout - - run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev - . 
~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 - python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" - python -m pip list --no-cache-dir - - run: | - . ~/virtualenvs/pandas-dev/bin/activate - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + - run: + name: Install Environment and Run Tests + command: | + /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" + python -m pip list --no-cache-dir + export PANDAS_CI=1 + python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml build-aarch64: parameters: cibw-build: @@ -71,7 +89,7 @@ jobs: name: Build aarch64 wheels no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | - pip3 install cibuildwheel==2.15.0 + pip3 install cibuildwheel==2.18.1 cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: @@ -81,7 +99,7 @@ jobs: name: Install Anaconda Client & Upload Wheels command: | echo "Install Mambaforge" - MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" + MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-aarch64.sh" echo "Downloading $MAMBA_URL" wget -q $MAMBA_URL -O minimamba.sh chmod +x minimamba.sh @@ -107,14 +125,14 @@ workflows: not: equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] jobs: - - test-arm + - test-linux-arm test-musl: # Don't run trigger this one when scheduled pipeline runs when: not: equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] jobs: - - linux-musl + - test-linux-musl build-wheels: jobs: - build-aarch64: diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh deleted file mode 100755 index eef4db1191a9a..0000000000000 --- a/.circleci/setup_env.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -e - -echo "Install Mambaforge" -MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" -echo "Downloading $MAMBA_URL" -wget -q $MAMBA_URL -O minimamba.sh -chmod +x minimamba.sh - -MAMBA_DIR="$HOME/miniconda3" -rm -rf $MAMBA_DIR -./minimamba.sh -b -p $MAMBA_DIR - -export PATH=$MAMBA_DIR/bin:$PATH - -echo -echo "which conda" -which conda - -echo -echo "update conda" -conda config --set ssl_verify false -conda config --set quiet true --set always_yes true --set changeps1 false -mamba install -y -c conda-forge -n base pip setuptools - -echo "conda info -a" -conda info -a - -echo "conda list (root environment)" -conda list - -echo -# Clean up any left-over from a previous build -mamba env remove -n pandas-dev -echo "mamba env update --file=${ENV_FILE}" -# See https://github.com/mamba-org/mamba/issues/633 -mamba create -q 
-n pandas-dev -time mamba env update -n pandas-dev --file="${ENV_FILE}" - -echo "conda list -n pandas-dev" -conda list -n pandas-dev - -echo "activate pandas-dev" -source activate pandas-dev - -# Explicitly set an environment variable indicating that this is pandas' CI environment. -# -# This allows us to enable things like -Werror that shouldn't be activated in -# downstream CI jobs that may also build pandas from source. -export PANDAS_CI=1 - -if pip show pandas 1>/dev/null; then - echo - echo "remove any installed pandas package w/o removing anything else" - pip uninstall -y pandas -fi - -echo "Install pandas" -python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror" - -echo "done" diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-311-arm64.yaml similarity index 98% rename from ci/deps/circle-310-arm64.yaml rename to ci/deps/circle-311-arm64.yaml index ed4d139714e71..1c31d353699f8 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.10 + - python=3.11 # build dependencies - versioneer[toml] From ea509bbe5c269e925ece5712986077318df85f0e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:08:36 -1000 Subject: [PATCH 57/66] CLN: Plotting tests (#58905) * Clean some plotting tests * Cleann test_misc * Format test_datetimelike * clean more datetimelike --- pandas/tests/plotting/test_datetimelike.py | 195 +++++++++------------ pandas/tests/plotting/test_misc.py | 132 +++++--------- pandas/tests/plotting/test_series.py | 94 +++------- pandas/tests/plotting/test_style.py | 18 +- 4 files changed, 154 insertions(+), 285 deletions(-) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 4b4eeada58366..a9135ee583d91 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -46,6 +46,8 @@ mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") +import pandas.plotting._matplotlib.converter as conv + class TestTSPlot: @pytest.mark.filterwarnings("ignore::UserWarning") @@ -73,7 +75,7 @@ def test_fontsize_set_correctly(self): def test_frame_inferred(self): # inferred freq - idx = date_range("1/1/1987", freq="MS", periods=100) + idx = date_range("1/1/1987", freq="MS", periods=10) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame( @@ -82,7 +84,7 @@ def test_frame_inferred(self): _check_plot_works(df.plot) # axes freq - idx = idx[0:40].union(idx[45:99]) + idx = idx[0:4].union(idx[6:]) df2 = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx ) @@ -111,7 +113,6 @@ def test_nonnumeric_exclude(self): fig, ax = mpl.pyplot.subplots() df.plot(ax=ax) # it works assert len(ax.get_lines()) == 1 # B was plotted - mpl.pyplot.close(fig) def test_nonnumeric_exclude_error(self): idx = date_range("1/1/1987", freq="YE", periods=3) @@ -122,7 +123,7 @@ def test_nonnumeric_exclude_error(self): @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_tsplot_period(self, freq): - idx = period_range("12/31/1999", freq=freq, periods=100) + idx = period_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, ax=ax) @@ -131,7 +132,7 @@ def test_tsplot_period(self, freq): "freq", ["s", "min", "h", "D", "W", 
"ME", "QE-DEC", "YE", "1B30Min"] ) def test_tsplot_datetime(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, ax=ax) @@ -145,10 +146,9 @@ def test_tsplot(self): color = (0.0, 0.0, 0.0, 1) assert color == ax.get_lines()[0].get_color() - def test_both_style_and_color(self): - ts = Series( - np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) - ) + @pytest.mark.parametrize("index", [None, date_range("2020-01-01", periods=10)]) + def test_both_style_and_color(self, index): + ts = Series(np.arange(10, dtype=np.float64), index=index) msg = ( "Cannot pass 'style' string with a color symbol and 'color' " "keyword argument. Please use one or the other or pass 'style' " @@ -157,46 +157,37 @@ def test_both_style_and_color(self): with pytest.raises(ValueError, match=msg): ts.plot(style="b-", color="#000099") - s = ts.reset_index(drop=True) - with pytest.raises(ValueError, match=msg): - s.plot(style="b-", color="#000099") - @pytest.mark.parametrize("freq", ["ms", "us"]) def test_high_freq(self, freq): _, ax = mpl.pyplot.subplots() - rng = date_range("1/1/2012", periods=100, freq=freq) + rng = date_range("1/1/2012", periods=10, freq=freq) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _check_plot_works(ser.plot, ax=ax) def test_get_datevalue(self): - from pandas.plotting._matplotlib.converter import get_datevalue - - assert get_datevalue(None, "D") is None - assert get_datevalue(1987, "Y") == 1987 - assert get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal - assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - - def test_ts_plot_format_coord(self): - def check_format_of_first_point(ax, expected_string): - first_line = ax.get_lines()[0] - first_x = first_line.get_xdata()[0].ordinal - first_y = first_line.get_ydata()[0] - assert expected_string == ax.format_coord(first_x, first_y) + assert conv.get_datevalue(None, "D") is None + assert conv.get_datevalue(1987, "Y") == 1987 + assert ( + conv.get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal + ) + assert conv.get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - annual = Series(1, index=date_range("2014-01-01", periods=3, freq="YE-DEC")) + @pytest.mark.parametrize( + "freq, expected_string", + [["YE-DEC", "t = 2014 y = 1.000000"], ["D", "t = 2014-01-01 y = 1.000000"]], + ) + def test_ts_plot_format_coord(self, freq, expected_string): + ser = Series(1, index=date_range("2014-01-01", periods=3, freq=freq)) _, ax = mpl.pyplot.subplots() - annual.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014 y = 1.000000") - - # note this is added to the annual plot already in existence, and - # changes its freq field - daily = Series(1, index=date_range("2014-01-01", periods=3, freq="D")) - daily.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") + ser.plot(ax=ax) + first_line = ax.get_lines()[0] + first_x = first_line.get_xdata()[0].ordinal + first_y = first_line.get_ydata()[0] + assert expected_string == ax.format_coord(first_x, first_y) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_line_plot_period_series(self, freq): - idx = period_range("12/31/1999", freq=freq, periods=100) + idx = period_range("12/31/1999", freq=freq, periods=10) ser = 
Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @@ -206,7 +197,7 @@ def test_line_plot_period_series(self, freq): def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the # frequency (`frqncy`) rule code. tests resolution of issue #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) s = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(s.plot, s.index.freq.rule_code) @@ -214,13 +205,13 @@ def test_line_plot_period_mlt_series(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "ME", "QE", "YE"]) def test_line_plot_period_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -235,7 +226,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) # of the frequency (`frqncy`) rule code. tests resolution of issue # #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -249,7 +240,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -263,7 +254,7 @@ def test_line_plot_datetime_frame(self, freq): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) ser = Series(ser.values, Index(np.asarray(ser.index))) _check_plot_works(ser.plot, ser.index.inferred_freq) @@ -350,8 +341,8 @@ def test_business_freq(self): def test_business_freq_convert(self): bts = Series( - np.arange(300, dtype=np.float64), - index=date_range("2020-01-01", periods=300, freq="B"), + np.arange(50, dtype=np.float64), + index=date_range("2020-01-01", periods=50, freq="B"), ).asfreq("BME") ts = bts.to_period("M") _, ax = mpl.pyplot.subplots() @@ -444,12 +435,8 @@ def test_axis_limits(self, obj): result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal - fig = ax.get_figure() - mpl.pyplot.close(fig) def test_get_finder(self): - import pandas.plotting._matplotlib.converter as conv - assert conv.get_finder(to_offset("B")) == conv._daily_finder assert conv.get_finder(to_offset("D")) == conv._daily_finder assert conv.get_finder(to_offset("ME")) == conv._monthly_finder @@ -552,7 +539,7 @@ def test_finder_annual(self): @pytest.mark.slow def 
test_finder_minutely(self): - nminutes = 50 * 24 * 60 + nminutes = 1 * 24 * 60 rng = date_range("1/1/1999", freq="Min", periods=nminutes) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -577,9 +564,9 @@ def test_finder_hourly(self): def test_gaps(self): ts = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - ts.iloc[5:25] = np.nan + ts.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() ts.plot(ax=ax) lines = ax.get_lines() @@ -591,8 +578,7 @@ def test_gaps(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 1].all() - mpl.pyplot.close(ax.get_figure()) + assert mask[5:7, 1].all() def test_gaps_irregular(self): # irregular @@ -613,7 +599,6 @@ def test_gaps_irregular(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask assert mask[2:5, 1].all() - mpl.pyplot.close(ax.get_figure()) def test_gaps_non_ts(self): # non-ts @@ -634,9 +619,9 @@ def test_gaps_non_ts(self): def test_gap_upsample(self): low = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - low.iloc[5:25] = np.nan + low.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() low.plot(ax=ax) @@ -653,7 +638,7 @@ def test_gap_upsample(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 1].all() + assert mask[5:7, 1].all() def test_secondary_y(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -667,7 +652,6 @@ def test_secondary_y(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_yaxis(self): Series(np.random.default_rng(2).standard_normal(10)) @@ -675,7 +659,6 @@ def test_secondary_y_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_both(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -701,7 +684,6 @@ def test_secondary_y_ts(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_ts_yaxis(self): idx = date_range("1/1/2000", periods=10) @@ -709,7 +691,6 @@ def test_secondary_y_ts_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_y_ts_visible(self): idx = date_range("1/1/2000", periods=10) @@ -1108,8 +1089,8 @@ def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2): def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # high to low @@ -1122,8 +1103,8 @@ def test_mixed_freq_second_millisecond(self): def test_mixed_freq_second_millisecond_low_to_high(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", 
freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # low to high @@ -1298,7 +1279,6 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_right(self): df = DataFrame( @@ -1315,7 +1295,6 @@ def test_secondary_legend_right(self): assert leg.get_texts()[1].get_text() == "B" assert leg.get_texts()[2].get_text() == "C" assert leg.get_texts()[3].get_text() == "D" - mpl.pyplot.close(fig) def test_secondary_legend_bar(self): df = DataFrame( @@ -1328,7 +1307,6 @@ def test_secondary_legend_bar(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A (right)" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_bar_right(self): df = DataFrame( @@ -1341,7 +1319,6 @@ def test_secondary_legend_bar_right(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_multi_col(self): df = DataFrame( @@ -1366,14 +1343,13 @@ def test_secondary_legend_multi_col(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_nonts(self): # non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1387,14 +1363,13 @@ def test_secondary_legend_nonts(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close() def test_secondary_legend_nonts_multi_col(self): # non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1448,13 +1423,10 @@ def test_mpl_nopandas(self): exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) - exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp) def test_irregular_ts_shared_ax_xlim(self): # GH 2960 - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1467,8 +1439,8 @@ def test_irregular_ts_shared_ax_xlim(self): # check that axis limits are correct left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_secondary_y_non_ts_xlim(self): # GH 3490 - non-timeseries with secondary y @@ -1504,7 +1476,7 @@ def test_secondary_y_regular_ts_xlim(self): def test_secondary_y_mixed_freq_ts_xlim(self): # GH 3490 - mixed 
frequency timeseries with secondary y - rng = date_range("2000-01-01", periods=10000, freq="min") + rng = date_range("2000-01-01", periods=10, freq="min") ts = Series(1, index=rng) _, ax = mpl.pyplot.subplots() @@ -1519,8 +1491,6 @@ def test_secondary_y_mixed_freq_ts_xlim(self): def test_secondary_y_irregular_ts_xlim(self): # GH 3490 - irregular-timeseries with secondary y - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1534,8 +1504,8 @@ def test_secondary_y_irregular_ts_xlim(self): ts_irregular[:5].plot(ax=ax) left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_plot_outofbounds_datetime(self): # 2579 - checking this does not raise @@ -1722,35 +1692,28 @@ def test_pickle_fig(self, temp_file, frame_or_series, idx): def _check_plot_works(f, freq=None, series=None, *args, **kwargs): - import matplotlib.pyplot as plt - fig = plt.gcf() - try: - plt.clf() - ax = fig.add_subplot(211) - orig_ax = kwargs.pop("ax", plt.gca()) - orig_axfreq = getattr(orig_ax, "freq", None) - - ret = f(*args, **kwargs) - assert ret is not None # do something more intelligent - - ax = kwargs.pop("ax", plt.gca()) - if series is not None: - dfreq = series.index.freq - if isinstance(dfreq, BaseOffset): - dfreq = dfreq.rule_code - if orig_axfreq is None: - assert ax.freq == dfreq - - if freq is not None: - ax_freq = to_offset(ax.freq, is_period=True) - if freq is not None and orig_axfreq is None: - assert ax_freq == freq - - ax = fig.add_subplot(212) - kwargs["ax"] = ax - ret = f(*args, **kwargs) - assert ret is not None # TODO: do something more intelligent - finally: - plt.close(fig) + fig.clf() + ax = fig.add_subplot(211) + orig_ax = kwargs.pop("ax", plt.gca()) + orig_axfreq = getattr(orig_ax, "freq", None) + + ret = f(*args, **kwargs) + assert ret is not None # do something more intelligent + + ax = kwargs.pop("ax", plt.gca()) + if series is not None: + dfreq = series.index.freq + if isinstance(dfreq, BaseOffset): + dfreq = dfreq.rule_code + if orig_axfreq is None: + assert ax.freq == dfreq + + if freq is not None and orig_axfreq is None: + assert to_offset(ax.freq, is_period=True) == freq + + ax = fig.add_subplot(212) + kwargs["ax"] = ax + ret = f(*args, **kwargs) + assert ret is not None # TODO: do something more intelligent diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index d593ddbbaa0b8..43e1255404784 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -31,6 +31,8 @@ plt = pytest.importorskip("matplotlib.pyplot") cm = pytest.importorskip("matplotlib.cm") +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def iris(datapath) -> DataFrame: @@ -109,8 +111,6 @@ def test_savefig(kind, data, index): class TestSeriesPlots: def test_autocorrelation_plot(self): - from pandas.plotting import autocorrelation_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), @@ -118,32 +118,28 @@ def test_autocorrelation_plot(self): ) # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(autocorrelation_plot, series=ser) - 
_check_plot_works(autocorrelation_plot, series=ser.values) + _check_plot_works(plotting.autocorrelation_plot, series=ser) + _check_plot_works(plotting.autocorrelation_plot, series=ser.values) - ax = autocorrelation_plot(ser, label="Test") + ax = plotting.autocorrelation_plot(ser, label="Test") _check_legend_labels(ax, labels=["Test"]) @pytest.mark.parametrize("kwargs", [{}, {"lag": 5}]) def test_lag_plot(self, kwargs): - from pandas.plotting import lag_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(lag_plot, series=ser, **kwargs) + _check_plot_works(plotting.lag_plot, series=ser, **kwargs) def test_bootstrap_plot(self): - from pandas.plotting import bootstrap_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(bootstrap_plot, series=ser, size=10) + _check_plot_works(plotting.bootstrap_plot, series=ser, size=10) class TestDataFramePlots: @@ -156,7 +152,7 @@ def test_scatter_matrix_axis(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(2).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 3))) # we are plotting multiples on a sub-plot with tm.assert_produces_warning(UserWarning, check_stacklevel=False): @@ -168,7 +164,7 @@ def test_scatter_matrix_axis(self, pass_axis): ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() # GH 5662 - expected = ["-2", "0", "2"] + expected = ["-2", "-1", "0"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @@ -181,7 +177,7 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(11).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(11).standard_normal((10, 3))) df[0] = (df[0] - 2) / 3 # we are plotting multiples on a sub-plot @@ -193,18 +189,15 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): ax=ax, ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ["-1.0", "-0.5", "0.0"] + expected = ["-1.25", "-1.0", "-0.75", "-0.5"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow def test_andrews_curves_no_warning(self, iris): - from pandas.plotting import andrews_curves - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(andrews_curves, frame=df, class_column="Name") + _check_plot_works(plotting.andrews_curves, frame=iris, class_column="Name") @pytest.mark.slow @pytest.mark.parametrize( @@ -229,12 +222,10 @@ def test_andrews_curves_no_warning(self, iris): ], ) def test_andrews_curves_linecolors(self, request, df, linecolors): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=linecolors + plotting.andrews_curves, frame=df, class_column="Name", color=linecolors ) _check_colors( ax.get_lines()[:10], linecolors=linecolors, mapping=df["Name"][:10] @@ -256,23 +247,19 @@ def test_andrews_curves_linecolors(self, request, df, linecolors): ], ) def test_andrews_curves_cmap(self, request, df): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] 
ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=cmaps + plotting.andrews_curves, frame=df, class_column="Name", color=cmaps ) _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_andrews_curves_handle(self): - from pandas.plotting import andrews_curves - colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = andrews_curves(df, "Name", color=colors) + ax = plotting.andrews_curves(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) @@ -282,61 +269,54 @@ def test_andrews_curves_handle(self): [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], ) def test_parallel_coordinates_colors(self, iris, color): - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", color=color + plotting.parallel_coordinates, frame=df, class_column="Name", color=color ) _check_colors(ax.get_lines()[:10], linecolors=color, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet + plotting.parallel_coordinates, + frame=df, + class_column="Name", + colormap=cm.jet, ) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_line_diff(self, iris): - from pandas.plotting import parallel_coordinates - df = iris - ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") + ax = _check_plot_works( + plotting.parallel_coordinates, frame=df, class_column="Name" + ) nlines = len(ax.get_lines()) nxticks = len(ax.xaxis.get_ticklabels()) ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", axvlines=False + plotting.parallel_coordinates, frame=df, class_column="Name", axvlines=False ) assert len(ax.get_lines()) == (nlines - nxticks) @pytest.mark.slow def test_parallel_coordinates_handles(self, iris): - from pandas.plotting import parallel_coordinates - df = iris colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = parallel_coordinates(df, "Name", color=colors) + ax = plotting.parallel_coordinates(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) # not sure if this is indicative of a problem @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") def test_parallel_coordinates_with_sorted_labels(self): - """For #15908""" - from pandas.plotting import parallel_coordinates - + # GH 15908 df = DataFrame( { "feat": list(range(30)), @@ -345,7 +325,7 @@ def test_parallel_coordinates_with_sorted_labels(self): + [1 for _ in range(10)], } ) - ax = parallel_coordinates(df, "class", sort_labels=True) + ax = plotting.parallel_coordinates(df, "class", sort_labels=True) polylines, labels = ax.get_legend_handles_labels() color_label_tuples = zip( [polyline.get_color() for polyline in polylines], labels @@ -359,45 +339,38 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] def 
test_radviz_no_warning(self, iris): - from pandas.plotting import radviz - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(radviz, frame=df, class_column="Name") + _check_plot_works(plotting.radviz, frame=iris, class_column="Name") @pytest.mark.parametrize( "color", [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], ) def test_radviz_color(self, iris, color): - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", color=color) + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", color=color + ) # skip Circle drawn as ticks patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches[:10], facecolors=color, mapping=df["Name"][:10]) def test_radviz_color_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10]) def test_radviz_colors_handles(self): - from pandas.plotting import radviz - colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]] df = DataFrame( {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]} ) - ax = radviz(df, "Name", color=colors) + ax = plotting.radviz(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, facecolors=colors) @@ -471,15 +444,11 @@ def test_get_standard_colors_random_seed(self): def test_get_standard_colors_consistency(self): # GH17525 # Make sure it produces the same colors every time it's called - from pandas.plotting._matplotlib.style import get_standard_colors - color1 = get_standard_colors(1, color_type="random") color2 = get_standard_colors(1, color_type="random") assert color1 == color2 def test_get_standard_colors_default_num_colors(self): - from pandas.plotting._matplotlib.style import get_standard_colors - # Make sure the default color_types returns the specified amount color1 = get_standard_colors(1, color_type="default") color2 = get_standard_colors(9, color_type="default") @@ -509,11 +478,7 @@ def test_get_standard_colors_no_appending(self): # Make sure not to add more colors so that matplotlib can cycle # correctly. 
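# (Illustrative note, not part of the original test: when an explicit `color` sequence is passed, get_standard_colors is expected to return a list of the same length rather than padding it, e.g. requesting num_colors=1 with the 5-entry mpl.cm.gnuplot(range(5)) array used below must still yield all 5 colors, which is what the assertions that follow check.)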
- from matplotlib import cm - - from pandas.plotting._matplotlib.style import get_standard_colors - - color_before = cm.gnuplot(range(5)) + color_before = mpl.cm.gnuplot(range(5)) color_after = get_standard_colors(1, color=color_before) assert len(color_after) == len(color_before) @@ -521,7 +486,7 @@ def test_get_standard_colors_no_appending(self): np.random.default_rng(2).standard_normal((48, 4)), columns=list("ABCD") ) - color_list = cm.gnuplot(np.linspace(0, 1, 16)) + color_list = mpl.cm.gnuplot(np.linspace(0, 1, 16)) p = df.A.plot.bar(figsize=(16, 7), color=color_list) assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor() @@ -546,9 +511,7 @@ def test_dictionary_color(self, kind): def test_bar_plot(self): # GH38947 # Test bar plot with string and int index - from matplotlib.text import Text - - expected = [Text(0, 0, "0"), Text(1, 0, "Total")] + expected = [mpl.text.Text(0, 0, "0"), mpl.text.Text(1, 0, "Total")] df = DataFrame( { @@ -565,11 +528,12 @@ def test_bar_plot(self): def test_barh_plot_labels_mixed_integer_string(self): # GH39126 # Test barh plot with string and integer at the same column - from matplotlib.text import Text - df = DataFrame([{"word": 1, "value": 0}, {"word": "knowledge", "value": 2}]) plot_barh = df.plot.barh(x="word", legend=None) - expected_yticklabels = [Text(0, 0, "1"), Text(0, 1, "knowledge")] + expected_yticklabels = [ + mpl.text.Text(0, 0, "1"), + mpl.text.Text(0, 1, "knowledge"), + ] assert all( actual.get_text() == expected.get_text() for actual, expected in zip( @@ -649,8 +613,8 @@ def test_externally_shared_axes(self): # Create data df = DataFrame( { - "a": np.random.default_rng(2).standard_normal(1000), - "b": np.random.default_rng(2).standard_normal(1000), + "a": np.random.default_rng(2).standard_normal(10), + "b": np.random.default_rng(2).standard_normal(10), } ) @@ -707,9 +671,7 @@ def test_plot_bar_axis_units_timestamp_conversion(self): def test_bar_plt_xaxis_intervalrange(self): # GH 38969 # Ensure IntervalIndex x-axis produces a bar plot as expected - from matplotlib.text import Text - - expected = [Text(0, 0, "([0, 1],)"), Text(1, 0, "([1, 2],)")] + expected = [mpl.text.Text(0, 0, "([0, 1],)"), mpl.text.Text(1, 0, "([1, 2],)")] s = Series( [1, 2], index=[interval_range(0, 2, closed="both")], diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 54f09c7007330..279d9a18d8df7 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -33,9 +33,14 @@ get_y_axis, ) +from pandas.tseries.offsets import CustomBusinessDay + mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") +from pandas.plotting._matplotlib.converter import DatetimeConverter +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def ts(): @@ -49,7 +54,7 @@ def ts(): @pytest.fixture def series(): return Series( - range(20), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(20)] + range(10), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(10)] ) @@ -192,28 +197,24 @@ def test_area_sharey_dont_overwrite(self, ts): assert get_y_axis(ax1).joined(ax1, ax2) assert get_y_axis(ax2).joined(ax1, ax2) - plt.close(fig) def test_label(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(label="LABEL", legend=True, ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_none(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) 
_check_legend_labels(ax, labels=[""]) - mpl.pyplot.close("all") def test_label_ser_name(self): s = Series([1, 2], name="NAME") _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) _check_legend_labels(ax, labels=["NAME"]) - mpl.pyplot.close("all") def test_label_ser_name_override(self): s = Series([1, 2], name="NAME") @@ -221,7 +222,6 @@ def test_label_ser_name_override(self): _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, label="LABEL", ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_ser_name_override_dont_draw(self): s = Series([1, 2], name="NAME") @@ -231,7 +231,6 @@ def test_label_ser_name_override_dont_draw(self): assert ax.get_legend() is None # Hasn't been drawn ax.legend() # draw it _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_boolean(self): # GH 23719 @@ -344,9 +343,7 @@ def test_rotation_30(self): _check_ticks_props(axes, xrot=30) def test_irregular_datetime(self): - from pandas.plotting._matplotlib.converter import DatetimeConverter - - rng = date_range("1/1/2000", "3/1/2000") + rng = date_range("1/1/2000", "1/15/2000") rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -453,9 +450,9 @@ def test_pie_nan(self): def test_df_series_secondary_legend(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # primary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() @@ -467,28 +464,12 @@ def test_df_series_secondary_legend(self): assert ax.get_yaxis().get_visible() assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_with_axes(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # primary -> secondary (with passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are drawn on left ax - # left and right axis must be visible - _check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) - assert ax.get_yaxis().get_visible() - assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, ax=ax) @@ -500,29 +481,12 @@ def test_df_series_secondary_legend_both(self): assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # secondary -> secondary (with passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(secondary_y=True, ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are 
drawn on left ax - # left axis must be invisible and right axis must be visible - expected = ["a (right)", "b (right)", "c (right)", "x (right)"] - _check_legend_labels(ax.left_ax, expected) - assert not ax.left_ax.get_yaxis().get_visible() - assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis_2(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (with passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, mark_right=False, ax=ax) @@ -537,17 +501,12 @@ def test_df_series_secondary_legend_both_with_axis_2(self): @pytest.mark.parametrize( "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")] ) - def test_secondary_logy(self, input_logy, expected_scale): - # GH 25545 - s1 = Series(np.random.default_rng(2).standard_normal(100)) - s2 = Series(np.random.default_rng(2).standard_normal(100)) - - # GH 24980 - ax1 = s1.plot(logy=input_logy) - ax2 = s2.plot(secondary_y=True, logy=input_logy) - + @pytest.mark.parametrize("secondary_kwarg", [{}, {"secondary_y": True}]) + def test_secondary_logy(self, input_logy, expected_scale, secondary_kwarg): + # GH 25545, GH 24980 + s1 = Series(np.random.default_rng(2).standard_normal(10)) + ax1 = s1.plot(logy=input_logy, **secondary_kwarg) assert ax1.get_yscale() == expected_scale - assert ax2.get_yscale() == expected_scale def test_plot_fails_with_dupe_color_and_style(self): x = Series(np.random.default_rng(2).standard_normal(2)) @@ -673,6 +632,9 @@ def test_errorbar_asymmetrical(self): expected = (err.T * np.array([-1, 1])) + s.to_numpy().reshape(-1, 1) tm.assert_numpy_array_equal(result, expected) + def test_errorbar_asymmetrical_error(self): + # GH9536 + s = Series(np.arange(10), name="x") msg = ( "Asymmetrical error bars should be provided " f"with the shape \\(2, {len(s)}\\)" @@ -759,8 +721,6 @@ def test_series_grid_settings(self): @pytest.mark.parametrize("c", ["r", "red", "green", "#FF0000"]) def test_standard_colors(self, c): - from pandas.plotting._matplotlib.style import get_standard_colors - result = get_standard_colors(1, color=c) assert result == [c] @@ -774,12 +734,8 @@ def test_standard_colors(self, c): assert result == [c] * 3 def test_standard_colors_all(self): - from matplotlib import colors - - from pandas.plotting._matplotlib.style import get_standard_colors - # multiple colors like mediumaquamarine - for c in colors.cnames: + for c in mpl.colors.cnames: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -793,7 +749,7 @@ def test_standard_colors_all(self): assert result == [c] * 3 # single letter colors like k - for c in colors.ColorConverter.colors: + for c in mpl.colors.ColorConverter.colors: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -821,8 +777,6 @@ def test_time_series_plot_color_kwargs(self): _check_colors(ax.get_lines(), linecolors=["green"]) def test_time_series_plot_color_with_empty_kwargs(self): - import matplotlib as mpl - def_colors = _unpack_cycler(mpl.rcParams) index = date_range("1/1/2000", periods=12) s = Series(np.arange(1, 13), index=index) @@ -851,8 +805,6 @@ def test_xtick_barPlot(self): def test_custom_business_day_freq(self): # GH7222 - from pandas.tseries.offsets import CustomBusinessDay - s = Series( range(100, 
121), index=pd.bdate_range( diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py index 665bda15724fd..f9c89e0a7893f 100644 --- a/pandas/tests/plotting/test_style.py +++ b/pandas/tests/plotting/test_style.py @@ -2,7 +2,8 @@ from pandas import Series -pytest.importorskip("matplotlib") +mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") from pandas.plotting._matplotlib.style import get_standard_colors @@ -18,11 +19,8 @@ class TestGetStandardColors: ], ) def test_default_colors_named_from_prop_cycle(self, num_colors, expected): - import matplotlib as mpl - from matplotlib.pyplot import cycler - mpl_params = { - "axes.prop_cycle": cycler(color=["red", "green", "blue"]), + "axes.prop_cycle": plt.cycler(color=["red", "green", "blue"]), } with mpl.rc_context(rc=mpl_params): result = get_standard_colors(num_colors=num_colors) @@ -39,11 +37,8 @@ def test_default_colors_named_from_prop_cycle(self, num_colors, expected): ], ) def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected): - import matplotlib as mpl - from matplotlib.pyplot import cycler - mpl_params = { - "axes.prop_cycle": cycler(color="bgry"), + "axes.prop_cycle": plt.cycler(color="bgry"), } with mpl.rc_context(rc=mpl_params): result = get_standard_colors(num_colors=num_colors) @@ -74,11 +69,8 @@ def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected) ], ) def test_default_colors_named_undefined_prop_cycle(self, num_colors, expected_name): - import matplotlib as mpl - import matplotlib.colors as mcolors - with mpl.rc_context(rc={}): - expected = [mcolors.to_hex(x) for x in expected_name] + expected = [mpl.colors.to_hex(x) for x in expected_name] result = get_standard_colors(num_colors=num_colors) assert result == expected From 728cfcbc71f2bc9e3d35c4d5269cd8e66183c46f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:09:09 -1000 Subject: [PATCH 58/66] CLN: Plotting tests 2 (#58910) * Refactor test_converter * Clean test_boxplot_method * Clean test_hist_method --- pandas/tests/plotting/test_boxplot_method.py | 29 +++----- pandas/tests/plotting/test_converter.py | 28 ++------ pandas/tests/plotting/test_hist_method.py | 71 ++++++++------------ 3 files changed, 44 insertions(+), 84 deletions(-) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 573f95eed15ef..4916963ab7c87 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -38,9 +38,7 @@ def _check_ax_limits(col, ax): class TestDataFramePlots: def test_stacked_boxplot_set_axis(self): # GH2980 - import matplotlib.pyplot as plt - - n = 80 + n = 30 df = DataFrame( { "Clinical": np.random.default_rng(2).choice([0, 1, 2, 3], n), @@ -51,10 +49,10 @@ def test_stacked_boxplot_set_axis(self): ) ax = df.plot(kind="bar", stacked=True) assert [int(x.get_text()) for x in ax.get_xticklabels()] == df.index.to_list() - ax.set_xticks(np.arange(0, 80, 10)) + ax.set_xticks(np.arange(0, n, 10)) plt.draw() # Update changes assert [int(x.get_text()) for x in ax.get_xticklabels()] == list( - np.arange(0, 80, 10) + np.arange(0, n, 10) ) @pytest.mark.slow @@ -227,12 +225,12 @@ def test_boxplot_numeric_data(self): # GH 22799 df = DataFrame( { - "a": date_range("2012-01-01", periods=100), - "b": np.random.default_rng(2).standard_normal(100), - "c": np.random.default_rng(2).standard_normal(100) + 2, - "d":
date_range("2012-01-01", periods=100).astype(str), - "e": date_range("2012-01-01", periods=100, tz="UTC"), - "f": timedelta_range("1 days", periods=100), + "a": date_range("2012-01-01", periods=10), + "b": np.random.default_rng(2).standard_normal(10), + "c": np.random.default_rng(2).standard_normal(10) + 2, + "d": date_range("2012-01-01", periods=10).astype(str), + "e": date_range("2012-01-01", periods=10, tz="UTC"), + "f": timedelta_range("1 days", periods=10), } ) ax = df.plot(kind="box") @@ -282,8 +280,6 @@ def test_color_kwd(self, colors_kwd, expected): def test_colors_in_theme(self, scheme, expected): # GH: 40769 df = DataFrame(np.random.default_rng(2).random((10, 2))) - import matplotlib.pyplot as plt - plt.style.use(scheme) result = df.plot.box(return_type="dict") for k, v in expected.items(): @@ -334,8 +330,8 @@ def test_plot_xlabel_ylabel(self, vert): def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) - df1 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) - df2 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + df1 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD")) + df2 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD")) xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) @@ -344,7 +340,6 @@ def test_plot_box(self, vert): for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - mpl.pyplot.close() @pytest.mark.parametrize("vert", [True, False]) def test_boxplot_xlabel_ylabel(self, vert): @@ -374,7 +369,6 @@ def test_boxplot_group_xlabel_ylabel(self, vert): for subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel - mpl.pyplot.close() @pytest.mark.parametrize("vert", [True, False]) def test_boxplot_group_no_xlabel_ylabel(self, vert): @@ -389,7 +383,6 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): for subplot in ax: target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() assert target_label == pprint_thing(["group"]) - mpl.pyplot.close() class TestDataFrameGroupByPlots: diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index d4774a5cd0439..6a1777b098de0 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -34,15 +34,11 @@ Second, ) -try: - from pandas.plotting._matplotlib import converter -except ImportError: - # try / except, rather than skip, to avoid internal refactoring - # causing an improper skip - pass - -pytest.importorskip("matplotlib.pyplot") +plt = pytest.importorskip("matplotlib.pyplot") dates = pytest.importorskip("matplotlib.dates") +units = pytest.importorskip("matplotlib.units") + +from pandas.plotting._matplotlib import converter @pytest.mark.single_cpu @@ -79,30 +75,22 @@ def test_dont_register_by_default(self): assert subprocess.check_call(call) == 0 def test_registering_no_warning(self): - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Set to the "warn" state, in case this isn't the first test run register_matplotlib_converters() ax.plot(s.index, s.values) - plt.close() def test_pandas_plots_register(self): - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) # Set to the "warn" state, in case this isn't the first test run with tm.assert_produces_warning(None) as w: s.plot() - try: - assert len(w) == 0 - finally: 
- plt.close() + assert len(w) == 0 def test_matplotlib_formatters(self): - units = pytest.importorskip("matplotlib.units") - # Can't make any assertion about the start state. # We we check that toggling converters off removes it, and toggling it # on restores it. @@ -113,8 +101,6 @@ def test_matplotlib_formatters(self): assert Timestamp in units.registry def test_option_no_warning(self): - pytest.importorskip("matplotlib.pyplot") - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() @@ -126,12 +112,8 @@ def test_option_no_warning(self): register_matplotlib_converters() with cf.option_context("plotting.matplotlib.register_converters", False): ax.plot(s.index, s.values) - plt.close() def test_registry_resets(self): - units = pytest.importorskip("matplotlib.units") - dates = pytest.importorskip("matplotlib.dates") - # make a copy, to reset to original = dict(units.registry) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 511c1dd7761d5..65cb62917dc4e 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -27,6 +27,9 @@ ) mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") + +from pandas.plotting._matplotlib.hist import _grouped_hist @pytest.fixture @@ -119,18 +122,13 @@ def test_hist_layout_with_by_shape(self, hist_df): _check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) def test_hist_no_overlap(self): - from matplotlib.pyplot import ( - gcf, - subplot, - ) - x = Series(np.random.default_rng(2).standard_normal(2)) y = Series(np.random.default_rng(2).standard_normal(2)) - subplot(121) + plt.subplot(121) x.hist() - subplot(122) + plt.subplot(122) y.hist() - fig = gcf() + fig = plt.gcf() axes = fig.axes assert len(axes) == 2 @@ -140,10 +138,8 @@ def test_hist_by_no_extra_plots(self, hist_df): assert len(mpl.pyplot.get_fignums()) == 1 def test_plot_fails_when_ax_differs_from_figure(self, ts): - from pylab import figure - - fig1 = figure() - fig2 = figure() + fig1 = plt.figure(1) + fig2 = plt.figure(2) ax1 = fig1.add_subplot(111) msg = "passed axis not bound to passed figure" with pytest.raises(AssertionError, match=msg): @@ -169,8 +165,8 @@ def test_histtype_argument(self, histtype, expected): ) def test_hist_with_legend(self, by, expected_axes_num, expected_layout): # GH 6279 - Series histogram can have a legend - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" # Use default_axes=True when plotting method generate subplots itself @@ -181,8 +177,8 @@ def test_hist_with_legend(self, by, expected_axes_num, expected_layout): @pytest.mark.parametrize("by", [None, "b"]) def test_hist_with_legend_raises(self, by): # GH 6279 - Series histogram with legend and label raises - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" with pytest.raises(ValueError, match="Cannot use both legend and label"): @@ -331,12 +327,10 @@ def test_hist_df_legacy_layout_labelsize_rot(self, frame_or_series): @pytest.mark.slow def test_hist_df_legacy_rectangles(self): - from matplotlib.patches import 
Rectangle - ser = Series(range(10)) ax = ser.hist(cumulative=True, bins=4, density=True) # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-1].get_height(), 1.0) @pytest.mark.slow @@ -431,12 +425,12 @@ def test_hist_layout_error(self): # GH 9351 def test_tight_layout(self): - df = DataFrame(np.random.default_rng(2).standard_normal((100, 2))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) df[2] = to_datetime( np.random.default_rng(2).integers( 812419200000000000, 819331200000000000, - size=100, + size=10, dtype=np.int64, ) ) @@ -504,7 +498,7 @@ def test_hist_column_order_unchanged(self, column, expected): def test_histtype_argument(self, histtype, expected): # GH23992 Verify functioning of histtype argument df = DataFrame( - np.random.default_rng(2).integers(1, 10, size=(100, 2)), columns=["a", "b"] + np.random.default_rng(2).integers(1, 10, size=(10, 2)), columns=["a", "b"] ) ax = df.hist(histtype=histtype) _check_patches_all_filled(ax, filled=expected) @@ -519,9 +513,9 @@ def test_hist_with_legend(self, by, column): if by is not None: expected_labels = [expected_labels] * 2 - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -545,9 +539,9 @@ def test_hist_with_legend(self, by, column): @pytest.mark.parametrize("column", [None, "b"]) def test_hist_with_legend_raises(self, by, column): # GH 6279 - DataFrame histogram with legend and label raises - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -586,7 +580,7 @@ def test_hist_df_with_nonnumerics_no_bins(self): def test_hist_secondary_legend(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # primary -> secondary @@ -602,7 +596,7 @@ def test_hist_secondary_legend(self): def test_hist_secondary_secondary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> secondary _, ax = mpl.pyplot.subplots() @@ -617,7 +611,7 @@ def test_hist_secondary_secondary(self): def test_hist_secondary_primary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> primary _, ax = mpl.pyplot.subplots() @@ -632,7 +626,6 @@ def test_hist_secondary_primary(self): def test_hist_with_nans_and_weights(self): # GH 48884 - mpl_patches = pytest.importorskip("matplotlib.patches") df = DataFrame( [[np.nan, 0.2, 0.3], [0.4, np.nan, np.nan], [0.7, 0.8, 0.9]], columns=list("abc"), @@ -643,12 +636,12 @@ def test_hist_with_nans_and_weights(self): _, ax0 = mpl.pyplot.subplots() df.plot.hist(ax=ax0, weights=weights) - rects = [x for x in ax0.get_children() if isinstance(x, mpl_patches.Rectangle)] + rects = [x for x in ax0.get_children() if isinstance(x, 
mpl.patches.Rectangle)] heights = [rect.get_height() for rect in rects] _, ax1 = mpl.pyplot.subplots() no_nan_df.plot.hist(ax=ax1, weights=no_nan_weights) no_nan_rects = [ - x for x in ax1.get_children() if isinstance(x, mpl_patches.Rectangle) + x for x in ax1.get_children() if isinstance(x, mpl.patches.Rectangle) ] no_nan_heights = [rect.get_height() for rect in no_nan_rects] assert all(h0 == h1 for h0, h1 in zip(heights, no_nan_heights)) @@ -663,8 +656,6 @@ def test_hist_with_nans_and_weights(self): class TestDataFrameGroupByPlots: def test_grouped_hist_legacy(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(10) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -716,10 +707,6 @@ def test_grouped_hist_legacy_single_key(self): _check_ticks_props(axes, xrot=30) def test_grouped_hist_legacy_grouped_hist_kwargs(self): - from matplotlib.patches import Rectangle - - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -748,14 +735,14 @@ def test_grouped_hist_legacy_grouped_hist_kwargs(self): ) # height of last bin (index 5) must be 1.0 for ax in axes.ravel(): - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [ + x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle) + ] height = rects[-1].get_height() tm.assert_almost_equal(height, 1.0) _check_ticks_props(axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) def test_grouped_hist_legacy_grouped_hist(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -773,8 +760,6 @@ def test_grouped_hist_legacy_grouped_hist(self): _check_ax_scales(axes, yaxis="log") def test_grouped_hist_legacy_external_err(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( From 4526ea70f443af489959ba9d1233a51150109b10 Mon Sep 17 00:00:00 2001 From: undermyumbrella1 <120079323+undermyumbrella1@users.noreply.github.com> Date: Wed, 5 Jun 2024 02:20:13 +0800 Subject: [PATCH 59/66] Update compute_dict_like to get all columns (#58452) * Update compute_dict_like to get all columns * Add tests * Update rst * Remove newline from rst * Project the columns before converting to series group by * retrigger doc build * Account for 1d/series projection result * Declare var before assignment * Remove if condition * Add test to test agg list funcs --------- Co-authored-by: Kei Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/apply.py | 35 +++++- .../tests/groupby/aggregate/test_aggregate.py | 118 ++++++++++++++++++ 3 files changed, 149 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2d4fafc3c4e1e..7f5c879c0d9f5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -535,6 +535,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in 
wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) +- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 25836e967e948..33f506235870d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -471,8 +471,30 @@ def compute_dict_like( keys += [key] * len(key_data) results += key_data - else: + elif is_groupby: # key used for column selection and output + + df = selected_obj + results, keys = [], [] + for key, how in func.items(): + cols = df[key] + + if cols.ndim == 1: + series_list = [obj._gotitem(key, ndim=1, subset=cols)] + else: + series_list = [] + for index in range(cols.shape[1]): + col = cols.iloc[:, index] + + series = obj._gotitem(key, ndim=1, subset=col) + series_list.append(series) + + for series in series_list: + result = getattr(series, op_name)(how, **kwargs) + results.append(result) + keys.append(key) + + else: results = [ getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs) for key, how in func.items() @@ -496,11 +518,14 @@ def wrap_results_dict_like( is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data] if all(is_ndframe): - results = dict(zip(result_index, result_data)) + results = [result for result in result_data if not result.empty] keys_to_use: Iterable[Hashable] - keys_to_use = [k for k in result_index if not results[k].empty] + keys_to_use = [k for k, v in zip(result_index, result_data) if not v.empty] # Have to check, if at least one DataFrame is not empty. 
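# (Illustrative note mirroring the tests added in this patch: for gb.agg({"b": [], "c": ["var"]}) the "b" entry produces an empty frame, so "b" is dropped from keys_to_use; only when every entry is empty does the fallback below restore the full result_index.)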
- keys_to_use = keys_to_use if keys_to_use != [] else result_index + if keys_to_use == []: + keys_to_use = result_index + results = result_data + if selected_obj.ndim == 2: # keys are columns, so we can preserve names ktu = Index(keys_to_use) @@ -509,7 +534,7 @@ def wrap_results_dict_like( axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 result = concat( - {k: results[k] for k in keys_to_use}, + results, axis=axis, keys=keys_to_use, ) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3362d6209af6d..1f140063fd84b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1662,3 +1662,121 @@ def func(x): msg = "length must not be 0" with pytest.raises(ValueError, match=msg): df.groupby("A", observed=False).agg(func) + + +def test_groupby_aggregation_duplicate_columns_single_dict_value(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": "sum"}) + + expected = DataFrame( + [[7, 9], [5, 6]], columns=["c", "c"], index=Index([1, 2], name="a") + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_multiple_dict_values(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": ["sum", "min", "max", "min"]}) + + expected = DataFrame( + [[7, 3, 4, 3, 9, 4, 5, 4], [5, 5, 5, 5, 6, 6, 6, 6]], + columns=MultiIndex( + levels=[["c"], ["sum", "min", "max"]], + codes=[[0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 1, 0, 1, 2, 1]], + ), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_some_empty_result(): + # GH#55041 + df = DataFrame( + [ + [1, 9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, -546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=["a", "b", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": [], "c": ["var"]}) + + expected = DataFrame( + [[1.509268e11, 30944844.5], [2.178000e03, 0.0]], + columns=MultiIndex(levels=[["c"], ["var"]], codes=[[0, 0], [0, 0]]), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): "min"}) + + expected = DataFrame( + [[-9843, 9], [244, -33]], + columns=MultiIndex(levels=[["level1.1"], ["level2.2"]], codes=[[0, 0], [0, 0]]), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], 
["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): ["min", "max"]}) + + expected = DataFrame( + [[-9843, 940, 9, 546], [244, 244, -33, -33]], + columns=MultiIndex( + levels=[["level1.1"], ["level2.2"], ["min", "max"]], + codes=[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1]], + ), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) From cc85e2c46a23be9fe57595e021024783590c682d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 09:33:06 -1000 Subject: [PATCH 60/66] CLN: Matplotlib imports in style (#58921) Use import_optional_dependency in style.py --- pandas/io/formats/style.py | 150 +++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 83 deletions(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 69021eb2656f6..8212b50594842 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -4,7 +4,6 @@ from __future__ import annotations -from contextlib import contextmanager import copy from functools import partial import operator @@ -56,7 +55,6 @@ if TYPE_CHECKING: from collections.abc import ( - Generator, Hashable, Sequence, ) @@ -84,22 +82,6 @@ from pandas import ExcelWriter -try: - import matplotlib as mpl - import matplotlib.pyplot as plt - - has_mpl = True -except ImportError: - has_mpl = False - - -@contextmanager -def _mpl(func: Callable) -> Generator[tuple[Any, Any], None, None]: - if has_mpl: - yield plt, mpl - else: - raise ImportError(f"{func.__name__} requires matplotlib.") - #### # Shared Doc Strings @@ -3832,61 +3814,61 @@ def _background_gradient( else: # else validate gmap against the underlying data gmap = _validate_apply_axis_arg(gmap, "gmap", float, data) - with _mpl(Styler.background_gradient) as (_, _matplotlib): - smin = np.nanmin(gmap) if vmin is None else vmin - smax = np.nanmax(gmap) if vmax is None else vmax - rng = smax - smin - # extend lower / upper bounds, compresses color range - norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + smin = np.nanmin(gmap) if vmin is None else vmin + smax = np.nanmax(gmap) if vmax is None else vmax + rng = smax - smin + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.background_gradient requires matplotlib." + ) + # extend lower / upper bounds, compresses color range + norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + + if cmap is None: + rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]](norm(gmap)) + else: + rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) + + def relative_luminance(rgba) -> float: + """ + Calculate relative luminance of a color. 
+ + The calculation adheres to the W3C standards + (https://www.w3.org/WAI/GL/wiki/Relative_luminance) + + Parameters + ---------- + color : rgb or rgba tuple + + Returns + ------- + float + The relative luminance as a value from 0 to 1 + """ + r, g, b = ( + x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 + for x in rgba[:3] + ) + return 0.2126 * r + 0.7152 * g + 0.0722 * b - if cmap is None: - rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]]( - norm(gmap) + def css(rgba, text_only) -> str: + if not text_only: + dark = relative_luminance(rgba) < text_color_threshold + text_color = "#f1f1f1" if dark else "#000000" + return ( + f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" + f"color: {text_color};" ) else: - rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) - - def relative_luminance(rgba) -> float: - """ - Calculate relative luminance of a color. - - The calculation adheres to the W3C standards - (https://www.w3.org/WAI/GL/wiki/Relative_luminance) - - Parameters - ---------- - color : rgb or rgba tuple - - Returns - ------- - float - The relative luminance as a value from 0 to 1 - """ - r, g, b = ( - x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 - for x in rgba[:3] - ) - return 0.2126 * r + 0.7152 * g + 0.0722 * b - - def css(rgba, text_only) -> str: - if not text_only: - dark = relative_luminance(rgba) < text_color_threshold - text_color = "#f1f1f1" if dark else "#000000" - return ( - f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" - f"color: {text_color};" - ) - else: - return f"color: {_matplotlib.colors.rgb2hex(rgba)};" + return f"color: {_matplotlib.colors.rgb2hex(rgba)};" - if data.ndim == 1: - return [css(rgba, text_only) for rgba in rgbas] - else: - return DataFrame( - [[css(rgba, text_only) for rgba in row] for row in rgbas], - index=data.index, - columns=data.columns, - ) + if data.ndim == 1: + return [css(rgba, text_only) for rgba in rgbas] + else: + return DataFrame( + [[css(rgba, text_only) for rgba in row] for row in rgbas], + index=data.index, + columns=data.columns, + ) def _highlight_between( @@ -4124,20 +4106,22 @@ def css_calc(x, left: float, right: float, align: str, color: str | list | tuple rgbas = None if cmap is not None: # use the matplotlib colormap input - with _mpl(Styler.bar) as (_, _matplotlib): - cmap = ( - _matplotlib.colormaps[cmap] - if isinstance(cmap, str) - else cmap # assumed to be a Colormap instance as documented - ) - norm = _matplotlib.colors.Normalize(left, right) - rgbas = cmap(norm(values)) - if data.ndim == 1: - rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas] - else: - rgbas = [ - [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas - ] + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.bar requires matplotlib." 
+ ) + cmap = ( + _matplotlib.colormaps[cmap] + if isinstance(cmap, str) + else cmap # assumed to be a Colormap instance as documented + ) + norm = _matplotlib.colors.Normalize(left, right) + rgbas = cmap(norm(values)) + if data.ndim == 1: + rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas] + else: + rgbas = [ + [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas + ] assert isinstance(align, str) # mypy: should now be in [left, right, mid, zero] if data.ndim == 1: From 92207feacb0e2bc398bbe30042b4d24578df12e9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 12:13:31 -1000 Subject: [PATCH 61/66] CLN: Remove downcast keyword in MultiIndex.fillna (#58923) --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8b11f8087db94..a8c05ab78c98e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1677,7 +1677,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: # (previously declared in base class "IndexOpsMixin") _duplicated = duplicated # type: ignore[misc] - def fillna(self, value, downcast=None): + def fillna(self, value): """ fillna is not implemented for MultiIndex """ From 9e7abc84a11b283b71a0c8f012ca04eb79b8b882 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 12:14:46 -1000 Subject: [PATCH 62/66] CLN: Plotting testing asserters (#58922) --- pandas/_testing/__init__.py | 2 -- pandas/_testing/asserters.py | 22 ------------------- pandas/tests/plotting/common.py | 38 ++++++++++++++++++++++----------- 3 files changed, 26 insertions(+), 36 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index d35242ada21e9..85d03ea17bf42 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -57,7 +57,6 @@ assert_indexing_slices_equivalent, assert_interval_array_equal, assert_is_sorted, - assert_is_valid_plot_return_object, assert_metadata_equivalent, assert_numpy_array_equal, assert_period_array_equal, @@ -558,7 +557,6 @@ def shares_memory(left, right) -> bool: "assert_indexing_slices_equivalent", "assert_interval_array_equal", "assert_is_sorted", - "assert_is_valid_plot_return_object", "assert_metadata_equivalent", "assert_numpy_array_equal", "assert_period_array_equal", diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 430840711122a..1127a4512643c 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -429,28 +429,6 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes") -> None: return None -def assert_is_valid_plot_return_object(objs) -> None: - from matplotlib.artist import Artist - from matplotlib.axes import Axes - - if isinstance(objs, (Series, np.ndarray)): - if isinstance(objs, Series): - objs = objs._values - for el in objs.ravel(): - msg = ( - "one of 'objs' is not a matplotlib Axes instance, " - f"type encountered {type(el).__name__!r}" - ) - assert isinstance(el, (Axes, dict)), msg - else: - msg = ( - "objs is neither an ndarray of Artist instances nor a single " - "ArtistArtist instance, tuple, or dict, 'objs' is a " - f"{type(objs).__name__!r}" - ) - assert isinstance(objs, (Artist, tuple, dict)), msg - - def assert_is_sorted(seq) -> None: """Assert that the sequence is sorted.""" if isinstance(seq, (Index, Series)): diff --git 
a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 5a46cdcb051b6..d8c49d6d47f28 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -76,8 +76,6 @@ def _check_data(xp, rs): xp : matplotlib Axes object rs : matplotlib Axes object """ - import matplotlib.pyplot as plt - xp_lines = xp.get_lines() rs_lines = rs.get_lines() @@ -87,8 +85,6 @@ def _check_data(xp, rs): rsdata = rsl.get_xydata() tm.assert_almost_equal(xpdata, rsdata) - plt.close("all") - def _check_visible(collections, visible=True): """ @@ -495,6 +491,28 @@ def get_y_axis(ax): return ax._shared_axes["y"] +def assert_is_valid_plot_return_object(objs) -> None: + from matplotlib.artist import Artist + from matplotlib.axes import Axes + + if isinstance(objs, (Series, np.ndarray)): + if isinstance(objs, Series): + objs = objs._values + for el in objs.reshape(-1): + msg = ( + "one of 'objs' is not a matplotlib Axes instance, " + f"type encountered {type(el).__name__!r}" + ) + assert isinstance(el, (Axes, dict)), msg + else: + msg = ( + "objs is neither an ndarray of Artist instances nor a single " + "ArtistArtist instance, tuple, or dict, 'objs' is a " + f"{type(objs).__name__!r}" + ) + assert isinstance(objs, (Artist, tuple, dict)), msg + + def _check_plot_works(f, default_axes=False, **kwargs): """ Create plot and ensure that plot return object is valid. @@ -530,15 +548,11 @@ def _check_plot_works(f, default_axes=False, **kwargs): gen_plots = _gen_two_subplots ret = None - try: - fig = kwargs.get("figure", plt.gcf()) - plt.clf() - - for ret in gen_plots(f, fig, **kwargs): - tm.assert_is_valid_plot_return_object(ret) + fig = kwargs.get("figure", plt.gcf()) + fig.clf() - finally: - plt.close(fig) + for ret in gen_plots(f, fig, **kwargs): + assert_is_valid_plot_return_object(ret) return ret From f7590e6f5a31742142f9a06e144a0022058a572b Mon Sep 17 00:00:00 2001 From: Abel Tavares <121238257+abeltavares@users.noreply.github.com> Date: Wed, 5 Jun 2024 17:59:02 +0100 Subject: [PATCH 63/66] BUG/df.agg-with-df-with-missing-values-results-in-IndexError (#58864) * fix * improve and fix bug entry * update --------- Co-authored-by: 121238257 --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/apply.py | 8 +++--- .../tests/groupby/aggregate/test_aggregate.py | 26 +++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7f5c879c0d9f5..802a9fd7e2099 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -456,6 +456,7 @@ Datetimelike - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) +- Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` 
incorrectly truncating those scalars (:issue:`56410`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 33f506235870d..2039386c4766c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1850,11 +1850,13 @@ def relabel_result( com.get_callable_name(f) if not isinstance(f, str) else f for f in fun ] col_idx_order = Index(s.index).get_indexer(fun) - s = s.iloc[col_idx_order] - + valid_idx = col_idx_order != -1 + if valid_idx.any(): + s = s.iloc[col_idx_order[valid_idx]] # assign the new user-provided "named aggregation" as index names, and reindex # it based on the whole user-provided names. - s.index = reordered_indexes[idx : idx + len(fun)] + if not s.empty: + s.index = reordered_indexes[idx : idx + len(fun)] reordered_result_in_dict[col] = s.reindex(columns) idx = idx + len(fun) return reordered_result_in_dict diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 1f140063fd84b..26602baedb594 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -62,6 +62,32 @@ def test_agg_ser_multi_key(df): tm.assert_series_equal(results, expected) +def test_agg_with_missing_values(): + # GH#58810 + missing_df = DataFrame( + { + "nan": [np.nan, np.nan, np.nan, np.nan], + "na": [pd.NA, pd.NA, pd.NA, pd.NA], + "nat": [pd.NaT, pd.NaT, pd.NaT, pd.NaT], + "none": [None, None, None, None], + "values": [1, 2, 3, 4], + } + ) + + result = missing_df.agg(x=("nan", "min"), y=("na", "min"), z=("values", "sum")) + + expected = DataFrame( + { + "nan": [np.nan, np.nan, np.nan], + "na": [np.nan, np.nan, np.nan], + "values": [np.nan, np.nan, 10.0], + }, + index=["x", "y", "z"], + ) + + tm.assert_frame_equal(result, expected) + + def test_groupby_aggregation_mixed_dtype(): # GH 6212 expected = DataFrame( From 811e6a43d192a4425e3c12b0a68efce96acef6cb Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Jun 2024 22:30:12 +0530 Subject: [PATCH 64/66] DOC: fix SA01 for pandas.read_spss (#58934) --- ci/code_checks.sh | 1 - pandas/io/spss.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 039700f306e03..33e072a26c94a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -474,7 +474,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.qcut PR07,SA01" \ - -i "pandas.read_spss SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 2c464cc7e90c4..313ffa79cbd09 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -52,6 +52,14 @@ def read_spss( DataFrame DataFrame based on the SPSS file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_sas : Read an SAS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
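# A hedged usage sketch for ``read_spss`` as documented above: the reader
# is backed by the optional ``pyreadstat`` dependency, and the file and
# column names here are hypothetical, chosen only for illustration.
import pandas as pd

# Load two columns from an SPSS file, converting labelled values to
# pandas Categoricals (the default behavior).
df = pd.read_spss(
    "survey.sav", usecols=["age", "income"], convert_categoricals=True
)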
+ Examples -------- >>> df = pd.read_spss("spss_data.sav") # doctest: +SKIP From 5fd883ac4f021440ef1b95704c0a193ad3bbc382 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Jun 2024 22:30:45 +0530 Subject: [PATCH 65/66] DOC: fix PR07 for pandas.pivot (#58935) --- ci/code_checks.sh | 1 - pandas/core/reshape/pivot.py | 153 +++++++++++++++++++++++++++++++++-- 2 files changed, 146 insertions(+), 8 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 33e072a26c94a..705f01d74a972 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -469,7 +469,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.merge_asof PR07,RT03" \ -i "pandas.merge_ordered PR07" \ -i "pandas.period_range RT03,SA01" \ - -i "pandas.pivot PR07" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 86da19f13bacf..ff993c039bf9a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -11,10 +11,6 @@ import numpy as np from pandas._libs import lib -from pandas.util._decorators import ( - Appender, - Substitution, -) from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( @@ -29,7 +25,6 @@ ) import pandas.core.common as com -from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper from pandas.core.indexes.api import ( Index, @@ -656,8 +651,6 @@ def _convert_by(by): return by -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot"], indents=1) def pivot( data: DataFrame, *, @@ -665,6 +658,152 @@ def pivot( index: IndexLabel | lib.NoDefault = lib.no_default, values: IndexLabel | lib.NoDefault = lib.no_default, ) -> DataFrame: + """ + Return reshaped DataFrame organized by given index / column values. + + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. This function does not support data + aggregation, multiple values will result in a MultiIndex in the + columns. See the :ref:`User Guide ` for more on reshaping. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + columns : str or object or a list of str + Column to use to make new frame's columns. + index : str or object or a list of str, optional + Column to use to make new frame's index. If not given, uses existing index. + values : str, object or a list of the previous, optional + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + Returns + ------- + DataFrame + Returns reshaped DataFrame. + + Raises + ------ + ValueError: + When there are any `index`, `columns` combinations with multiple + values. `DataFrame.pivot_table` when you need to aggregate. + + See Also + -------- + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... 
"foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... "zoo": ["x", "y", "z", "q", "w", "t"], + ... } + ... ) + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + 5 two C 6 t + + >>> df.pivot(index="foo", columns="bar", values="baz") + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar")["baz"] + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar", values=["baz", "zoo"]) + baz zoo + bar A B C A B C + foo + one 1 2 3 x y z + two 4 5 6 q w t + + You could also assign a list of column names or a list of index names. + + >>> df = pd.DataFrame( + ... { + ... "lev1": [1, 1, 1, 2, 2, 2], + ... "lev2": [1, 1, 2, 1, 1, 2], + ... "lev3": [1, 2, 1, 2, 1, 2], + ... "lev4": [1, 2, 3, 4, 5, 6], + ... "values": [0, 1, 2, 3, 4, 5], + ... } + ... ) + >>> df + lev1 lev2 lev3 lev4 values + 0 1 1 1 1 0 + 1 1 1 2 2 1 + 2 1 2 1 3 2 + 3 2 1 2 4 3 + 4 2 1 1 5 4 + 5 2 2 2 6 5 + + >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") + lev2 1 2 + lev3 1 2 1 2 + lev1 + 1 0.0 1.0 2.0 NaN + 2 4.0 3.0 NaN 5.0 + + >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") + lev3 1 2 + lev1 lev2 + 1 1 0.0 1.0 + 2 2.0 NaN + 2 1 4.0 3.0 + 2 NaN 5.0 + + A ValueError is raised if there are any duplicates. + + >>> df = pd.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two"], + ... "bar": ["A", "A", "B", "C"], + ... "baz": [1, 2, 3, 4], + ... } + ... ) + >>> df + foo bar baz + 0 one A 1 + 1 one A 2 + 2 two B 3 + 3 two C 4 + + Notice that the first two rows are the same for our `index` + and `columns` arguments. + + >>> df.pivot(index="foo", columns="bar", values="baz") + Traceback (most recent call last): + ... + ValueError: Index contains duplicate entries, cannot reshape + """ columns_listlike = com.convert_to_list_like(columns) # If columns is None we will create a MultiIndex level with None as name From 0e90f66a88e0d4b16b143f1e563ec5fa3565469b Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Jun 2024 22:31:11 +0530 Subject: [PATCH 66/66] DOC: fix PR07 for pandas.merge_ordered (#58936) --- ci/code_checks.sh | 1 - pandas/core/reshape/merge.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 705f01d74a972..ade7173cf0344 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -467,7 +467,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.json_normalize RT03,SA01" \ -i "pandas.merge PR07" \ -i "pandas.merge_asof PR07,RT03" \ - -i "pandas.merge_ordered PR07" \ -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e6e84c2135b82..ddf6bd3c70988 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -316,7 +316,9 @@ def merge_ordered( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label or list Field names to join on. Must be found in both DataFrames. left_on : label or list, or array-like