From 0e2099089406c6d5616bf9e8872154fee4960ea7 Mon Sep 17 00:00:00 2001
From: Xiao Yuan
Date: Wed, 20 Nov 2024 22:47:33 +0800
Subject: [PATCH 1/5] BUG: fix to_datetime with np.datetime64[ps] giving wrong
 conversion (#60342)

---
 doc/source/whatsnew/v3.0.0.rst                       |  1 +
 .../_libs/src/vendored/numpy/datetime/np_datetime.c  | 11 ++++++-----
 pandas/tests/tools/test_to_datetime.py               |  9 +++++++++
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 7da2f968b900b..5f7aed8ed9786 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -626,6 +626,7 @@ Datetimelike
 - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`)
 - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`)
 - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
+- Bug in :meth:`to_datetime` wrongly converting when ``arg`` is a ``np.datetime64`` object with unit ``ps`` (:issue:`60341`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

 Timedelta
diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
index cc65f34d6b6fe..9a022095feee9 100644
--- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
+++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
@@ -660,11 +660,12 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base,
     perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000;

     set_datetimestruct_days(extract_unit(&dt, perday), out);
-    out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60);
-    out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60);
-    out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000);
-    out->us = (npy_int32)extract_unit(&dt, 1000LL);
-    out->ps = (npy_int32)(dt * 1000);
+    out->hour =
+        (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60 * 60);
+    out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60);
+    out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000);
+    out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000);
+    out->ps = (npy_int32)(dt);
     break;

   case NPY_FR_fs:
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index a9d3c235f63f6..b73839f406a29 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -3668,3 +3668,12 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir
         to_datetime(vec, format="mixed")
     with pytest.raises(ValueError, match=msg):
         DatetimeIndex(vec)
+
+
+def test_to_datetime_wrapped_datetime64_ps():
+    # GH#60341
+    result = to_datetime([np.datetime64(1901901901901, "ps")])
+    expected = DatetimeIndex(
+        ["1970-01-01 00:00:01.901901901"], dtype="datetime64[ns]", freq=None
+    )
+    tm.assert_index_equal(result, expected)

From ff53ca1486dd10b0f2883987f082a79f3a55c409 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Thu, 21 Nov 2024 00:21:30 +0530
Subject: [PATCH 2/5] DOC: fix SA01 for pandas.errors.AttributeConflictWarning
 (#60367)

* DOC: fix SA01 for pandas.errors.AttributeConflictWarning

* DOC: fix SA01 for pandas.errors.AttributeConflictWarning
---
 ci/code_checks.sh         | 1 -
 pandas/errors/__init__.py | 6 ++++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 53690e9b78b8a..fe45ce02d5e44 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -114,7 +114,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.resample.Resampler.std SA01" \
         -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.var SA01" \
-        -i "pandas.errors.AttributeConflictWarning SA01" \
         -i "pandas.errors.ChainedAssignmentError SA01" \
         -i "pandas.errors.DuplicateLabelError SA01" \
         -i "pandas.errors.IntCastingNaNError SA01" \
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
index cacbfb49c311f..84f7239c6549d 100644
--- a/pandas/errors/__init__.py
+++ b/pandas/errors/__init__.py
@@ -672,6 +672,12 @@ class AttributeConflictWarning(Warning):
     name than the existing index on an HDFStore or attempting to append an index
     with a different frequency than the existing index on an HDFStore.

+    See Also
+    --------
+    HDFStore : Dict-like IO interface for storing pandas objects in PyTables.
+    DataFrame.to_hdf : Write the contained data to an HDF5 file using HDFStore.
+    read_hdf : Read from an HDF5 file into a DataFrame.
+
     Examples
     --------
     >>> idx1 = pd.Index(["a", "b"], name="name1")

From 24df015ad4ada9f58e6874b54737e579a62a7a53 Mon Sep 17 00:00:00 2001
From: ensalada-de-pechuga <127701043+ensalada-de-pechuga@users.noreply.github.com>
Date: Thu, 21 Nov 2024 03:55:02 +0900
Subject: [PATCH 3/5] DOC: Fix docstrings for SeriesGroupBy monotonic and nth
 (#60375)

* fix docstrings and remove from code_checks.sh

* fix SeriesGroupBy.is_monotonic_decreasing See Also section (decreasing -> increasing)

* remove DataFrameGroupBy.nth from code_checks.sh

---------

Co-authored-by: root
---
 ci/code_checks.sh              |  4 ----
 pandas/core/groupby/generic.py | 10 ++++++++++
 pandas/core/groupby/groupby.py | 13 -------------
 3 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index fe45ce02d5e44..633d767c63037 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -92,15 +92,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \
         -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
         -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
         -i "pandas.core.resample.Resampler.get_group RT03,SA01" \
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 5ba382bf66bb7..35ec09892ede6 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1443,6 +1443,11 @@ def is_monotonic_increasing(self) -> Series:
         -------
         Series

+        See Also
+        --------
+        SeriesGroupBy.is_monotonic_decreasing : Return whether each group's values
+            are monotonically decreasing.
+
         Examples
         --------
         >>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"])
@@ -1462,6 +1467,11 @@ def is_monotonic_decreasing(self) -> Series:
         -------
         Series

+        See Also
+        --------
+        SeriesGroupBy.is_monotonic_increasing : Return whether each group's values
+            are monotonically increasing.
+
         Examples
         --------
         >>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"])
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 9c30132347111..ad23127ad449f 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3983,19 +3983,6 @@ def nth(self) -> GroupByNthSelector:
         'all' or 'any'; this is equivalent to calling dropna(how=dropna) before the
         groupby.

-        Parameters
-        ----------
-        n : int, slice or list of ints and slices
-            A single nth value for the row or a list of nth values or slices.
-
-            .. versionchanged:: 1.4.0
-                Added slice and lists containing slices.
-                Added index notation.
-
-        dropna : {'any', 'all', None}, default None
-            Apply the specified dropna operation before counting which row is
-            the nth row. Only supported if n is an int.
-
         Returns
         -------
         Series or DataFrame

From 72ab3fdc7a3530b885a466db88bbb38de8d5c6b9 Mon Sep 17 00:00:00 2001
From: Ivruix <52746744+Ivruix@users.noreply.github.com>
Date: Wed, 20 Nov 2024 22:00:08 +0300
Subject: [PATCH 4/5] DOC: fix docstring validation errors for
 pandas.Series.dt.freq (#60377)

* Added docs for Series.dt.freq and removed from ci/code_checks.sh

* Fix code style
---
 ci/code_checks.sh                |  1 -
 pandas/core/indexes/accessors.py | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 633d767c63037..379f7cb5f037d 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -73,7 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Period.freq GL08" \
         -i "pandas.Period.ordinal GL08" \
         -i "pandas.RangeIndex.from_range PR01,SA01" \
-        -i "pandas.Series.dt.freq GL08" \
         -i "pandas.Series.dt.unit GL08" \
         -i "pandas.Series.pad PR01,SA01" \
         -i "pandas.Timedelta.max PR02" \
diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py
index e2dc71f68a65b..c404323a1168c 100644
--- a/pandas/core/indexes/accessors.py
+++ b/pandas/core/indexes/accessors.py
@@ -373,6 +373,28 @@ def to_pydatetime(self) -> Series:

     @property
     def freq(self):
+        """
+        Tries to return a string representing a frequency generated by infer_freq.
+
+        Returns None if it can't autodetect the frequency.
+
+        See Also
+        --------
+        Series.dt.to_period : Cast to PeriodArray/PeriodIndex at a particular
+            frequency.
+
+        Examples
+        --------
+        >>> ser = pd.Series(["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"])
+        >>> ser = pd.to_datetime(ser)
+        >>> ser.dt.freq
+        'D'
+
+        >>> ser = pd.Series(["2022-01-01", "2024-01-01", "2026-01-01", "2028-01-01"])
+        >>> ser = pd.to_datetime(ser)
+        >>> ser.dt.freq
+        '2YS-JAN'
+        """
         return self._get_values().inferred_freq

     def isocalendar(self) -> DataFrame:

From 1c986d6213904fd7d9acc5622dc91d029d3f1218 Mon Sep 17 00:00:00 2001
From: Joseph Kleinhenz
Date: Wed, 20 Nov 2024 23:52:11 -0800
Subject: [PATCH 5/5] ENH: expose `to_pandas_kwargs` in `read_parquet` with
 pyarrow backend (#59654)

Co-authored-by: Joseph Kleinhenz
Co-authored-by: Xiao Yuan
Co-authored-by: Joris Van den Bossche
---
 doc/source/whatsnew/v3.0.0.rst  |  1 +
 pandas/io/_util.py              |  5 ++++-
 pandas/io/parquet.py            | 22 ++++++++++++++++++++--
 pandas/tests/io/test_parquet.py | 14 ++++++++++++++
 4 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 5f7aed8ed9786..fbf2bed550c85 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -54,6 +54,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :func:`read_parquet` accepts ``to_pandas_kwargs``, which are forwarded to :meth:`pyarrow.Table.to_pandas`, enabling additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as Python dictionaries (:issue:`56842`)
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
index 21203ad036fc6..9778a404e23e0 100644
--- a/pandas/io/_util.py
+++ b/pandas/io/_util.py
@@ -60,9 +60,12 @@ def arrow_table_to_pandas(
     table: pyarrow.Table,
     dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
     null_to_int64: bool = False,
+    to_pandas_kwargs: dict | None = None,
 ) -> pd.DataFrame:
     pa = import_optional_dependency("pyarrow")

+    to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs
+
     types_mapper: type[pd.ArrowDtype] | None | Callable
     if dtype_backend == "numpy_nullable":
         mapping = _arrow_dtype_mapping()
@@ -80,5 +83,5 @@ def arrow_table_to_pandas(
     else:
         raise NotImplementedError

-    df = table.to_pandas(types_mapper=types_mapper)
+    df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
     return df
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 116f228faca93..6a5a83088e986 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -242,6 +242,7 @@ def read(
         dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
         storage_options: StorageOptions | None = None,
         filesystem=None,
+        to_pandas_kwargs: dict[str, Any] | None = None,
         **kwargs,
     ) -> DataFrame:
         kwargs["use_pandas_metadata"] = True
@@ -266,7 +267,11 @@ def read(
                 "make_block is deprecated",
                 DeprecationWarning,
             )
-            result = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
+            result = arrow_table_to_pandas(
+                pa_table,
+                dtype_backend=dtype_backend,
+                to_pandas_kwargs=to_pandas_kwargs,
+            )

             if pa_table.schema.metadata:
                 if b"PANDAS_ATTRS" in pa_table.schema.metadata:
@@ -347,6 +352,7 @@ def read(
         filters=None,
         storage_options: StorageOptions | None = None,
         filesystem=None,
+        to_pandas_kwargs: dict | None = None,
         **kwargs,
     ) -> DataFrame:
         parquet_kwargs: dict[str, Any] = {}
@@ -362,6 +368,10 @@ def read(
             raise NotImplementedError(
                 "filesystem is not implemented for the fastparquet engine."
             )
+        if to_pandas_kwargs is not None:
+            raise NotImplementedError(
+                "to_pandas_kwargs is not implemented for the fastparquet engine."
+            )
         path = stringify_path(path)
         handles = None
         if is_fsspec_url(path):
@@ -452,7 +462,7 @@ def to_parquet(
         .. versionadded:: 2.1.0

     kwargs
-        Additional keyword arguments passed to the engine
+        Additional keyword arguments passed to the engine.

     Returns
     -------
@@ -491,6 +501,7 @@ def read_parquet(
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
     filesystem: Any = None,
     filters: list[tuple] | list[list[tuple]] | None = None,
+    to_pandas_kwargs: dict | None = None,
     **kwargs,
 ) -> DataFrame:
     """
@@ -564,6 +575,12 @@ def read_parquet(

         .. versionadded:: 2.1.0

+    to_pandas_kwargs : dict | None, default None
+        Keyword arguments to pass through to :meth:`pyarrow.Table.to_pandas`
+        when ``engine="pyarrow"``.
+
+        .. versionadded:: 3.0.0
+
     **kwargs
         Any additional kwargs are passed to the engine.

@@ -636,5 +653,6 @@ def read_parquet(
         storage_options=storage_options,
         dtype_backend=dtype_backend,
         filesystem=filesystem,
+        to_pandas_kwargs=to_pandas_kwargs,
         **kwargs,
     )
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 31cdb6626d237..7919bb956dc7a 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1172,6 +1172,20 @@ def test_non_nanosecond_timestamps(self, temp_file):
         )
         tm.assert_frame_equal(result, expected)

+    def test_maps_as_pydicts(self, pa):
+        pyarrow = pytest.importorskip("pyarrow", "13.0.0")
+
+        schema = pyarrow.schema(
+            [("foo", pyarrow.map_(pyarrow.string(), pyarrow.int64()))]
+        )
+        df = pd.DataFrame([{"foo": {"A": 1}}, {"foo": {"B": 2}}])
+        check_round_trip(
+            df,
+            pa,
+            write_kwargs={"schema": schema},
+            read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}},
+        )
+

 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full, request):
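
For illustration, a minimal usage sketch of the ``to_pandas_kwargs`` parameter
introduced in PATCH 5/5. The file path and the map-typed column are hypothetical,
and pyarrow >= 13.0.0 is assumed for ``maps_as_pydicts``, matching the
``importorskip`` pin in the test above:

    import pandas as pd

    # The dict is forwarded verbatim to pyarrow.Table.to_pandas(); here
    # "strict" converts Parquet map columns to plain Python dicts and raises
    # if duplicate keys would make the conversion lossy.
    df = pd.read_parquet(
        "data.parquet",  # hypothetical input file, for illustration only
        engine="pyarrow",  # the fastparquet engine raises NotImplementedError
        to_pandas_kwargs={"maps_as_pydicts": "strict"},
    )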