From f349cb2a944316d0a10153cc05d86db855d43562 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 29 Oct 2024 12:49:41 +0100 Subject: [PATCH] RFC, feat: infer datetime format for pyarrow backend (#1195) * feat: infer datetime format for pyarrow * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix for date format * use first 10 non null values only to infer format * test with null --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- narwhals/_arrow/series.py | 4 +- narwhals/_arrow/utils.py | 85 +++++++++++++++++++ tests/expr_and_series/str/to_datetime_test.py | 64 +++++++++++--- 3 files changed, 139 insertions(+), 14 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index be1377b4d..70009df43 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -12,6 +12,7 @@ from narwhals._arrow.utils import floordiv_compat from narwhals._arrow.utils import narwhals_to_native_dtype from narwhals._arrow.utils import native_to_narwhals_dtype +from narwhals._arrow.utils import parse_datetime_format from narwhals._arrow.utils import validate_column_comparand from narwhals.utils import Implementation from narwhals.utils import generate_temporary_column_name @@ -1115,8 +1116,7 @@ def to_datetime(self: Self, format: str | None) -> ArrowSeries: # noqa: A002 import pyarrow.compute as pc # ignore-banned-import() if format is None: - msg = "`format` is required for pyarrow backend." 
- raise ValueError(msg) + format = parse_datetime_format(self._arrow_series._native_series) return self._arrow_series._from_native_series( pc.strptime(self._arrow_series._native_series, format=format, unit="us") diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 7f6fa6558..bc8d17010 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -335,3 +335,88 @@ def convert_str_slice_to_int_slice( stop = columns.index(str_slice.stop) + 1 if str_slice.stop is not None else None step = str_slice.step return (start, stop, step) + + +# Regex for date, time, separator and timezone components +DATE_RE = r"(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4})" +SEP_RE = r"(?P<sep>\s|T)" +TIME_RE = r"(?P