Skip to content

Commit

Permalink
Enh: adding automated inferencing of format %Y-%m-%dT%H:%M in pyarrow (#1292)
Browse files Browse the repository at this point in the history

* add pyarrow time parsing with %H:%M format

* add test for format %H:%M

* add time format mapping

* add start and end characters matching

Co-authored-by: Francesco Bruzzesi <[email protected]>

---------

Co-authored-by: Francesco Bruzzesi <[email protected]>
  • Loading branch information
raisadz and FBruzzesi authored Nov 1, 2024
1 parent d5feb6f commit 6b5e2bf
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 11 deletions.
15 changes: 12 additions & 3 deletions narwhals/_arrow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,9 @@ def convert_str_slice_to_int_slice(
# Regex building blocks for the date, separator, time and timezone components
# of a datetime string. Each pattern wraps its match in a named group.
DATE_RE = r"(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4})"
SEP_RE = r"(?P<sep>\s|T)"
# Seconds are optional, so both HH:MM:SS and HH:MM are accepted.
# An AM/PM period suffix is not handled.
TIME_RE = r"(?P<time>\d{2}:\d{2}(?::\d{2})?)"
# Fully anchored variants, used to tell the two time layouts apart.
HMS_RE = r"^(?P<hms>\d{2}:\d{2}:\d{2})$"
HM_RE = r"^(?P<hm>\d{2}:\d{2})$"
# Matches 'Z', '+02:00' and '+0200'. A bare hour offset such as '+02' is NOT
# matched: the pattern requires two more digits after the hour.
TZ_RE = r"(?P<tz>Z|[+-]\d{2}:?\d{2})"
# A date optionally followed by separator, time and timezone, anchored at the
# end of the string.
FULL_RE = rf"{DATE_RE}{SEP_RE}?{TIME_RE}?{TZ_RE}?$"

Expand All @@ -354,6 +356,10 @@ def convert_str_slice_to_int_slice(
(DMY_RE, "%d-%m-%Y"),
(MDY_RE, "%m-%d-%Y"),
)
# Candidate time layouts: each anchored pattern is paired with the strptime
# format it implies. _parse_time_format tries the pairs in this order.
TIME_FORMATS = (
(HMS_RE, "%H:%M:%S"),
(HM_RE, "%H:%M"),
)


def parse_datetime_format(arr: pa.StringArray) -> str:
Expand Down Expand Up @@ -418,5 +424,8 @@ def _parse_date_format(arr: pa.Array) -> str:
def _parse_time_format(arr: pa.Array) -> str:
    """Infer the strptime time format shared by every element of ``arr``.

    Each ``(pattern, format)`` pair in ``TIME_FORMATS`` is tried in order and
    the format of the first pattern that matches all elements is returned.

    Args:
        arr: Array of time strings.
            NOTE(review): presumably the ``time`` component extracted
            upstream by the caller -- confirm.

    Returns:
        The matching format (``"%H:%M:%S"`` or ``"%H:%M"``), or ``""`` when no
        candidate pattern matches every element.
    """
    import pyarrow.compute as pc  # ignore-banned-import

    for time_rgx, time_fmt in TIME_FORMATS:
        matches = pc.extract_regex(arr, pattern=time_rgx)
        # A null entry means the pattern did not match that element, so the
        # format is accepted only when every entry is valid.
        if pc.all(matches.is_valid()).as_py():
            return time_fmt
    return ""
52 changes: 44 additions & 8 deletions tests/expr_and_series/str/to_datetime_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,29 @@ def test_to_datetime_series(constructor_eager: ConstructorEager) -> None:
assert str(result) == expected


def test_to_datetime_infer_fmt(constructor: Constructor) -> None:
@pytest.mark.parametrize(
("data", "expected", "expected_cudf"),
[
(
{"a": ["2020-01-01T12:34:56"]},
"2020-01-01 12:34:56",
"2020-01-01T12:34:56.000000000",
),
(
{"a": ["2020-01-01T12:34"]},
"2020-01-01 12:34:00",
"2020-01-01T12:34:00.000000000",
),
],
)
def test_to_datetime_infer_fmt(
constructor: Constructor,
data: dict[str, list[str]],
expected: str,
expected_cudf: str,
) -> None:
if "cudf" in str(constructor): # pragma: no cover
expected = "2020-01-01T12:34:56.000000000"
else:
expected = "2020-01-01 12:34:56"
expected = expected_cudf

result = (
nw.from_native(constructor(data))
Expand All @@ -63,11 +81,29 @@ def test_to_datetime_infer_fmt(constructor: Constructor) -> None:
assert str(result) == expected


def test_to_datetime_series_infer_fmt(constructor_eager: ConstructorEager) -> None:
@pytest.mark.parametrize(
("data", "expected", "expected_cudf"),
[
(
{"a": ["2020-01-01T12:34:56"]},
"2020-01-01 12:34:56",
"2020-01-01T12:34:56.000000000",
),
(
{"a": ["2020-01-01T12:34"]},
"2020-01-01 12:34:00",
"2020-01-01T12:34:00.000000000",
),
],
)
def test_to_datetime_series_infer_fmt(
constructor_eager: ConstructorEager,
data: dict[str, list[str]],
expected: str,
expected_cudf: str,
) -> None:
if "cudf" in str(constructor_eager): # pragma: no cover
expected = "2020-01-01T12:34:56.000000000"
else:
expected = "2020-01-01 12:34:56"
expected = expected_cudf

result = (
nw.from_native(constructor_eager(data), eager_only=True)["a"].str.to_datetime()
Expand Down

0 comments on commit 6b5e2bf

Please sign in to comment.