Enh: adding automated inferencing of format %Y-%m-%dT%H:%M in pyarrow (#1292)

* add pyarrow time parsing with %H:%M format
* add test for format %H:%M
* add time format mapping
* add start and end characters matching

Co-authored-by: Francesco Bruzzesi <[email protected]>

---------

Co-authored-by: Francesco Bruzzesi <[email protected]>
narwhals-dev · Nov 1, 2024 · 6b5e2bf · 6b5e2bf
1 parent d5feb6f
commit 6b5e2bf
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 11 deletions.
diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py
@@ -340,7 +340,9 @@ def convert_str_slice_to_int_slice(
 # Regex for date, time, separator and timezone components
 DATE_RE = r"(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4})"
 SEP_RE = r"(?P<sep>\s|T)"
-TIME_RE = r"(?P<time>\d{2}:\d{2}:\d{2})"  # \s*(?P<period>[AP]M)?)?
+TIME_RE = r"(?P<time>\d{2}:\d{2}(?::\d{2})?)"  # \s*(?P<period>[AP]M)?)?
+HMS_RE = r"^(?P<hms>\d{2}:\d{2}:\d{2})$"
+HM_RE = r"^(?P<hm>\d{2}:\d{2})$"
 TZ_RE = r"(?P<tz>Z|[+-]\d{2}:?\d{2})"  # Matches 'Z', '+02:00', '+0200'; hour-only offsets such as '+02' do NOT match (minutes are required).
 FULL_RE = rf"{DATE_RE}{SEP_RE}?{TIME_RE}?{TZ_RE}?$"
 
@@ -354,6 +356,10 @@ def convert_str_slice_to_int_slice(
     (DMY_RE, "%d-%m-%Y"),
     (MDY_RE, "%m-%d-%Y"),
 )
+TIME_FORMATS = (
+    (HMS_RE, "%H:%M:%S"),
+    (HM_RE, "%H:%M"),
+)
 
 
 def parse_datetime_format(arr: pa.StringArray) -> str:
@@ -418,5 +424,8 @@ def _parse_date_format(arr: pa.Array) -> str:
 def _parse_time_format(arr: pa.Array) -> str:
     import pyarrow.compute as pc  # ignore-banned-import
 
-    matches = pc.extract_regex(arr, pattern=TIME_RE)
-    return "%H:%M:%S" if pc.all(matches.is_valid()).as_py() else ""
+    for time_rgx, time_fmt in TIME_FORMATS:
+        matches = pc.extract_regex(arr, pattern=time_rgx)
+        if pc.all(matches.is_valid()).as_py():
+            return time_fmt
+    return ""
diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py
@@ -47,11 +47,29 @@ def test_to_datetime_series(constructor_eager: ConstructorEager) -> None:
     assert str(result) == expected
 
 
-def test_to_datetime_infer_fmt(constructor: Constructor) -> None:
+@pytest.mark.parametrize(
+    ("data", "expected", "expected_cudf"),
+    [
+        (
+            {"a": ["2020-01-01T12:34:56"]},
+            "2020-01-01 12:34:56",
+            "2020-01-01T12:34:56.000000000",
+        ),
+        (
+            {"a": ["2020-01-01T12:34"]},
+            "2020-01-01 12:34:00",
+            "2020-01-01T12:34:00.000000000",
+        ),
+    ],
+)
+def test_to_datetime_infer_fmt(
+    constructor: Constructor,
+    data: dict[str, list[str]],
+    expected: str,
+    expected_cudf: str,
+) -> None:
     if "cudf" in str(constructor):  # pragma: no cover
-        expected = "2020-01-01T12:34:56.000000000"
-    else:
-        expected = "2020-01-01 12:34:56"
+        expected = expected_cudf
 
     result = (
         nw.from_native(constructor(data))
@@ -63,11 +81,29 @@ def test_to_datetime_infer_fmt(constructor: Constructor) -> None:
     assert str(result) == expected
 
 
-def test_to_datetime_series_infer_fmt(constructor_eager: ConstructorEager) -> None:
+@pytest.mark.parametrize(
+    ("data", "expected", "expected_cudf"),
+    [
+        (
+            {"a": ["2020-01-01T12:34:56"]},
+            "2020-01-01 12:34:56",
+            "2020-01-01T12:34:56.000000000",
+        ),
+        (
+            {"a": ["2020-01-01T12:34"]},
+            "2020-01-01 12:34:00",
+            "2020-01-01T12:34:00.000000000",
+        ),
+    ],
+)
+def test_to_datetime_series_infer_fmt(
+    constructor_eager: ConstructorEager,
+    data: dict[str, list[str]],
+    expected: str,
+    expected_cudf: str,
+) -> None:
     if "cudf" in str(constructor_eager):  # pragma: no cover
-        expected = "2020-01-01T12:34:56.000000000"
-    else:
-        expected = "2020-01-01 12:34:56"
+        expected = expected_cudf
 
     result = (
         nw.from_native(constructor_eager(data), eager_only=True)["a"].str.to_datetime()