From 77ef1b7a3b4d32aaf049028976a00643d5b0c2f5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:06:57 -0700 Subject: [PATCH 1/2] Raise NotImplementedError in to_datetime with dayfirst without infer_format --- python/cudf/cudf/core/tools/datetimes.py | 11 ++++---- python/cudf/cudf/tests/test_datetime.py | 34 ++++++++++++++++++------ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f736e055163..a3f4bacf206 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -353,15 +353,16 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): format=format, ) else: - if infer_datetime_format and format is None: + if format is None: + if not infer_datetime_format and dayfirst: + raise NotImplementedError( + f"{dayfirst=} not implemented " + f"when {format=} and {infer_datetime_format=}." + ) format = column.datetime.infer_format( element=col.element_indexing(0), dayfirst=dayfirst, ) - elif format is None: - format = column.datetime.infer_format( - element=col.element_indexing(0) - ) return col.as_datetime_column( dtype=_unit_dtype_map[unit], format=format, diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4c20258ae67..e8ee38dad6a 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -617,22 +617,40 @@ def test_datetime_dataframe(): @pytest.mark.parametrize("infer_datetime_format", [True, False]) def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): pd_data = data + is_string_data = False if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) + is_string_data = gd_data.ndim == 1 and gd_data.dtype.kind == "O" else: if type(pd_data).__module__ == np.__name__: gd_data = cp.array(pd_data) else: gd_data = pd_data + is_string_data = isinstance(gd_data, list) and isinstance( + next(iter(gd_data), None), str + ) - expected = pd.to_datetime( - pd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - actual = cudf.to_datetime( - gd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - - assert_eq(actual, expected) + if dayfirst and not infer_datetime_format and is_string_data: + # Note: pandas<2.0 also does not respect dayfirst=True correctly + # for object data + with pytest.raises(NotImplementedError): + cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + else: + expected = pd.to_datetime( + pd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + actual = cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + assert_eq(actual, expected) @pytest.mark.parametrize( From 940313f6ec78742e5917d87bb905b600b60023d0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Sep 2023 11:25:37 -0700 Subject: [PATCH 2/2] Fix condition --- python/cudf/cudf/tests/test_datetime.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index e8ee38dad6a..a0b4c8ec667 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -620,7 +620,11 @@ def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): is_string_data = False if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) - is_string_data = gd_data.ndim == 1 and gd_data.dtype.kind == "O" + is_string_data = ( + gd_data.ndim == 1 + and not gd_data.empty + and gd_data.dtype.kind == "O" + ) else: if type(pd_data).__module__ == np.__name__: gd_data = cp.array(pd_data)