Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stats for datetimes #3007

Open
wants to merge 51 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
79790e0
compute stats for datetimes
polinaeterna Jul 31, 2024
12f78cc
Merge branch 'main' into datetime-stats
polinaeterna Jul 31, 2024
851ec1b
fix typing
polinaeterna Jul 31, 2024
3347c13
add testcase
polinaeterna Aug 1, 2024
0340b54
moar tests: column with nulls and all nulls column
polinaeterna Aug 5, 2024
4cd6e0d
Merge branch 'main' into datetime-stats
polinaeterna Aug 5, 2024
434b2d8
add datetime to worker
polinaeterna Aug 8, 2024
2604587
add test
polinaeterna Aug 8, 2024
913f812
include timezone aware
polinaeterna Aug 9, 2024
06c1ae5
Merge branch 'main' into datetime-stats
polinaeterna Aug 12, 2024
7f7ecab
Merge branch 'main' into datetime-stats
polinaeterna Oct 14, 2024
d517393
refactor
polinaeterna Oct 14, 2024
7046d8b
fix
polinaeterna Oct 14, 2024
945dff0
do not typecheck dateutil
polinaeterna Oct 14, 2024
d91d365
Merge branch 'main' into datetime-stats
polinaeterna Dec 20, 2024
bdec2e4
fix
polinaeterna Dec 23, 2024
f9ffe82
more tests
polinaeterna Dec 23, 2024
d2c37c6
fix string to datetime conversion: add format inferring
polinaeterna Dec 26, 2024
658719e
fix style
polinaeterna Dec 26, 2024
5c2d94a
fix check for datetime
polinaeterna Dec 27, 2024
359a30b
minor
polinaeterna Dec 27, 2024
0744e07
mypy
polinaeterna Dec 27, 2024
53e2100
add testcase
polinaeterna Jan 6, 2025
a61108f
Merge branch 'main' into datetime-stats
polinaeterna Jan 7, 2025
c63e70e
Merge branch 'main' into datetime-stats
polinaeterna Jan 8, 2025
70197aa
Merge branch 'datetime-stats' of github.com:huggingface/datasets-serv…
polinaeterna Jan 8, 2025
3df6264
fix?
polinaeterna Jan 8, 2025
812bf36
add example to docs
polinaeterna Jan 8, 2025
c68efb7
fix + add tz string (%Z) to formats
polinaeterna Jan 9, 2025
351ef5c
test for string timezone
polinaeterna Jan 9, 2025
787ad3b
try to debug
polinaeterna Jan 10, 2025
5163500
test identify_datetime_format
polinaeterna Jan 10, 2025
033e29e
test datetime.strptime
polinaeterna Jan 13, 2025
349b651
test
polinaeterna Jan 13, 2025
6c60c27
Update services/worker/src/worker/statistics_utils.py
polinaeterna Jan 15, 2025
db10500
keep original timezone for string dates
polinaeterna Jan 15, 2025
8794b7a
let polars identify datetime format by itself
polinaeterna Jan 15, 2025
e0e7c91
do not display +0000 in timestamps (if timezone is UTC)
polinaeterna Jan 15, 2025
8afade1
remove utils test
polinaeterna Jan 15, 2025
341676c
refactor: identify datetime format manually only when polars failed
polinaeterna Jan 15, 2025
3b5d950
style
polinaeterna Jan 16, 2025
21977db
log formats in error message
polinaeterna Jan 16, 2025
0ee76bf
update openapi specs
polinaeterna Jan 16, 2025
b7fee0b
fallback to string stats if datetime didn't work
polinaeterna Jan 16, 2025
6a76dd9
fix test
polinaeterna Jan 16, 2025
f3eefea
update docs
polinaeterna Jan 16, 2025
a79eb79
Merge branch 'main' into datetime-stats
polinaeterna Jan 16, 2025
1df95ff
fix openapi specs
polinaeterna Jan 16, 2025
2f27846
Merge branch 'main' into datetime-stats
polinaeterna Jan 17, 2025
f9d7a8a
fix polars timezone switching
polinaeterna Jan 17, 2025
720aab9
Merge branch 'main' into datetime-stats
polinaeterna Jan 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
AudioColumn,
BoolColumn,
ClassLabelColumn,
DatetimeColumn,
FloatColumn,
ImageColumn,
IntColumn,
Expand All @@ -57,7 +58,15 @@ class SplitDescriptiveStatisticsResponse(TypedDict):


# All column wrapper classes that split descriptive-statistics computation supports.
# (Diff artifact removed: the old single-line Union had been left above the new one.)
SupportedColumns = Union[
    ClassLabelColumn,
    IntColumn,
    FloatColumn,
    StringColumn,
    BoolColumn,
    ListColumn,
    AudioColumn,
    ImageColumn,
    DatetimeColumn,
]


Expand Down Expand Up @@ -215,29 +224,34 @@ def _column_from_feature(
return ListColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if isinstance(dataset_feature, dict):
if dataset_feature.get("_type") == "ClassLabel":
_type = dataset_feature.get("_type")
if _type == "ClassLabel":
return ClassLabelColumn(
feature_name=dataset_feature_name, n_samples=num_examples, feature_dict=dataset_feature
)

if dataset_feature.get("_type") == "Audio":
if _type == "Audio":
return AudioColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("_type") == "Image":
if _type == "Image":
return ImageColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("_type") == "Value":
if dataset_feature.get("dtype") in INTEGER_DTYPES:
if _type == "Value":
dtype = dataset_feature.get("dtype", "")
if dtype in INTEGER_DTYPES:
return IntColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") in FLOAT_DTYPES:
if dtype in FLOAT_DTYPES:
return FloatColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") in STRING_DTYPES:
if dtype in STRING_DTYPES:
return StringColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") == "bool":
if dtype == "bool":
return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dtype.startswith("timestamp"):
return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples)
return None

columns: list[SupportedColumns] = []
Expand All @@ -249,7 +263,7 @@ def _column_from_feature(
if not columns:
raise NoSupportedFeaturesError(
"No columns for statistics computation found. Currently supported feature types are: "
f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, list/Sequence and bool. "
f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, Image, Audio, list/Sequence, datetime and bool. "
)

column_names_str = ", ".join([column.name for column in columns])
Expand Down
120 changes: 117 additions & 3 deletions services/worker/src/worker/statistics_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.
import datetime
import enum
import io
import logging
Expand Down Expand Up @@ -50,24 +51,41 @@ class ColumnType(str, enum.Enum):
STRING_TEXT = "string_text"
AUDIO = "audio"
IMAGE = "image"
DATETIME = "datetime"


class Histogram(TypedDict):
    """Histogram of a numerical column."""

    hist: list[int]  # count of values falling into each bin
    bin_edges: list[Union[int, float]]  # bin boundaries; one more edge than bins


class DatetimeHistogram(TypedDict):
    """Histogram of a datetime column."""

    hist: list[int]  # count of values falling into each bin
    bin_edges: list[str]  # edges are string representations of dates


class NumericalStatisticsItem(TypedDict):
    """Descriptive statistics of a numerical (int or float) column."""

    nan_count: int
    nan_proportion: float
    min: Optional[Union[int, float]]  # might be None in very rare cases when the whole column is only None values
    max: Optional[Union[int, float]]
    mean: Optional[float]
    median: Optional[float]
    std: Optional[float]
    histogram: Optional[Histogram]


class DatetimeStatisticsItem(TypedDict):
    """Descriptive statistics of a datetime column; datetime values are rendered as strings."""

    nan_count: int
    nan_proportion: float
    min: Optional[str]  # might be None in very rare cases when the whole column is only None values
    max: Optional[str]
    mean: Optional[str]
    median: Optional[str]
    std: Optional[str]  # string representation of timedelta
    histogram: Optional[DatetimeHistogram]


class CategoricalStatisticsItem(TypedDict):
nan_count: int
nan_proportion: float
Expand All @@ -83,7 +101,9 @@ class BoolStatisticsItem(TypedDict):
frequencies: dict[str, int]


# All statistics payload shapes that a column may produce.
# (Diff artifact removed: the old one-line Union had been left above the new one.)
SupportedStatistics = Union[
    NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem, DatetimeStatisticsItem
]


class StatisticsPerColumnItem(TypedDict):
Expand Down Expand Up @@ -699,3 +719,97 @@ def get_shape(example: Optional[Union[bytes, dict[str, Any]]]) -> Union[tuple[No
@classmethod
def transform(cls, example: Optional[Union[bytes, dict[str, Any]]]) -> Optional[int]:
return cls.get_width(example)


class DatetimeColumn(Column):
    """Statistics computation for datetime columns.

    Each value is mapped to "seconds elapsed since the column minimum", the
    integer-column machinery computes the distribution over those seconds, and
    the results are shifted back and rendered as date strings.
    """

    # Statistics over the seconds-offset representation are delegated to IntColumn.
    transform_column = IntColumn

    @classmethod
    def compute_transformed_data(
        cls,
        data: pl.DataFrame,
        column_name: str,
        transformed_column_name: str,
        min_date: datetime.datetime,
    ) -> pl.DataFrame:
        # Represent every datetime as (possibly fractional) seconds since `min_date`.
        elapsed_seconds = (pl.col(column_name) - min_date).dt.total_seconds()
        return data.select(elapsed_seconds.alias(transformed_column_name))

    @staticmethod
    def shift_and_convert_to_string(base_date: datetime.datetime, seconds: Union[int, float]) -> str:
        # Inverse of the transform above: seconds offset -> formatted date string.
        return datetime_to_string(base_date + datetime.timedelta(seconds=seconds))

    @classmethod
    def _compute_statistics(
        cls,
        data: pl.DataFrame,
        column_name: str,
        n_samples: int,
    ) -> DatetimeStatisticsItem:
        nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
        if nan_count == n_samples:  # all values are None
            return DatetimeStatisticsItem(
                nan_count=n_samples,
                nan_proportion=1.0,
                min=None,
                max=None,
                mean=None,
                median=None,
                std=None,
                histogram=None,
            )

        min_date: datetime.datetime = data[column_name].min()  # type: ignore # mypy infers type of datetime column .min() incorrectly
        seconds_column_name = f"{column_name}_timedelta"
        # compute distribution of time passed from min date in **seconds**
        seconds_df = cls.compute_transformed_data(data, column_name, seconds_column_name, min_date)
        seconds_stats: NumericalStatisticsItem = cls.transform_column.compute_statistics(
            seconds_df,
            column_name=seconds_column_name,
            n_samples=n_samples,
        )
        # to assure mypy that these values are not None to pass to conversion functions:
        assert seconds_stats["histogram"] is not None  # nosec
        assert seconds_stats["max"] is not None  # nosec
        assert seconds_stats["mean"] is not None  # nosec
        assert seconds_stats["median"] is not None  # nosec
        assert seconds_stats["std"] is not None  # nosec

        to_string = cls.shift_and_convert_to_string
        return DatetimeStatisticsItem(
            nan_count=nan_count,
            nan_proportion=nan_proportion,
            min=datetime_to_string(min_date),
            max=to_string(min_date, seconds_stats["max"]),
            mean=to_string(min_date, seconds_stats["mean"]),
            median=to_string(min_date, seconds_stats["median"]),
            std=str(datetime.timedelta(seconds=seconds_stats["std"])),
            histogram=DatetimeHistogram(
                hist=seconds_stats["histogram"]["hist"],
                bin_edges=[to_string(min_date, edge) for edge in seconds_stats["histogram"]["bin_edges"]],
            ),
        )

    def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
        # Wrap the computed statistics into the per-column response shape.
        return StatisticsPerColumnItem(
            column_name=self.name,
            column_type=ColumnType.DATETIME,
            column_statistics=self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples),
        )


def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
    """
    Render a datetime.datetime object as text.

    Args:
        dt (datetime): The datetime object to render.
        format (str, optional): strftime-style format of the output string.
            Defaults to "%Y-%m-%d %H:%M:%S%z" (the "%z" offset is empty for naive datetimes).

    Returns:
        str: The formatted datetime.
    """
    formatted: str = dt.strftime(format)
    return formatted
2 changes: 2 additions & 0 deletions services/worker/tests/fixtures/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from .statistics_dataset import (
audio_dataset,
datetime_dataset,
image_dataset,
null_column,
statistics_dataset,
Expand Down Expand Up @@ -238,4 +239,5 @@ def datasets() -> Mapping[str, Dataset]:
"descriptive_statistics_not_supported": statistics_not_supported_dataset,
"audio_statistics": audio_dataset,
"image_statistics": image_dataset,
"datetime_statistics": datetime_dataset,
}
20 changes: 20 additions & 0 deletions services/worker/tests/fixtures/hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,13 @@ def hub_public_image_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str
delete_hub_dataset_repo(repo_id=repo_id)


@pytest.fixture(scope="session")
def hub_public_datetime_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str]:
repo_id = create_hub_dataset_repo(prefix="datetime_statistics", dataset=datasets["datetime_statistics"])
yield repo_id
delete_hub_dataset_repo(repo_id=repo_id)


@pytest.fixture(scope="session")
def hub_public_n_configs_with_default(datasets: Mapping[str, Dataset]) -> Iterator[str]:
default_config_name, _ = get_default_config_split()
Expand Down Expand Up @@ -1207,6 +1214,19 @@ def hub_responses_image_statistics(
}


@pytest.fixture
def hub_responses_datetime_statistics(
    hub_public_datetime_statistics: str,
) -> HubDatasetTest:
    # Bundle the canned Hub responses for the datetime-statistics dataset repo.
    repo_id = hub_public_datetime_statistics
    responses: HubDatasetTest = {
        "name": repo_id,
        "config_names_response": create_config_names_response(repo_id),
        "splits_response": create_splits_response(repo_id),
        "first_rows_response": None,
        "parquet_and_info_response": None,
    }
    return responses


@pytest.fixture
def hub_responses_descriptive_statistics_parquet_builder(
hub_public_descriptive_statistics_parquet_builder: str,
Expand Down
55 changes: 55 additions & 0 deletions services/worker/tests/fixtures/statistics_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.

from datetime import datetime
from pathlib import Path
from typing import Optional

Expand Down Expand Up @@ -1698,3 +1699,57 @@ def null_column(n_samples: int) -> list[None]:
}
),
)


# Fixture dataset: one sample per day for 2024-01-01 .. 2024-01-11 (11 rows),
# covering naive, timezone-aware, partially-null, and all-null datetime columns.
datetime_dataset = Dataset.from_dict(
    {
        "datetime": [
            datetime.strptime(f"2024-01-{day:02d} 00:00:00", "%Y-%m-%d %H:%M:%S") for day in range(1, 12)
        ],
        # Same dates, timezone-aware at +02:00.
        "datetime_tz": [
            datetime.strptime(f"2024-01-{day:02d} 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z") for day in range(1, 12)
        ],
        # Odd days keep their value, even days are null.
        "datetime_null": [
            datetime.strptime(f"2024-01-{day:02d} 00:00:00", "%Y-%m-%d %H:%M:%S") if day % 2 else None
            for day in range(1, 12)
        ],
        "datetime_all_null": [None] * 11,
    },
    features=Features(
        {
            "datetime": Value("timestamp[s]"),
            "datetime_tz": Value("timestamp[s, tz=+02:00]"),
            "datetime_null": Value("timestamp[s]"),
            "datetime_all_null": Value("timestamp[s]"),
        }
    ),
)
Loading
Loading