Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stats for datetimes #3007

Open
wants to merge 51 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
79790e0
compute stats for datetimes
polinaeterna Jul 31, 2024
12f78cc
Merge branch 'main' into datetime-stats
polinaeterna Jul 31, 2024
851ec1b
fix typing
polinaeterna Jul 31, 2024
3347c13
add testcase
polinaeterna Aug 1, 2024
0340b54
moar tests: column with nulls and all nulls column
polinaeterna Aug 5, 2024
4cd6e0d
Merge branch 'main' into datetime-stats
polinaeterna Aug 5, 2024
434b2d8
add datetime to worker
polinaeterna Aug 8, 2024
2604587
add test
polinaeterna Aug 8, 2024
913f812
include timezone aware
polinaeterna Aug 9, 2024
06c1ae5
Merge branch 'main' into datetime-stats
polinaeterna Aug 12, 2024
7f7ecab
Merge branch 'main' into datetime-stats
polinaeterna Oct 14, 2024
d517393
refactor
polinaeterna Oct 14, 2024
7046d8b
fix
polinaeterna Oct 14, 2024
945dff0
do not typecheck dateutil
polinaeterna Oct 14, 2024
d91d365
Merge branch 'main' into datetime-stats
polinaeterna Dec 20, 2024
bdec2e4
fix
polinaeterna Dec 23, 2024
f9ffe82
more tests
polinaeterna Dec 23, 2024
d2c37c6
fix string to datetime conversion: add format inferring
polinaeterna Dec 26, 2024
658719e
fix style
polinaeterna Dec 26, 2024
5c2d94a
fix check for datetime
polinaeterna Dec 27, 2024
359a30b
minor
polinaeterna Dec 27, 2024
0744e07
mypy
polinaeterna Dec 27, 2024
53e2100
add testcase
polinaeterna Jan 6, 2025
a61108f
Merge branch 'main' into datetime-stats
polinaeterna Jan 7, 2025
c63e70e
Merge branch 'main' into datetime-stats
polinaeterna Jan 8, 2025
70197aa
Merge branch 'datetime-stats' of github.com:huggingface/datasets-serv…
polinaeterna Jan 8, 2025
3df6264
fix?
polinaeterna Jan 8, 2025
812bf36
add example to docs
polinaeterna Jan 8, 2025
c68efb7
fix + add tz string (%Z) to formats
polinaeterna Jan 9, 2025
351ef5c
test for string timezone
polinaeterna Jan 9, 2025
787ad3b
try to debug
polinaeterna Jan 10, 2025
5163500
test identify_datetime_format
polinaeterna Jan 10, 2025
033e29e
test datetime.strptime
polinaeterna Jan 13, 2025
349b651
test
polinaeterna Jan 13, 2025
6c60c27
Update services/worker/src/worker/statistics_utils.py
polinaeterna Jan 15, 2025
db10500
keep original timezone for string dates
polinaeterna Jan 15, 2025
8794b7a
let polars identify datetime format by itself
polinaeterna Jan 15, 2025
e0e7c91
do not display +0000 in timestamps (if timezone is UTC)
polinaeterna Jan 15, 2025
8afade1
remove utils test
polinaeterna Jan 15, 2025
341676c
refactor: identify datetime format manually only when polars failed
polinaeterna Jan 15, 2025
3b5d950
style
polinaeterna Jan 16, 2025
21977db
log formats in error message
polinaeterna Jan 16, 2025
0ee76bf
update openapi specs
polinaeterna Jan 16, 2025
b7fee0b
fallback to string stats if datetime didn't work
polinaeterna Jan 16, 2025
6a76dd9
fix test
polinaeterna Jan 16, 2025
f3eefea
update docs
polinaeterna Jan 16, 2025
a79eb79
Merge branch 'main' into datetime-stats
polinaeterna Jan 16, 2025
1df95ff
fix openapi specs
polinaeterna Jan 16, 2025
2f27846
Merge branch 'main' into datetime-stats
polinaeterna Jan 17, 2025
f9d7a8a
fix polars timezone switching
polinaeterna Jan 17, 2025
720aab9
Merge branch 'main' into datetime-stats
polinaeterna Jan 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
AudioColumn,
BoolColumn,
ClassLabelColumn,
DatetimeColumn,
FloatColumn,
ImageColumn,
IntColumn,
Expand All @@ -57,7 +58,15 @@ class SplitDescriptiveStatisticsResponse(TypedDict):


# All column wrapper classes that split descriptive-statistics computation supports.
# (Diff artifact removed: the old single-line Union had been left above the new one.)
SupportedColumns = Union[
    ClassLabelColumn,
    IntColumn,
    FloatColumn,
    StringColumn,
    BoolColumn,
    ListColumn,
    AudioColumn,
    ImageColumn,
    DatetimeColumn,
]


Expand Down Expand Up @@ -215,29 +224,34 @@ def _column_from_feature(
return ListColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if isinstance(dataset_feature, dict):
if dataset_feature.get("_type") == "ClassLabel":
_type = dataset_feature.get("_type")
if _type == "ClassLabel":
return ClassLabelColumn(
feature_name=dataset_feature_name, n_samples=num_examples, feature_dict=dataset_feature
)

if dataset_feature.get("_type") == "Audio":
if _type == "Audio":
return AudioColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("_type") == "Image":
if _type == "Image":
return ImageColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("_type") == "Value":
if dataset_feature.get("dtype") in INTEGER_DTYPES:
if _type == "Value":
dtype = dataset_feature.get("dtype", "")
if dtype in INTEGER_DTYPES:
return IntColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") in FLOAT_DTYPES:
if dtype in FLOAT_DTYPES:
return FloatColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") in STRING_DTYPES:
if dtype in STRING_DTYPES:
return StringColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") == "bool":
if dtype == "bool":
return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dtype.startswith("timestamp"):
return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples)
return None

columns: list[SupportedColumns] = []
Expand All @@ -249,7 +263,7 @@ def _column_from_feature(
if not columns:
raise NoSupportedFeaturesError(
"No columns for statistics computation found. Currently supported feature types are: "
f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, list/Sequence and bool. "
f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, Image, Audio, list/Sequence, datetime and bool. "
)

column_names_str = ", ".join([column.name for column in columns])
Expand Down
120 changes: 117 additions & 3 deletions services/worker/src/worker/statistics_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.
import datetime
import enum
import io
import logging
Expand Down Expand Up @@ -50,24 +51,41 @@ class ColumnType(str, enum.Enum):
STRING_TEXT = "string_text"
AUDIO = "audio"
IMAGE = "image"
DATETIME = "datetime"


class Histogram(TypedDict):
    """Histogram of a numerical column."""

    hist: list[int]  # count of values falling into each bin
    bin_edges: list[Union[int, float]]  # bin boundaries; one more edge than bins


class DatetimeHistogram(TypedDict):
    """Histogram of a datetime column."""

    hist: list[int]  # count of values falling into each bin
    bin_edges: list[str]  # edges are string representations of dates


class NumericalStatisticsItem(TypedDict):
    """Descriptive statistics of a numerical (int or float) column."""

    nan_count: int
    nan_proportion: float
    min: Optional[Union[int, float]]  # might be None in very rare cases when the whole column is only None values
    max: Optional[Union[int, float]]
    mean: Optional[float]
    median: Optional[float]
    std: Optional[float]
    histogram: Optional[Histogram]


class DatetimeStatisticsItem(TypedDict):
    """Descriptive statistics of a datetime column; datetime values are rendered as strings."""

    nan_count: int
    nan_proportion: float
    min: Optional[str]  # might be None in very rare cases when the whole column is only None values
    max: Optional[str]
    mean: Optional[str]
    median: Optional[str]
    std: Optional[str]  # string representation of timedelta
    histogram: Optional[DatetimeHistogram]


class CategoricalStatisticsItem(TypedDict):
nan_count: int
nan_proportion: float
Expand All @@ -83,7 +101,9 @@ class BoolStatisticsItem(TypedDict):
frequencies: dict[str, int]


# All statistics payload shapes that a column may produce.
# (Diff artifact removed: the old one-line Union had been left above the new one.)
SupportedStatistics = Union[
    NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem, DatetimeStatisticsItem
]


class StatisticsPerColumnItem(TypedDict):
Expand Down Expand Up @@ -699,3 +719,97 @@ def get_shape(example: Optional[Union[bytes, dict[str, Any]]]) -> Union[tuple[No
@classmethod
def transform(cls, example: Optional[Union[bytes, dict[str, Any]]]) -> Optional[int]:
return cls.get_width(example)


class DatetimeColumn(Column):
    """Statistics computation for datetime columns.

    Each value is mapped to "seconds elapsed since the column minimum", the
    integer-column machinery computes the distribution over those seconds, and
    the results are shifted back and rendered as date strings.
    """

    # Statistics over the seconds-offset representation are delegated to IntColumn.
    transform_column = IntColumn

    @classmethod
    def compute_transformed_data(
        cls,
        data: pl.DataFrame,
        column_name: str,
        transformed_column_name: str,
        min_date: datetime.datetime,
    ) -> pl.DataFrame:
        # Represent every datetime as (possibly fractional) seconds since `min_date`.
        elapsed_seconds = (pl.col(column_name) - min_date).dt.total_seconds()
        return data.select(elapsed_seconds.alias(transformed_column_name))

    @staticmethod
    def shift_and_convert_to_string(base_date: datetime.datetime, seconds: Union[int, float]) -> str:
        # Inverse of the transform above: seconds offset -> formatted date string.
        return datetime_to_string(base_date + datetime.timedelta(seconds=seconds))

    @classmethod
    def _compute_statistics(
        cls,
        data: pl.DataFrame,
        column_name: str,
        n_samples: int,
    ) -> DatetimeStatisticsItem:
        nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
        if nan_count == n_samples:  # all values are None
            return DatetimeStatisticsItem(
                nan_count=n_samples,
                nan_proportion=1.0,
                min=None,
                max=None,
                mean=None,
                median=None,
                std=None,
                histogram=None,
            )

        min_date: datetime.datetime = data[column_name].min()  # type: ignore # mypy infers type of datetime column .min() incorrectly
        seconds_column_name = f"{column_name}_timedelta"
        # compute distribution of time passed from min date in **seconds**
        seconds_df = cls.compute_transformed_data(data, column_name, seconds_column_name, min_date)
        seconds_stats: NumericalStatisticsItem = cls.transform_column.compute_statistics(
            seconds_df,
            column_name=seconds_column_name,
            n_samples=n_samples,
        )
        # to assure mypy that these values are not None to pass to conversion functions:
        assert seconds_stats["histogram"] is not None  # nosec
        assert seconds_stats["max"] is not None  # nosec
        assert seconds_stats["mean"] is not None  # nosec
        assert seconds_stats["median"] is not None  # nosec
        assert seconds_stats["std"] is not None  # nosec

        to_string = cls.shift_and_convert_to_string
        return DatetimeStatisticsItem(
            nan_count=nan_count,
            nan_proportion=nan_proportion,
            min=datetime_to_string(min_date),
            max=to_string(min_date, seconds_stats["max"]),
            mean=to_string(min_date, seconds_stats["mean"]),
            median=to_string(min_date, seconds_stats["median"]),
            std=str(datetime.timedelta(seconds=seconds_stats["std"])),
            histogram=DatetimeHistogram(
                hist=seconds_stats["histogram"]["hist"],
                bin_edges=[to_string(min_date, edge) for edge in seconds_stats["histogram"]["bin_edges"]],
            ),
        )

    def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
        # Wrap the computed statistics into the per-column response shape.
        return StatisticsPerColumnItem(
            column_name=self.name,
            column_type=ColumnType.DATETIME,
            column_statistics=self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples),
        )


def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
    """
    Render a datetime.datetime object as text.

    Args:
        dt (datetime): The datetime object to render.
        format (str, optional): strftime-style format of the output string.
            Defaults to "%Y-%m-%d %H:%M:%S%z" (the "%z" offset is empty for naive datetimes).

    Returns:
        str: The formatted datetime.
    """
    formatted: str = dt.strftime(format)
    return formatted
2 changes: 2 additions & 0 deletions services/worker/tests/fixtures/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from .statistics_dataset import (
audio_dataset,
datetime_dataset,
image_dataset,
null_column,
statistics_dataset,
Expand Down Expand Up @@ -238,4 +239,5 @@ def datasets() -> Mapping[str, Dataset]:
"descriptive_statistics_not_supported": statistics_not_supported_dataset,
"audio_statistics": audio_dataset,
"image_statistics": image_dataset,
"datetime_statistics": datetime_dataset,
}
20 changes: 20 additions & 0 deletions services/worker/tests/fixtures/hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,13 @@ def hub_public_image_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str
delete_hub_dataset_repo(repo_id=repo_id)


@pytest.fixture(scope="session")
def hub_public_datetime_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str]:
repo_id = create_hub_dataset_repo(prefix="datetime_statistics", dataset=datasets["datetime_statistics"])
yield repo_id
delete_hub_dataset_repo(repo_id=repo_id)


@pytest.fixture(scope="session")
def hub_public_n_configs_with_default(datasets: Mapping[str, Dataset]) -> Iterator[str]:
default_config_name, _ = get_default_config_split()
Expand Down Expand Up @@ -1207,6 +1214,19 @@ def hub_responses_image_statistics(
}


@pytest.fixture
def hub_responses_datetime_statistics(
    hub_public_datetime_statistics: str,
) -> HubDatasetTest:
    # Bundle the canned Hub responses for the datetime-statistics dataset repo.
    repo_id = hub_public_datetime_statistics
    responses: HubDatasetTest = {
        "name": repo_id,
        "config_names_response": create_config_names_response(repo_id),
        "splits_response": create_splits_response(repo_id),
        "first_rows_response": None,
        "parquet_and_info_response": None,
    }
    return responses


@pytest.fixture
def hub_responses_descriptive_statistics_parquet_builder(
hub_public_descriptive_statistics_parquet_builder: str,
Expand Down
55 changes: 55 additions & 0 deletions services/worker/tests/fixtures/statistics_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.

from datetime import datetime
from pathlib import Path
from typing import Optional

Expand Down Expand Up @@ -1698,3 +1699,57 @@ def null_column(n_samples: int) -> list[None]:
}
),
)


# Fixture dataset: one sample per day for 2024-01-01 .. 2024-01-11 (11 rows),
# covering naive, timezone-aware, partially-null, and all-null datetime columns.
datetime_dataset = Dataset.from_dict(
    {
        "datetime": [
            datetime.strptime(f"2024-01-{day:02d} 00:00:00", "%Y-%m-%d %H:%M:%S") for day in range(1, 12)
        ],
        # Same dates, timezone-aware at +02:00.
        "datetime_tz": [
            datetime.strptime(f"2024-01-{day:02d} 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z") for day in range(1, 12)
        ],
        # Odd days keep their value, even days are null.
        "datetime_null": [
            datetime.strptime(f"2024-01-{day:02d} 00:00:00", "%Y-%m-%d %H:%M:%S") if day % 2 else None
            for day in range(1, 12)
        ],
        "datetime_all_null": [None] * 11,
    },
    features=Features(
        {
            "datetime": Value("timestamp[s]"),
            "datetime_tz": Value("timestamp[s, tz=+02:00]"),
            "datetime_null": Value("timestamp[s]"),
            "datetime_all_null": Value("timestamp[s]"),
        }
    ),
)
Loading
Loading