Skip to content

Commit

Permalink
Inconsistent sorting - make more obvious #286 #290
Browse files Browse the repository at this point in the history
  • Loading branch information
mdancho84 committed Jul 16, 2024
1 parent 57fa0f6 commit e6506f3
Show file tree
Hide file tree
Showing 15 changed files with 136 additions and 7 deletions.
4 changes: 3 additions & 1 deletion docs/changelog-news.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ number-sections: false

## Improvements:

- `.augment_lags()` and `.augment_leads()`: `value_column` previously accepted only numeric dtypes; it now accepts any dtype. #295

- Implement `sort_dataframe()`: This function is used internally to make sure Polars and Pandas engines perform grouped operations consistently and correctly. #286 #290
- `.augment_lags()` and `.augment_leads()`: value_column now accepts any dtype. #295

# pytimetk 0.4.0

Expand Down
2 changes: 1 addition & 1 deletion src/pytimetk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@
reduce_memory_usage
)
from .utils.pandas_helpers import (
flatten_multiindex_column_names, glimpse, drop_zero_variance, transform_columns
flatten_multiindex_column_names, glimpse, drop_zero_variance, transform_columns, sort_dataframe
)
from .utils.parallel_helpers import (
parallel_apply, progress_apply
Expand Down
7 changes: 7 additions & 0 deletions src/pytimetk/core/anomalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@

from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column
from pytimetk.core.frequency import get_frequency, get_seasonal_frequency, get_trend_frequency

from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe

from pytimetk.utils.parallel_helpers import parallel_apply, get_threads, progress_apply

Expand Down Expand Up @@ -282,6 +284,8 @@ def anomalize(

if reduce_memory:
data = reduce_memory_usage(data)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)

if isinstance(data, pd.DataFrame):
result = _anomalize(
Expand Down Expand Up @@ -354,6 +358,9 @@ def anomalize(
if reduce_memory:
result = reduce_memory_usage(result)

result.index = idx_unsorted
result = result.sort_index()

return result

# Monkey patch the method to pandas groupby objects
Expand Down
6 changes: 6 additions & 0 deletions src/pytimetk/feature_engineering/diffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column
from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe

@pf.register_dataframe_method
def augment_diffs(
Expand Down Expand Up @@ -130,6 +131,8 @@ def augment_diffs(
if reduce_memory:
data = reduce_memory_usage(data)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)

if engine == 'pandas':
ret = _augment_diffs_pandas(data, date_column, value_column, periods, normalize=normalize)
elif engine == 'polars':
Expand All @@ -140,6 +143,9 @@ def augment_diffs(
if reduce_memory:
ret = reduce_memory_usage(ret)

ret.index = idx_unsorted
ret = ret.sort_index()

return ret

# Monkey patch the method to pandas groupby objects
Expand Down
6 changes: 1 addition & 5 deletions src/pytimetk/feature_engineering/ewm.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,13 +106,9 @@ def augment_ewm(
display(ewm_df)
```
"""
# Ensure data is a DataFrame or a GroupBy object
# Checks
check_dataframe_or_groupby(data)

# Ensure date column exists and is properly formatted
check_date_column(data, date_column)

# Ensure value column(s) exist
check_value_column(data, value_column)

# Convert string value column to list for consistency
Expand Down
6 changes: 6 additions & 0 deletions src/pytimetk/feature_engineering/expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pytimetk.utils.parallel_helpers import conditional_tqdm, get_threads
from pytimetk.utils.polars_helpers import update_dict
from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe

@pf.register_dataframe_method
def augment_expanding(
Expand Down Expand Up @@ -214,6 +215,8 @@ def augment_expanding(
if reduce_memory:
data = reduce_memory_usage(data)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)

# Convert string value column to list for consistency
if isinstance(value_column, str):
value_column = [value_column]
Expand Down Expand Up @@ -257,6 +260,9 @@ def augment_expanding(
if reduce_memory:
ret = reduce_memory_usage(ret)

ret.index = idx_unsorted
ret = ret.sort_index()

return ret


Expand Down
7 changes: 7 additions & 0 deletions src/pytimetk/feature_engineering/fourier.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from pytimetk.core.ts_summary import ts_summary
from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe


@pf.register_dataframe_method
Expand Down Expand Up @@ -112,6 +113,9 @@ def augment_fourier(
check_dataframe_or_groupby(data)
check_date_column(data, date_column)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)


if isinstance(periods, int):
periods = [periods]
elif isinstance(periods, tuple):
Expand All @@ -133,6 +137,9 @@ def augment_fourier(

if reduce_memory:
ret = reduce_memory_usage(ret)

ret.index = idx_unsorted
ret = ret.sort_index()

return ret

Expand Down
7 changes: 7 additions & 0 deletions src/pytimetk/feature_engineering/hilbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column
from pytimetk.utils.polars_helpers import pandas_to_polars_frequency, pandas_to_polars_aggregation_mapping
from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe

@pf.register_dataframe_method
def augment_hilbert(
Expand Down Expand Up @@ -157,6 +158,9 @@ def augment_hilbert(
if reduce_memory:
data = reduce_memory_usage(data)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)


if engine == 'pandas':
ret = _augment_hilbert_pandas(data, date_column, value_column)
elif engine == 'polars':
Expand All @@ -167,6 +171,9 @@ def augment_hilbert(
if reduce_memory:
ret = reduce_memory_usage(ret)

ret.index = idx_unsorted
ret = ret.sort_index()

return ret


Expand Down
6 changes: 6 additions & 0 deletions src/pytimetk/feature_engineering/holiday_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_series_or_datetime, check_installed
from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe


@pf.register_dataframe_method
Expand Down Expand Up @@ -204,6 +205,8 @@ def augment_holiday_signature(

if reduce_memory:
data = reduce_memory_usage(data)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)

if engine == 'pandas':
ret = _augment_holiday_signature_pandas(data, date_column, country_name)
Expand All @@ -215,6 +218,9 @@ def augment_holiday_signature(
if reduce_memory:
ret = reduce_memory_usage(ret)

ret.index = idx_unsorted
ret = ret.sort_index()

return ret

# Monkey patch the method to pandas groupby objects
Expand Down
6 changes: 6 additions & 0 deletions src/pytimetk/feature_engineering/lags.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column
from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe

@pf.register_dataframe_method
def augment_lags(
Expand Down Expand Up @@ -123,6 +124,8 @@ def augment_lags(

if reduce_memory:
data = reduce_memory_usage(data)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)

if engine == 'pandas':
ret = _augment_lags_pandas(data, date_column, value_column, lags)
Expand All @@ -133,6 +136,9 @@ def augment_lags(

if reduce_memory:
ret = reduce_memory_usage(ret)

ret.index = idx_unsorted
ret = ret.sort_index()

return ret

Expand Down
6 changes: 6 additions & 0 deletions src/pytimetk/feature_engineering/leads.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column
from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe

@pf.register_dataframe_method
def augment_leads(
Expand Down Expand Up @@ -124,6 +125,8 @@ def augment_leads(

if reduce_memory:
data = reduce_memory_usage(data)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)

if engine == 'pandas':
ret = _augment_leads_pandas(data, date_column, value_column, leads)
Expand All @@ -134,6 +137,9 @@ def augment_leads(

if reduce_memory:
ret = reduce_memory_usage(ret)

ret.index = idx_unsorted
ret = ret.sort_index()

return ret

Expand Down
6 changes: 6 additions & 0 deletions src/pytimetk/feature_engineering/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pytimetk.utils.parallel_helpers import conditional_tqdm, get_threads
from pytimetk.utils.polars_helpers import update_dict
from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe

@pf.register_dataframe_method
def augment_rolling(
Expand Down Expand Up @@ -201,6 +202,8 @@ def augment_rolling(
if reduce_memory:
data = reduce_memory_usage(data)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)

# Convert string value column to list for consistency
if isinstance(value_column, str):
value_column = [value_column]
Expand Down Expand Up @@ -252,6 +255,9 @@ def augment_rolling(

if reduce_memory:
ret = reduce_memory_usage(ret)

ret.index = idx_unsorted
ret = ret.sort_index()

return ret

Expand Down
7 changes: 7 additions & 0 deletions src/pytimetk/feature_engineering/wavelet.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from pytimetk.utils.checks import check_dataframe_or_groupby, check_date_column, check_value_column
from pytimetk.utils.polars_helpers import pandas_to_polars_frequency, pandas_to_polars_aggregation_mapping
from pytimetk.utils.memory_helpers import reduce_memory_usage
from pytimetk.utils.pandas_helpers import sort_dataframe



#@pf.register_dataframe_method
Expand Down Expand Up @@ -197,6 +199,8 @@ def augment_wavelet(

if reduce_memory:
data = reduce_memory_usage(data)

data, idx_unsorted = sort_dataframe(data, date_column, keep_grouped_df = True)

wavelet_functions = {
'morlet': morlet_wavelet,
Expand Down Expand Up @@ -242,6 +246,9 @@ def _apply_cwt(df):

if reduce_memory:
ret = reduce_memory_usage(ret)

ret.index = idx_unsorted
ret = ret.sort_index()

return ret

Expand Down
3 changes: 3 additions & 0 deletions src/pytimetk/utils/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,4 +125,7 @@ def check_installed(package_name: str):
# data = data.groupby(group_names)

# return data




64 changes: 64 additions & 0 deletions src/pytimetk/utils/pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,70 @@ def make_lalign_formatter(df, cols=None):

return None


@pf.register_dataframe_method
def sort_dataframe(
    data: Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy],
    date_column: str,
    keep_grouped_df: bool = True,
):
    '''Sort a DataFrame (or grouped DataFrame) by a date column and return both
    the sorted data and the post-sort index.

    This function is used internally to make sure the Polars and Pandas engines
    perform grouped operations consistently and correctly: callers sort before
    computing, then use the returned index to restore the caller's original row
    order afterwards.

    Parameters
    ----------
    data : Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]
        A pandas DataFrame or a grouped DataFrame (DataFrameGroupBy object).
    date_column : str
        Name of the column containing dates to sort by. For grouped data, rows
        are sorted by the group key columns first, then by this column, so each
        group's rows are contiguous and date-ordered.
    keep_grouped_df : bool
        If `True` and `data` is a grouped data frame, a grouped data frame is
        returned. If `False`, an ungrouped (sorted) DataFrame is returned.

    Returns
    -------
    tuple
        A 2-tuple `(df, index_after_sort)` where `df` is the sorted DataFrame
        (or DataFrameGroupBy when `keep_grouped_df=True` and the input was
        grouped) and `index_after_sort` is the `pd.Index` of the sorted rows.
        Assigning `index_after_sort` to a result computed on the sorted data
        and calling `.sort_index()` restores the original row order.

    Raises
    ------
    TypeError
        If `data` is neither a pandas DataFrame nor a DataFrameGroupBy.

    Examples
    --------
    ```{python}
    import pytimetk as tk
    import pandas as pd

    df = tk.load_dataset('walmart_sales_weekly', parse_dates=['Date'])

    df.sort_dataframe('Date')

    df.groupby('id').sort_dataframe('Date').obj

    df.groupby(['id', 'Store', 'Dept']).sort_dataframe('Date').obj
    ```
    '''

    if isinstance(data, pd.core.groupby.generic.DataFrameGroupBy):
        # Sort by the group key columns first, then by date within each group,
        # so grouped operations see rows in a deterministic order.
        # NOTE(review): `.grouper` is deprecated in recent pandas releases —
        # confirm the project's minimum pandas version before changing.
        group_names = data.grouper.names
        df = data.obj.copy()
        df.sort_values(by=[*group_names, date_column], inplace=True)
        index_after_sort = df.index
        if keep_grouped_df:
            df = df.groupby(group_names)
    elif isinstance(data, pd.DataFrame):
        df = data.copy()
        df.sort_values(by=[date_column], inplace=True)
        index_after_sort = df.index
    else:
        # Previously an unsupported type fell through to an unbound-variable
        # NameError; raise a clear TypeError instead.
        raise TypeError(
            f"`data` must be a pandas DataFrame or DataFrameGroupBy, got {type(data)}."
        )

    return df, index_after_sort

pd.core.groupby.generic.DataFrameGroupBy.sort_dataframe = sort_dataframe

@pf.register_dataframe_method
def drop_zero_variance(data: pd.DataFrame, ):
'''The function `drop_zero_variance` takes a pandas DataFrame as input and returns a new DataFrame with
Expand Down

0 comments on commit e6506f3

Please sign in to comment.