diff --git a/dist/pytimetk-0.3.0.9004.tar.gz b/dist/pytimetk-0.3.0.9004.tar.gz
deleted file mode 100644
index 522dd8be..00000000
Binary files a/dist/pytimetk-0.3.0.9004.tar.gz and /dev/null differ
diff --git a/dist/pytimetk-0.3.0.9004-py3-none-any.whl b/dist/pytimetk-0.4.0-py3-none-any.whl
similarity index 99%
rename from dist/pytimetk-0.3.0.9004-py3-none-any.whl
rename to dist/pytimetk-0.4.0-py3-none-any.whl
index 5767c1b4..41a0f9c4 100644
Binary files a/dist/pytimetk-0.3.0.9004-py3-none-any.whl and b/dist/pytimetk-0.4.0-py3-none-any.whl differ
diff --git a/dist/pytimetk-0.4.0.tar.gz b/dist/pytimetk-0.4.0.tar.gz
new file mode 100644
index 00000000..08136da8
Binary files /dev/null and b/dist/pytimetk-0.4.0.tar.gz differ
diff --git a/pyproject.toml b/pyproject.toml
index 5b021535..80785aff 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pytimetk"
-version = "0.3.0.9004"
+version = "0.4.0"
 description = "The time series toolkit for Python."
 authors = [
     "Business Science ",
diff --git a/src/pytimetk/crossvalidation/time_series_cv.py b/src/pytimetk/crossvalidation/time_series_cv.py
new file mode 100644
index 00000000..a3a93413
--- /dev/null
+++ b/src/pytimetk/crossvalidation/time_series_cv.py
@@ -0,0 +1,71 @@
+import pandas as pd
+import numpy as np
+
+
+class TimeSeriesCV:
+    """Generates tuples of (train_idx, test_idx) pairs.
+    Assumes the MultiIndex contains levels 'symbol' and 'date',
+    purges overlapping outcomes, and includes a shift for each test set."""
+
+    def __init__(
+        self,
+        n_splits=3,
+        train_period_length=126,
+        test_period_length=21,
+        lookahead=None,
+        shift_length=0,  # New parameter to specify the shift length
+        date_idx='date',
+        shuffle=False,
+        seed=None,
+    ):
+        self.n_splits = n_splits
+        self.lookahead = lookahead if lookahead is not None else 1  # default purge gap; None would break the index arithmetic in split()
+        self.test_length = test_period_length
+        self.train_length = train_period_length
+        self.shift_length = shift_length  # Store the shift length
+        self.shuffle = shuffle
+        self.seed = seed
+        self.date_idx = date_idx
+
+    def split(self, X, y=None, groups=None):
+        unique_dates = X.index.get_level_values(self.date_idx).unique()
+        days = sorted(unique_dates, reverse=True)
+
+        splits = []
+        for i in range(self.n_splits):
+            # Adjust the end index for the test set to include the shift for subsequent splits
+            test_end_idx = i * self.test_length + i * self.shift_length
+            test_start_idx = test_end_idx + self.test_length
+            train_end_idx = test_start_idx + self.lookahead - 1
+            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
+
+            if train_start_idx >= len(days):
+                break  # Break if the start index goes beyond the available data
+
+            dates = X.reset_index()[[self.date_idx]]
+            train_idx = dates[(dates[self.date_idx] > days[min(train_start_idx, len(days)-1)])
+                              & (dates[self.date_idx] <= days[min(train_end_idx, len(days)-1)])].index
+            test_idx = dates[(dates[self.date_idx] > days[min(test_start_idx, len(days)-1)])
+                             & (dates[self.date_idx] <= days[min(test_end_idx, len(days)-1)])].index
+
+            if self.shuffle:
+                if self.seed is not None:
+                    np.random.seed(self.seed)
+
+                train_idx_list = list(train_idx)
+                np.random.shuffle(train_idx_list)
+                train_idx = np.array(train_idx_list)
+            else:
+                train_idx = train_idx.to_numpy()
+
+            test_idx = test_idx.to_numpy()
+
+            splits.append((train_idx, test_idx))
+
+        return splits
+
+    def get_n_splits(self, X=None, y=None, groups=None):
+        """Returns the configured number of splits."""
+        return self.n_splits
+
\ No newline at end of file
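For context, the new `TimeSeriesCV` walks backward from the most recent date: split `i` reserves the `i`-th most recent `test_period_length` window (offset by any `shift_length`) for testing, with the preceding `train_period_length` window for training, separated by a `lookahead` purge gap. Below is a minimal usage sketch under stated assumptions: the symbols, the `close` column, and the date range are hypothetical, and the class is imported by its module path from this diff (it may or may not be re-exported at package level).

```{python}
import numpy as np
import pandas as pd
from pytimetk.crossvalidation.time_series_cv import TimeSeriesCV

# Hypothetical panel with the ('symbol', 'date') MultiIndex the class assumes
dates = pd.date_range('2022-01-03', periods=300, freq='B')
index = pd.MultiIndex.from_product([['AAPL', 'MSFT'], dates], names=['symbol', 'date'])
df = pd.DataFrame({'close': np.random.randn(len(index)).cumsum()}, index=index)

# 126-day train window, 21-day test window, 1-day purge gap, newest split first
cv = TimeSeriesCV(n_splits=3, train_period_length=126, test_period_length=21, lookahead=1)
for train_idx, test_idx in cv.split(df):
    rows = df.reset_index()
    print(f"train {rows.loc[train_idx, 'date'].min():%Y-%m-%d}..{rows.loc[train_idx, 'date'].max():%Y-%m-%d}"
          f" | test {rows.loc[test_idx, 'date'].min():%Y-%m-%d}..{rows.loc[test_idx, 'date'].max():%Y-%m-%d}")
```

Note that `split` returns positional row numbers into `X.reset_index()` rather than labels on the original MultiIndex, so downstream code should subset with `df.reset_index().loc[...]` or `df.iloc[...]`.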
diff --git a/src/pytimetk/feature_engineering/timeseries_signature.py b/src/pytimetk/feature_engineering/timeseries_signature.py
index 7625d560..58c73b87 100644
--- a/src/pytimetk/feature_engineering/timeseries_signature.py
+++ b/src/pytimetk/feature_engineering/timeseries_signature.py
@@ -9,22 +9,27 @@ from pytimetk.utils.checks import check_series_or_datetime, check_dataframe_or_groupby, check_date_column, check_value_column
 from pytimetk.utils.memory_helpers import reduce_memory_usage
 
-@pf.register_series_method
-def get_timeseries_signature(
-    idx: Union[pd.Series, pd.DatetimeIndex],
+@pf.register_dataframe_method
+def augment_timeseries_signature(
+    data: Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy],
+    date_column: str,
     reduce_memory: bool = False,
-    engine: str = 'pandas'
-    ) -> pd.DataFrame:
+    engine: str = 'pandas',
+) -> pd.DataFrame:
     """
-    Convert a timestamp to a set of 29 time series features.
-
-    The function `get_timeseries_signature` engineers **29 different date and
-    time based features** from a single datetime index `idx`:
-
+    The function `augment_timeseries_signature` takes a DataFrame and a date
+    column as input and returns the original DataFrame with the **29 different
+    date and time based features** added as new columns with the feature name
+    based on the date_column.
+
     Parameters
     ----------
-    idx : pd.DataFrame
-        The `idx` parameter is a pandas Series of DatetimeIndex.
+    data : pd.DataFrame
+        The `data` parameter is a pandas DataFrame that contains the time series
+        data.
+    date_column : str
+        The `date_column` parameter is a string that represents the name of the
+        date column in the `data` DataFrame.
     reduce_memory : bool, optional
         The `reduce_memory` parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data.
         This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is False.
     engine : str, optional
@@ -78,106 +83,78 @@ def get_timeseries_signature(
     import pandas as pd
     import pytimetk as tk
 
-    dates = pd.date_range(start = '2019-01', end = '2019-03', freq = 'D')
-    ```
-
-    ```{python}
-    # Makes 29 new time series features from the dates
-    tk.get_timeseries_signature(dates, engine='pandas').glimpse()
+    df = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])
     ```
 
     ```{python}
-    tk.get_timeseries_signature(dates, engine='polars').glimpse()
-    ```
-    ```{python}
-    pd.Series(dates, name = "date").get_timeseries_signature(engine='pandas').glimpse()
+    # Adds 29 new time series features as columns to the original DataFrame (pandas engine)
+    (
+        df
+            .augment_timeseries_signature(date_column='order_date', engine='pandas')
+            .glimpse()
+    )
     ```
 
     ```{python}
-    pd.Series(dates, name = "date").get_timeseries_signature(engine='polars').glimpse()
+    # Adds 29 new time series features as columns to the original DataFrame (polars engine)
+    (
+        df
+            .augment_timeseries_signature(date_column='order_date', engine='polars')
+            .glimpse()
+    )
     ```
     """
 
-    # common checks
-    check_series_or_datetime(idx)
+    # Run common checks
+    check_dataframe_or_groupby(data)
+    check_date_column(data, date_column)
 
-    # If idx is a DatetimeIndex, convert to Series
-    if isinstance(idx, pd.DatetimeIndex):
-        idx = pd.Series(idx, name="idx")
+    if isinstance(data, pd.core.groupby.generic.DataFrameGroupBy):
+        data = data.obj
 
-    # Check if idx is a Series
-    if not isinstance(idx, pd.Series):
-        raise TypeError('idx must be a pandas Series or DatetimeIndex object')
+    if reduce_memory:
+        data = reduce_memory_usage(data)
 
     if engine == 'pandas':
-        ret = _get_timeseries_signature_pandas(idx)
+        ret = pd.concat(
+            [
+                data,
+                data[date_column].get_timeseries_signature(engine=engine).drop(date_column, axis=1)
+            ],
+            axis=1)
     elif engine == 'polars':
-        ret = _get_timeseries_signature_polars(idx)
+
+        df_pl = pl.DataFrame(data)
+
+        df_pl = _polars_timeseries_signature(df_pl, date_column = date_column)
+
+        ret = df_pl.to_pandas()
     else:
         raise ValueError("Invalid engine. Use 'pandas' or 'polars'.")
Use 'pandas' or 'polars'.") if reduce_memory: ret = reduce_memory_usage(ret) - + return ret -# Monkey patch the method to Pandas Series objects -pd.Series.get_timeseries_signature = get_timeseries_signature - -def _get_timeseries_signature_pandas(idx: Union[pd.Series, pd.DatetimeIndex]) -> pd.DataFrame: - - if isinstance(idx, pd.DatetimeIndex): - idx = pd.Series(idx, name='idx') - - if idx.name is None: - idx.name = 'idx' - - data = idx.to_frame() - name = idx.name - - data = _pandas_timeseries_signature(data, date_column = name) - - return data - -def _get_timeseries_signature_polars(idx: Union[pd.Series, pd.DatetimeIndex]) -> pl.DataFrame: - - if isinstance(idx, pd.DatetimeIndex): - idx = pd.Series(idx, name='idx') - - if idx.name is None: - idx.name = 'idx' - - data = idx.to_frame() - name = idx.name - - # Convert to Polars DataFrame - df_pl = pl.DataFrame(data) - - # Helper function that works with polars objects - df_pl = _polars_timeseries_signature(df_pl, date_column = name) - - return df_pl.to_pandas() +# Monkey patch the method to pandas groupby objects +pd.core.groupby.generic.DataFrameGroupBy.augment_timeseries_signature = augment_timeseries_signature -@pf.register_dataframe_method -def augment_timeseries_signature( - data: Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy], - date_column: str, +@pf.register_series_method +def get_timeseries_signature( + idx: Union[pd.Series, pd.DatetimeIndex], reduce_memory: bool = False, - engine: str = 'pandas', -) -> pd.DataFrame: + engine: str = 'pandas' + ) -> pd.DataFrame: """ - The function `augment_timeseries_signature` takes a DataFrame and a date - column as input and returns the original DataFrame with the **29 different - date and time based features** added as new columns with the feature name - based on the date_column. - + Convert a timestamp to a set of 29 time series features. + + The function `get_timeseries_signature` engineers **29 different date and + time based features** from a single datetime index `idx`: + Parameters ---------- - data : pd.DataFrame - The `data` parameter is a pandas DataFrame that contains the time series - data. - date_column : str - The `date_column` parameter is a string that represents the name of the - date column in the `data` DataFrame. + idx : pd.DataFrame + The `idx` parameter is a pandas Series of DatetimeIndex. reduce_memory : bool, optional The `reduce_memory` parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is False. 
     engine : str, optional
@@ -231,61 +208,86 @@ def augment_timeseries_signature(
     import pandas as pd
     import pytimetk as tk
 
-    df = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])
+    dates = pd.date_range(start = '2019-01', end = '2019-03', freq = 'D')
     ```
 
     ```{python}
-    # Adds 29 new time series features as columns to the original DataFrame (pandas engine)
-    (
-        df
-            .augment_timeseries_signature(date_column='order_date', engine ='pandas')
-            .glimpse()
-    )
+    # Makes 29 new time series features from the dates
+    tk.get_timeseries_signature(dates, engine='pandas').glimpse()
     ```
 
     ```{python}
-    # Adds 29 new time series features as columns to the original DataFrame (polars engine)
-    (
-        df
-            .augment_timeseries_signature(date_column='order_date', engine ='polars')
-            .glimpse()
-    )
+    tk.get_timeseries_signature(dates, engine='polars').glimpse()
+    ```
+    ```{python}
+    pd.Series(dates, name = "date").get_timeseries_signature(engine='pandas').glimpse()
+    ```
+
+    ```{python}
+    pd.Series(dates, name = "date").get_timeseries_signature(engine='polars').glimpse()
     ```
     """
 
-    # Run common checks
-    check_dataframe_or_groupby(data)
-    check_date_column(data, date_column)
+    # common checks
+    check_series_or_datetime(idx)
 
-    if isinstance(data, pd.core.groupby.generic.DataFrameGroupBy):
-        data = data.obj
+    # If idx is a DatetimeIndex, convert to Series
+    if isinstance(idx, pd.DatetimeIndex):
+        idx = pd.Series(idx, name="idx")
 
-    if reduce_memory:
-        data = reduce_memory_usage(data)
+    # Check if idx is a Series
+    if not isinstance(idx, pd.Series):
+        raise TypeError('idx must be a pandas Series or DatetimeIndex object')
 
     if engine == 'pandas':
-        ret = pd.concat(
-            [
-                data,
-                data[date_column].get_timeseries_signature(engine=engine).drop(date_column, axis=1)
-            ],
-            axis=1)
+        ret = _get_timeseries_signature_pandas(idx)
     elif engine == 'polars':
-
-        df_pl = pl.DataFrame(data)
-
-        df_pl = _polars_timeseries_signature(df_pl, date_column = date_column)
-
-        ret = df_pl.to_pandas()
+        ret = _get_timeseries_signature_polars(idx)
     else:
         raise ValueError("Invalid engine. Use 'pandas' or 'polars'.")
 
     if reduce_memory:
         ret = reduce_memory_usage(ret)
-    
+
     return ret
 
-# Monkey patch the method to pandas groupby objects
-pd.core.groupby.generic.DataFrameGroupBy.augment_timeseries_signature = augment_timeseries_signature
+# Monkey patch the method to Pandas Series objects
+pd.Series.get_timeseries_signature = get_timeseries_signature
+
+def _get_timeseries_signature_pandas(idx: Union[pd.Series, pd.DatetimeIndex]) -> pd.DataFrame:
+
+    if isinstance(idx, pd.DatetimeIndex):
+        idx = pd.Series(idx, name='idx')
+
+    if idx.name is None:
+        idx.name = 'idx'
+
+    data = idx.to_frame()
+    name = idx.name
+
+    data = _pandas_timeseries_signature(data, date_column = name)
+
+    return data
+
+def _get_timeseries_signature_polars(idx: Union[pd.Series, pd.DatetimeIndex]) -> pl.DataFrame:
+
+    if isinstance(idx, pd.DatetimeIndex):
+        idx = pd.Series(idx, name='idx')
+
+    if idx.name is None:
+        idx.name = 'idx'
+
+    data = idx.to_frame()
+    name = idx.name
+
+    # Convert to Polars DataFrame
+    df_pl = pl.DataFrame(data)
+
+    # Helper function that works with polars objects
+    df_pl = _polars_timeseries_signature(df_pl, date_column = name)
+
+    return df_pl.to_pandas()
+
+
 # UTILITIES
 # ---------
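The hunks above also monkey-patch `augment_timeseries_signature` onto pandas groupby objects, so the grouped form works as well. As the body shows, the grouped variant simply unwraps the underlying frame via `data.obj`, which is safe here because the 29 signature features depend only on the date column, never on the grouping. A short sketch, reusing the `bike_sales_sample` dataset from the docstring examples (the `category_1` grouping column is assumed to exist in that dataset):

```{python}
import pytimetk as tk

df = tk.load_dataset('bike_sales_sample', parse_dates=['order_date'])

# Grouped call: the patch unwraps data.obj, so the result is the full
# DataFrame with the 29 signature columns (named after order_date) appended
(
    df
        .groupby('category_1')
        .augment_timeseries_signature(date_column='order_date')
        .glimpse()
)
```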