Skip to content

Commit

Permalink
pytimetk 0.4.0
Browse files Browse the repository at this point in the history
  • Loading branch information
mdancho84 committed Mar 18, 2024
1 parent f1b64dc commit 7e2c885
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 123 deletions.
Binary file removed dist/pytimetk-0.3.0.9004.tar.gz
Binary file not shown.
Binary file not shown.
Binary file added dist/pytimetk-0.4.0.tar.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pytimetk"
version = "0.3.0.9004"
version = "0.4.0"
description = "The time series toolkit for Python."
authors = [
"Business Science <[email protected]>",
Expand Down
71 changes: 71 additions & 0 deletions src/pytimetk/crossvalidation/time_series_cv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import pandas as pd
import numpy as np


class TimeSeriesCV:
    """Generate (train_idx, test_idx) pairs for walk-forward time-series CV.

    Assumes ``X`` carries a MultiIndex containing a date level (named by
    ``date_idx``, e.g. 'date', alongside e.g. 'symbol'). Overlapping
    outcomes are purged via ``lookahead``: a gap of ``lookahead - 1``
    unique dates is left between the end of each train window and the
    start of its test window. ``shift_length`` additionally offsets each
    successive test window further back in time.

    Parameters
    ----------
    n_splits : int
        Maximum number of train/test splits to generate.
    train_period_length : int
        Number of unique dates in each training window.
    test_period_length : int
        Number of unique dates in each test window.
    lookahead : int or None
        Outcome horizon used for purging. ``None`` is treated as 1
        (adjacent windows, no purge gap); previously ``None`` raised a
        TypeError inside ``split``.
    shift_length : int
        Extra backward shift applied to each subsequent test window.
    date_idx : str
        Name of the date level in the MultiIndex.
    shuffle : bool
        If True, shuffle the train indices of each split.
    seed : int or None
        Seed for the shuffle RNG (reproducible when set).
    """

    def __init__(
        self,
        n_splits=3,
        train_period_length=126,
        test_period_length=21,
        lookahead=None,
        shift_length=0,  # Offset between consecutive test windows
        date_idx='date',
        shuffle=False,
        seed=None,
    ):
        self.n_splits = n_splits
        # lookahead=None used to crash split() with a TypeError; treat it
        # as 1 so train and test stay adjacent but non-overlapping.
        self.lookahead = 1 if lookahead is None else lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shift_length = shift_length
        self.shuffle = shuffle
        self.seed = seed
        self.date_idx = date_idx

    def split(self, X, y=None, groups=None):
        """Return a list of (train_idx, test_idx) positional index arrays.

        ``y`` and ``groups`` are accepted for scikit-learn API
        compatibility and ignored.
        """
        unique_dates = X.index.get_level_values(self.date_idx).unique()
        days = sorted(unique_dates, reverse=True)  # days[0] = most recent
        last = len(days) - 1

        # Loop-invariant: build the positional date frame once instead of
        # calling X.reset_index() on every iteration.
        dates = X.reset_index()[[self.date_idx]]

        # One RNG for the whole call. Seeding once (rather than re-seeding
        # np.random per split, as before) keeps results reproducible for a
        # fixed seed while giving each split an independent shuffle, and
        # avoids clobbering NumPy's global random state.
        rng = np.random.RandomState(self.seed) if self.shuffle else None

        splits = []
        for i in range(self.n_splits):
            # Each later split's test window moves back by one test length
            # plus the configured shift.
            test_end_idx = i * (self.test_length + self.shift_length)
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1

            if train_start_idx >= len(days):
                break  # Not enough history remains for another full split

            date_col = dates[self.date_idx]
            train_mask = ((date_col > days[min(train_start_idx, last)])
                          & (date_col <= days[min(train_end_idx, last)]))
            test_mask = ((date_col > days[min(test_start_idx, last)])
                         & (date_col <= days[min(test_end_idx, last)]))

            train_idx = dates.index[train_mask].to_numpy()
            test_idx = dates.index[test_mask].to_numpy()

            if rng is not None:
                rng.shuffle(train_idx)

            splits.append((train_idx, test_idx))

        return splits

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of splits.

        Note: ``split`` may yield fewer splits than this when the data is
        too short; this method does not account for that.
        """
        return self.n_splits


246 changes: 124 additions & 122 deletions src/pytimetk/feature_engineering/timeseries_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,27 @@
from pytimetk.utils.checks import check_series_or_datetime, check_dataframe_or_groupby, check_date_column, check_value_column
from pytimetk.utils.memory_helpers import reduce_memory_usage

@pf.register_series_method
def get_timeseries_signature(
idx: Union[pd.Series, pd.DatetimeIndex],
@pf.register_dataframe_method
def augment_timeseries_signature(
data: Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy],
date_column: str,
reduce_memory: bool = False,
engine: str = 'pandas'
) -> pd.DataFrame:
engine: str = 'pandas',
) -> pd.DataFrame:
"""
Convert a timestamp to a set of 29 time series features.
The function `get_timeseries_signature` engineers **29 different date and
time based features** from a single datetime index `idx`:
The function `augment_timeseries_signature` takes a DataFrame and a date
column as input and returns the original DataFrame with the **29 different
date and time based features** added as new columns with the feature name
based on the date_column.
Parameters
----------
idx : pd.DataFrame
The `idx` parameter is a pandas Series of DatetimeIndex.
data : pd.DataFrame
The `data` parameter is a pandas DataFrame that contains the time series
data.
date_column : str
The `date_column` parameter is a string that represents the name of the
date column in the `data` DataFrame.
reduce_memory : bool, optional
The `reduce_memory` parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is False.
engine : str, optional
Expand Down Expand Up @@ -78,106 +83,78 @@ def get_timeseries_signature(
import pandas as pd
import pytimetk as tk
dates = pd.date_range(start = '2019-01', end = '2019-03', freq = 'D')
```
```{python}
# Makes 29 new time series features from the dates
tk.get_timeseries_signature(dates, engine='pandas').glimpse()
df = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])
```
```{python}
tk.get_timeseries_signature(dates, engine='polars').glimpse()
```
```{python}
pd.Series(dates, name = "date").get_timeseries_signature(engine='pandas').glimpse()
# Adds 29 new time series features as columns to the original DataFrame (pandas engine)
(
df
.augment_timeseries_signature(date_column='order_date', engine ='pandas')
.glimpse()
)
```
```{python}
pd.Series(dates, name = "date").get_timeseries_signature(engine='polars').glimpse()
# Adds 29 new time series features as columns to the original DataFrame (polars engine)
(
df
.augment_timeseries_signature(date_column='order_date', engine ='polars')
.glimpse()
)
```
"""
# common checks
check_series_or_datetime(idx)
# Run common checks
check_dataframe_or_groupby(data)
check_date_column(data, date_column)

# If idx is a DatetimeIndex, convert to Series
if isinstance(idx, pd.DatetimeIndex):
idx = pd.Series(idx, name="idx")
if isinstance(data, pd.core.groupby.generic.DataFrameGroupBy):
data = data.obj

# Check if idx is a Series
if not isinstance(idx, pd.Series):
raise TypeError('idx must be a pandas Series or DatetimeIndex object')
if reduce_memory:
data = reduce_memory_usage(data)

if engine == 'pandas':
ret = _get_timeseries_signature_pandas(idx)
ret = pd.concat(
[
data,
data[date_column].get_timeseries_signature(engine=engine).drop(date_column, axis=1)
],
axis=1)
elif engine == 'polars':
ret = _get_timeseries_signature_polars(idx)

df_pl = pl.DataFrame(data)

df_pl = _polars_timeseries_signature(df_pl, date_column = date_column)

ret = df_pl.to_pandas()
else:
raise ValueError("Invalid engine. Use 'pandas' or 'polars'.")

if reduce_memory:
ret = reduce_memory_usage(ret)

return ret

# Monkey patch the method to Pandas Series objects
pd.Series.get_timeseries_signature = get_timeseries_signature

def _get_timeseries_signature_pandas(idx: Union[pd.Series, pd.DatetimeIndex]) -> pd.DataFrame:

if isinstance(idx, pd.DatetimeIndex):
idx = pd.Series(idx, name='idx')

if idx.name is None:
idx.name = 'idx'

data = idx.to_frame()
name = idx.name

data = _pandas_timeseries_signature(data, date_column = name)

return data

def _get_timeseries_signature_polars(idx: Union[pd.Series, pd.DatetimeIndex]) -> pl.DataFrame:

if isinstance(idx, pd.DatetimeIndex):
idx = pd.Series(idx, name='idx')

if idx.name is None:
idx.name = 'idx'

data = idx.to_frame()
name = idx.name

# Convert to Polars DataFrame
df_pl = pl.DataFrame(data)

# Helper function that works with polars objects
df_pl = _polars_timeseries_signature(df_pl, date_column = name)

return df_pl.to_pandas()
# Monkey patch the method to pandas groupby objects
pd.core.groupby.generic.DataFrameGroupBy.augment_timeseries_signature = augment_timeseries_signature

@pf.register_dataframe_method
def augment_timeseries_signature(
data: Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy],
date_column: str,
@pf.register_series_method
def get_timeseries_signature(
idx: Union[pd.Series, pd.DatetimeIndex],
reduce_memory: bool = False,
engine: str = 'pandas',
) -> pd.DataFrame:
engine: str = 'pandas'
) -> pd.DataFrame:
"""
The function `augment_timeseries_signature` takes a DataFrame and a date
column as input and returns the original DataFrame with the **29 different
date and time based features** added as new columns with the feature name
based on the date_column.
Convert a timestamp to a set of 29 time series features.
The function `get_timeseries_signature` engineers **29 different date and
time based features** from a single datetime index `idx`:
Parameters
----------
data : pd.DataFrame
The `data` parameter is a pandas DataFrame that contains the time series
data.
date_column : str
The `date_column` parameter is a string that represents the name of the
date column in the `data` DataFrame.
idx : pd.DataFrame
The `idx` parameter is a pandas Series of DatetimeIndex.
reduce_memory : bool, optional
The `reduce_memory` parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is False.
engine : str, optional
Expand Down Expand Up @@ -231,61 +208,86 @@ def augment_timeseries_signature(
import pandas as pd
import pytimetk as tk
df = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])
dates = pd.date_range(start = '2019-01', end = '2019-03', freq = 'D')
```
```{python}
# Adds 29 new time series features as columns to the original DataFrame (pandas engine)
(
df
.augment_timeseries_signature(date_column='order_date', engine ='pandas')
.glimpse()
)
# Makes 29 new time series features from the dates
tk.get_timeseries_signature(dates, engine='pandas').glimpse()
```
```{python}
# Adds 29 new time series features as columns to the original DataFrame (polars engine)
(
df
.augment_timeseries_signature(date_column='order_date', engine ='polars')
.glimpse()
)
tk.get_timeseries_signature(dates, engine='polars').glimpse()
```
```{python}
pd.Series(dates, name = "date").get_timeseries_signature(engine='pandas').glimpse()
```
```{python}
pd.Series(dates, name = "date").get_timeseries_signature(engine='polars').glimpse()
```
"""
# Run common checks
check_dataframe_or_groupby(data)
check_date_column(data, date_column)
# common checks
check_series_or_datetime(idx)

if isinstance(data, pd.core.groupby.generic.DataFrameGroupBy):
data = data.obj
# If idx is a DatetimeIndex, convert to Series
if isinstance(idx, pd.DatetimeIndex):
idx = pd.Series(idx, name="idx")

if reduce_memory:
data = reduce_memory_usage(data)
# Check if idx is a Series
if not isinstance(idx, pd.Series):
raise TypeError('idx must be a pandas Series or DatetimeIndex object')

if engine == 'pandas':
ret = pd.concat(
[
data,
data[date_column].get_timeseries_signature(engine=engine).drop(date_column, axis=1)
],
axis=1)
ret = _get_timeseries_signature_pandas(idx)
elif engine == 'polars':

df_pl = pl.DataFrame(data)

df_pl = _polars_timeseries_signature(df_pl, date_column = date_column)

ret = df_pl.to_pandas()
ret = _get_timeseries_signature_polars(idx)
else:
raise ValueError("Invalid engine. Use 'pandas' or 'polars'.")

if reduce_memory:
ret = reduce_memory_usage(ret)

return ret

# Monkey patch the method to pandas groupby objects
pd.core.groupby.generic.DataFrameGroupBy.augment_timeseries_signature = augment_timeseries_signature
# Monkey patch the method to Pandas Series objects
pd.Series.get_timeseries_signature = get_timeseries_signature

def _get_timeseries_signature_pandas(idx: Union[pd.Series, pd.DatetimeIndex]) -> pd.DataFrame:

if isinstance(idx, pd.DatetimeIndex):
idx = pd.Series(idx, name='idx')

if idx.name is None:
idx.name = 'idx'

data = idx.to_frame()
name = idx.name

data = _pandas_timeseries_signature(data, date_column = name)

return data

def _get_timeseries_signature_polars(idx: Union[pd.Series, pd.DatetimeIndex]) -> pl.DataFrame:

if isinstance(idx, pd.DatetimeIndex):
idx = pd.Series(idx, name='idx')

if idx.name is None:
idx.name = 'idx'

data = idx.to_frame()
name = idx.name

# Convert to Polars DataFrame
df_pl = pl.DataFrame(data)

# Helper function that works with polars objects
df_pl = _polars_timeseries_signature(df_pl, date_column = name)

return df_pl.to_pandas()



# UTILITIES
# ---------
Expand Down

0 comments on commit 7e2c885

Please sign in to comment.