From 722f6b54bf3eb6f0822f7b6535705024dc8df7ea Mon Sep 17 00:00:00 2001 From: Matt Dancho Date: Tue, 5 Nov 2024 23:12:06 -0500 Subject: [PATCH] update docs --- docs/_site/404.html | 6 + docs/_site/changelog-news.html | 6 + docs/_site/contributing.html | 6 + .../getting-started/01_installation.html | 6 + .../_site/getting-started/02_quick_start.html | 6 + docs/_site/guides/01_visualization.html | 6 + docs/_site/guides/02_timetk_concepts.html | 6 + docs/_site/guides/03_pandas_frequency.html | 6 + docs/_site/guides/04_wrangling.html | 6 + docs/_site/guides/05_augmenting.html | 6 + docs/_site/guides/06_anomalize.html | 12 +- .../guides/07_timeseries_crossvalidation.html | 1313 +++++++++++++++++ docs/_site/index.html | 6 + .../performance/01_speed_comparisons.html | 6 + docs/_site/reference/TimeSeriesCV.html | 6 + .../_site/reference/TimeSeriesCVSplitter.html | 6 + docs/_site/reference/anomalize.html | 6 + docs/_site/reference/apply_by_time.html | 6 + docs/_site/reference/augment_atr.html | 6 + docs/_site/reference/augment_bbands.html | 6 + docs/_site/reference/augment_cmo.html | 6 + docs/_site/reference/augment_diffs.html | 6 + docs/_site/reference/augment_ewm.html | 6 + docs/_site/reference/augment_expanding.html | 6 + .../reference/augment_expanding_apply.html | 6 + docs/_site/reference/augment_fourier.html | 6 + docs/_site/reference/augment_hilbert.html | 6 + .../reference/augment_holiday_signature.html | 6 + docs/_site/reference/augment_lags.html | 6 + docs/_site/reference/augment_leads.html | 6 + docs/_site/reference/augment_macd.html | 6 + docs/_site/reference/augment_pct_change.html | 6 + docs/_site/reference/augment_ppo.html | 6 + docs/_site/reference/augment_qsmomentum.html | 6 + docs/_site/reference/augment_roc.html | 6 + docs/_site/reference/augment_rolling.html | 6 + .../reference/augment_rolling_apply.html | 6 + docs/_site/reference/augment_rsi.html | 6 + .../augment_timeseries_signature.html | 6 + docs/_site/reference/augment_wavelet.html | 6 + docs/_site/reference/binarize.html | 6 + docs/_site/reference/ceil_date.html | 6 + docs/_site/reference/correlate.html | 6 + docs/_site/reference/drop_zero_variance.html | 6 + docs/_site/reference/filter_by_time.html | 6 + .../flatten_multiindex_column_names.html | 6 + docs/_site/reference/floor_date.html | 6 + docs/_site/reference/future_frame.html | 6 + .../reference/get_available_datasets.html | 6 + docs/_site/reference/get_date_summary.html | 6 + docs/_site/reference/get_diff_summary.html | 6 + docs/_site/reference/get_frequency.html | 6 + .../reference/get_frequency_summary.html | 6 + .../reference/get_holiday_signature.html | 6 + .../_site/reference/get_pandas_frequency.html | 6 + .../reference/get_seasonal_frequency.html | 6 + .../reference/get_timeseries_signature.html | 6 + docs/_site/reference/get_trend_frequency.html | 6 + docs/_site/reference/glimpse.html | 6 + docs/_site/reference/index.html | 6 + docs/_site/reference/is_holiday.html | 6 + docs/_site/reference/load_dataset.html | 6 + .../reference/make_future_timeseries.html | 6 + .../reference/make_weekday_sequence.html | 6 + .../reference/make_weekend_sequence.html | 6 + docs/_site/reference/pad_by_time.html | 6 + docs/_site/reference/palette_timetk.html | 6 + docs/_site/reference/parallel_apply.html | 6 + docs/_site/reference/plot_anomalies.html | 6 + .../reference/plot_anomalies_cleaned.html | 6 + .../reference/plot_anomalies_decomp.html | 6 + docs/_site/reference/plot_anomaly_decomp.html | 6 + .../reference/plot_correlation_funnel.html | 6 + 
docs/_site/reference/plot_timeseries.html | 6 + docs/_site/reference/progress_apply.html | 6 + docs/_site/reference/summarize_by_time.html | 6 + docs/_site/reference/theme_timetk.html | 6 + docs/_site/reference/time_scale_template.html | 6 + .../timeseries_unit_frequency_table.html | 6 + docs/_site/reference/transform_columns.html | 6 + docs/_site/reference/ts_features.html | 6 + docs/_site/reference/ts_summary.html | 6 + docs/_site/reference/week_of_month.html | 6 + docs/_site/search.json | 221 ++- docs/_site/sitemap.xml | 192 +-- docs/_site/tutorials/01_sales_crm.html | 12 +- docs/_site/tutorials/02_finance.html | 6 + .../tutorials/03_demand_forecasting.html | 6 + .../_site/tutorials/04_anomaly_detection.html | 6 + docs/_site/tutorials/05_clustering.html | 6 + .../_site/tutorials/06_correlationfunnel.html | 6 + docs/guides/07_timeseries_crossvalidation.qmd | 182 ++- 92 files changed, 2248 insertions(+), 200 deletions(-) create mode 100644 docs/_site/guides/07_timeseries_crossvalidation.html diff --git a/docs/_site/404.html b/docs/_site/404.html index e2ca6b48..25c6c31f 100644 --- a/docs/_site/404.html +++ b/docs/_site/404.html @@ -204,6 +204,12 @@ Anomaly Detection + + diff --git a/docs/_site/changelog-news.html b/docs/_site/changelog-news.html index 1a9c276d..55bcd52f 100644 --- a/docs/_site/changelog-news.html +++ b/docs/_site/changelog-news.html @@ -204,6 +204,12 @@ Anomaly Detection + + diff --git a/docs/_site/contributing.html b/docs/_site/contributing.html index 17ea6755..6aeb4a25 100644 --- a/docs/_site/contributing.html +++ b/docs/_site/contributing.html @@ -238,6 +238,12 @@ Anomaly Detection + + diff --git a/docs/_site/getting-started/01_installation.html b/docs/_site/getting-started/01_installation.html index 50aec1e8..735e0a2e 100644 --- a/docs/_site/getting-started/01_installation.html +++ b/docs/_site/getting-started/01_installation.html @@ -239,6 +239,12 @@ Anomaly Detection + + diff --git a/docs/_site/getting-started/02_quick_start.html b/docs/_site/getting-started/02_quick_start.html index dc8a0155..08e7059c 100644 --- a/docs/_site/getting-started/02_quick_start.html +++ b/docs/_site/getting-started/02_quick_start.html @@ -259,6 +259,12 @@ Anomaly Detection + + diff --git a/docs/_site/guides/01_visualization.html b/docs/_site/guides/01_visualization.html index c2e76f2f..b4f96b8d 100644 --- a/docs/_site/guides/01_visualization.html +++ b/docs/_site/guides/01_visualization.html @@ -265,6 +265,12 @@ Anomaly Detection + + diff --git a/docs/_site/guides/02_timetk_concepts.html b/docs/_site/guides/02_timetk_concepts.html index 20e5848a..35871600 100644 --- a/docs/_site/guides/02_timetk_concepts.html +++ b/docs/_site/guides/02_timetk_concepts.html @@ -244,6 +244,12 @@ Anomaly Detection + + diff --git a/docs/_site/guides/03_pandas_frequency.html b/docs/_site/guides/03_pandas_frequency.html index 4c1f8011..2fd286cf 100644 --- a/docs/_site/guides/03_pandas_frequency.html +++ b/docs/_site/guides/03_pandas_frequency.html @@ -243,6 +243,12 @@ Anomaly Detection + + diff --git a/docs/_site/guides/04_wrangling.html b/docs/_site/guides/04_wrangling.html index 3de70e36..7c4f1874 100644 --- a/docs/_site/guides/04_wrangling.html +++ b/docs/_site/guides/04_wrangling.html @@ -244,6 +244,12 @@ Anomaly Detection + + diff --git a/docs/_site/guides/05_augmenting.html b/docs/_site/guides/05_augmenting.html index 35249864..366772f5 100644 --- a/docs/_site/guides/05_augmenting.html +++ b/docs/_site/guides/05_augmenting.html @@ -260,6 +260,12 @@ Anomaly Detection + + diff --git 
a/docs/_site/guides/06_anomalize.html b/docs/_site/guides/06_anomalize.html index 3037e408..af8c7136 100644 --- a/docs/_site/guides/06_anomalize.html +++ b/docs/_site/guides/06_anomalize.html @@ -63,7 +63,7 @@ - + @@ -259,6 +259,12 @@ Anomaly Detection + + @@ -1243,8 +1249,8 @@

2 More Coming Soo diff --git a/docs/_site/guides/07_timeseries_crossvalidation.html b/docs/_site/guides/07_timeseries_crossvalidation.html new file mode 100644 index 00000000..3d76785f --- /dev/null +++ b/docs/_site/guides/07_timeseries_crossvalidation.html @@ -0,0 +1,1313 @@ + + + + + + + + + +pytimetk - Time Series Cross Validation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ +
+
+

Time Series Cross Validation

+
+ + + +
+ + + + +
+ + +
+ +
+

1 Time-Based Cross-Validation Using TimeSeriesCV and TimeSeriesCVSplitter

+

In this tutorial, you'll learn how to use the TimeSeriesCV and TimeSeriesCVSplitter classes from pytimetk for time series cross-validation, using the walmart_sales_df dataset, which contains 7 time series groups, as an example.

+
    +
  1. In Part 1, we'll start by exploring the data and move on to creating and visualizing time-based cross-validation splits. This will prepare you for the next section with Scikit Learn.

  2. In Part 2, we'll implement time series cross-validation with Scikit-Learn, engineer features, train a random forest model, and visualize the results in Python. By following this process, you can ensure a robust evaluation of your time series models and gain insights into their predictive performance.
+
+
+

2 Part 1: Getting Started with TimeSeriesCV

+

TimeSeriesCV generates multiple time series splits (or folds) for modeling and resampling, and it works with data that contains one or more time series groups.

+
+
+
+ +
+
+Using with Scikit Learn +
+
+
+
+
+

If you want a drop-in replacement for Scikit Learn's TimeSeriesSplit, use TimeSeriesCVSplitter(), discussed next. The splitter uses TimeSeriesCV under the hood.

+
+
+
+
+

2.1 Step 1: Load and Explore the Data

+

First, let's load the Walmart sales dataset and explore its structure:

+
+
# libraries
+import pytimetk as tk
+import pandas as pd
+import numpy as np
+
+# Import Data
+walmart_sales_df = tk.load_dataset('walmart_sales_weekly')
+
+walmart_sales_df['Date'] = pd.to_datetime(walmart_sales_df['Date'])
+
+walmart_sales_df = walmart_sales_df[['id', 'Date', 'Weekly_Sales']]
+
+walmart_sales_df.glimpse()
+
+
<class 'pandas.core.frame.DataFrame'>: 1001 rows of 3 columns
+id:            object            ['1_1', '1_1', '1_1', '1_1', '1_1', '1_ ...
+Date:          datetime64[ns]    [Timestamp('2010-02-05 00:00:00'), Time ...
+Weekly_Sales:  float64           [24924.5, 46039.49, 41595.55, 19403.54, ...
+
+
+
+
+

2.2 Step 2: Visualize the Time Series Data

+

We can visualize the weekly sales data for different store IDs using the plot_timeseries method from pytimetk:

+
+
walmart_sales_df \
+    .groupby('id') \
+    .plot_timeseries(
+        "Date", "Weekly_Sales",
+        plotly_dropdown = True,
+    )
+
+ +
+
+
+

This will generate an interactive time series plot, allowing you to explore sales data for different stores using a dropdown.

+
+
+

2.3 Step 3: Set Up TimeSeriesCV for Cross-Validation

+

Now, let's set up a time-based cross-validation scheme using TimeSeriesCV:

+
+
from pytimetk.crossvalidation import TimeSeriesCV
+
+# Define parameters for TimeSeriesCV
+tscv = TimeSeriesCV(
+    frequency="weeks",
+    train_size=52,          # Use 52 weeks for training
+    forecast_horizon=12,    # Forecast 12 weeks ahead
+    gap=0,                  # No gap between training and forecast sets
+    stride=4,               # Move forward by 4 weeks after each split
+    window="rolling",       # Use a rolling window
+    mode="backward"         # Generate splits from end to start
+)
+
+# Glimpse the cross-validation splits
+tscv.glimpse(
+    walmart_sales_df['Weekly_Sales'], 
+    time_series=walmart_sales_df['Date']
+)
+
+
Split Number: 1
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2011-08-05 00:00:00 to 2012-07-27 00:00:00
+Forecast Period: 2012-08-03 00:00:00 to 2012-10-19 00:00:00
+
+Split Number: 2
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2011-07-08 00:00:00 to 2012-06-29 00:00:00
+Forecast Period: 2012-07-06 00:00:00 to 2012-09-21 00:00:00
+
+Split Number: 3
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2011-06-10 00:00:00 to 2012-06-01 00:00:00
+Forecast Period: 2012-06-08 00:00:00 to 2012-08-24 00:00:00
+
+Split Number: 4
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2011-05-13 00:00:00 to 2012-05-04 00:00:00
+Forecast Period: 2012-05-11 00:00:00 to 2012-07-27 00:00:00
+
+Split Number: 5
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2011-04-15 00:00:00 to 2012-04-06 00:00:00
+Forecast Period: 2012-04-13 00:00:00 to 2012-06-29 00:00:00
+
+Split Number: 6
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2011-03-18 00:00:00 to 2012-03-09 00:00:00
+Forecast Period: 2012-03-16 00:00:00 to 2012-06-01 00:00:00
+
+Split Number: 7
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2011-02-18 00:00:00 to 2012-02-10 00:00:00
+Forecast Period: 2012-02-17 00:00:00 to 2012-05-04 00:00:00
+
+Split Number: 8
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2011-01-21 00:00:00 to 2012-01-13 00:00:00
+Forecast Period: 2012-01-20 00:00:00 to 2012-04-06 00:00:00
+
+Split Number: 9
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-12-24 00:00:00 to 2011-12-16 00:00:00
+Forecast Period: 2011-12-23 00:00:00 to 2012-03-09 00:00:00
+
+Split Number: 10
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-11-26 00:00:00 to 2011-11-18 00:00:00
+Forecast Period: 2011-11-25 00:00:00 to 2012-02-10 00:00:00
+
+Split Number: 11
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-10-29 00:00:00 to 2011-10-21 00:00:00
+Forecast Period: 2011-10-28 00:00:00 to 2012-01-13 00:00:00
+
+Split Number: 12
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-10-01 00:00:00 to 2011-09-23 00:00:00
+Forecast Period: 2011-09-30 00:00:00 to 2011-12-16 00:00:00
+
+Split Number: 13
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-09-03 00:00:00 to 2011-08-26 00:00:00
+Forecast Period: 2011-09-02 00:00:00 to 2011-11-18 00:00:00
+
+Split Number: 14
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-08-06 00:00:00 to 2011-07-29 00:00:00
+Forecast Period: 2011-08-05 00:00:00 to 2011-10-21 00:00:00
+
+Split Number: 15
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-07-09 00:00:00 to 2011-07-01 00:00:00
+Forecast Period: 2011-07-08 00:00:00 to 2011-09-23 00:00:00
+
+Split Number: 16
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-06-11 00:00:00 to 2011-06-03 00:00:00
+Forecast Period: 2011-06-10 00:00:00 to 2011-08-26 00:00:00
+
+Split Number: 17
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-05-14 00:00:00 to 2011-05-06 00:00:00
+Forecast Period: 2011-05-13 00:00:00 to 2011-07-29 00:00:00
+
+Split Number: 18
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-04-16 00:00:00 to 2011-04-08 00:00:00
+Forecast Period: 2011-04-15 00:00:00 to 2011-07-01 00:00:00
+
+Split Number: 19
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-03-19 00:00:00 to 2011-03-11 00:00:00
+Forecast Period: 2011-03-18 00:00:00 to 2011-06-03 00:00:00
+
+Split Number: 20
+Train Shape: (364,), Forecast Shape: (84,)
+Train Period: 2010-02-19 00:00:00 to 2011-02-11 00:00:00
+Forecast Period: 2011-02-18 00:00:00 to 2011-05-06 00:00:00
+
+
+
+

The glimpse method provides a summary of each cross-validation fold, including the start and end dates of the training and forecast periods.
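The folds can also be consumed directly in code. Below is a minimal sketch, assuming tscv.split() yields the train and forecast slices of the array passed in (mirroring the shapes reported by glimpse above):

# Optional: iterate the folds directly (shapes should match the glimpse output)
for i, (y_train, y_forecast) in enumerate(
    tscv.split(
        walmart_sales_df['Weekly_Sales'],
        time_series=walmart_sales_df['Date']
    ),
    start=1
):
    print(f"Split {i}: train={y_train.shape}, forecast={y_forecast.shape}")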

+
+
+

2.4 Step 4: Plot the Cross-Validation Splits

+

You can visualize how the data is split for training and testing:

+
+
# Plot the cross-validation splits
+tscv.plot(
+    walmart_sales_df['Weekly_Sales'], 
+    time_series=walmart_sales_df['Date']
+)
+
+ +
+
+
+

This plot will show each fold, illustrating which weeks are used for training and which weeks are used for forecasting.

+
+
+
+

3 Part 2: Using TimeSeriesCVSplitter for Model Evaluation with Scikit Learn

+

When evaluating a model's predictive performance on time series data, we need to split the data in a way that respects the temporal order. Within the Scikit Learn framework, we use a custom splitter, TimeSeriesCVSplitter, from the pytimetk library to handle this.

+
+

3.1 Step 1: Setting Up the TimeSeriesCVSplitter

+

The TimeSeriesCVSplitter helps us divide our dataset into training and forecast sets in a rolling window fashion. Here's how we configure it:

+
+
from pytimetk.crossvalidation import TimeSeriesCVSplitter
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import cross_val_score
+
+# Set up TimeSeriesCVSplitter
+cv_splitter = TimeSeriesCVSplitter(
+    time_series=walmart_sales_df['Date'],
+    frequency="weeks",
+    train_size=52*2,
+    forecast_horizon=12,
+    gap=0,
+    stride=4,
+    window="rolling",
+    mode="backward",
+    split_limit = 5
+)
+
+# Visualize the TSCV Strategy
+cv_splitter.splitter.plot(walmart_sales_df['Weekly_Sales'], walmart_sales_df['Date'])
+
+ +
+
+
+

The TimeSeriesCVSplitter creates multiple splits of the time series data, allowing us to validate the model across different periods. By visualizing the cross-validation strategy, we can see how the training and forecast sets are structured.
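Because TimeSeriesCVSplitter follows the scikit-learn cross-validator interface, it can be passed straight to utilities such as cross_val_score (used in Step 3 below). As a quick sanity check, the sketch below assumes get_n_splits() can be called with no arguments since the folds are fixed by the time_series supplied at construction:

# Number of folds produced by the splitter (should equal split_limit = 5)
print("Number of folds:", cv_splitter.get_n_splits())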

+
+
+

3.2 Step 2: Feature Engineering for Time Series Data

+

Effective feature engineering can significantly impact the performance of a time series model. Using pytimetk, we extract a variety of features from the Date column.

+
+

Generating Time Series Features

+

We use get_timeseries_signature to generate useful features, such as year, quarter, month, and day-of-week indicators.

+
+
# Prepare data for modeling
+
+# Extract time series features from the 'Date' column
+X_time_features = tk.get_timeseries_signature(walmart_sales_df['Date'])
+
+# Select features to dummy encode
+features_to_dummy = ['Date_quarteryear', 'Date_month_lbl', 'Date_wday_lbl', 'Date_am_pm']
+
+# Dummy encode the selected features
+X_time_dummies = pd.get_dummies(X_time_features[features_to_dummy], drop_first=True)
+
+# Dummy encode the 'id' column
+X_id_dummies = pd.get_dummies(walmart_sales_df['id'], prefix='store')
+
+# Combine the time series features, dummy-encoded features, and the 'id' dummies
+X = pd.concat([X_time_features, X_time_dummies, X_id_dummies], axis=1)
+
+# Drop the original categorical columns that were dummy encoded
+X = X.drop(columns=features_to_dummy).drop('Date', axis=1)
+
+# Set the target variable
+y = walmart_sales_df['Weekly_Sales'].values
+
+
+
+
+
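Before training, a quick optional inspection of the engineered feature matrix can catch issues such as unexpected column counts or leftover non-numeric columns:

# Optional sanity check of the feature matrix
print(X.shape)
X.glimpse()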

3.3 Step 3: Model Training and Evaluation with Random Forest

+

For this example, we use RandomForestRegressor from scikit-learn to model the time series data. A random forest is a robust, ensemble-based model that can handle a wide range of regression tasks.

+
+
# Initialize the RandomForestRegressor model
+model = RandomForestRegressor(
+    n_estimators=100,      # Number of trees in the forest
+    max_depth=None,        # Maximum depth of the trees (None means nodes are expanded until all leaves are pure)
+    random_state=42        # Set a random state for reproducibility
+)
+
+# Evaluate the model using cross-validation scores
+scores = cross_val_score(model, X, y, cv=cv_splitter, scoring='neg_mean_squared_error')
+
+# Print cross-validation scores
+print("Cross-Validation Scores (Negative MSE):", scores)
+
+
Cross-Validation Scores (Negative MSE): [-23761708.80112538 -23107644.58461143 -21728878.18790144
+ -25113860.93913386 -86192034.48953015]
+
+
+
+
+
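The scores above are negative mean squared errors (scikit-learn's convention for error metrics). As a small post-processing sketch, they can be converted to RMSE in the original Weekly_Sales units for easier interpretation:

# Convert negative MSE scores to RMSE (same units as Weekly_Sales)
rmse_scores = np.sqrt(-scores)
print("Cross-Validation RMSE per fold:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())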

3.4 Step 4: Visualizing the Forecast

+

Visualization is crucial to understand how well the model predicts future values. We collect the actual and predicted values for each fold and combine them for easy plotting.

+
+
# Lists to store the combined data
+combined_data = []
+
+# Iterate through each fold and collect the data
+for i, (train_index, test_index) in enumerate(cv_splitter.split(X, y), start=1):
+    # Get the training and forecast data from the original DataFrame
+    train_df = walmart_sales_df.iloc[train_index].copy()
+    test_df = walmart_sales_df.iloc[test_index].copy()
+    
+    # Fit the model on the training data
+    model.fit(X.iloc[train_index], y[train_index])
+    
+    # Predict on the test set
+    y_pred = model.predict(X.iloc[test_index])
+    
+    # Add the actual and predicted values
+    train_df['Actual'] = y[train_index]
+    train_df['Predicted'] = None  # No predictions for training data
+    train_df['Fold'] = i  # Indicate the current fold
+    
+    test_df['Actual'] = y[test_index]
+    test_df['Predicted'] = y_pred  # Predictions for the test data
+    test_df['Fold'] = i  # Indicate the current fold
+    
+    # Append both the training and forecast DataFrames to the combined data list
+    combined_data.extend([train_df, test_df])
+
+# Combine all the data into a single DataFrame
+full_forecast_df = pd.concat(combined_data, ignore_index=True)
+
+full_forecast_df = full_forecast_df[['id', 'Date', 'Actual', 'Predicted', 'Fold']]
+
+full_forecast_df.glimpse()
+
+
<class 'pandas.core.frame.DataFrame'>: 4060 rows of 5 columns
+id:         object            ['1_1', '1_1', '1_1', '1_1', '1_1', '1_1', ...
+Date:       datetime64[ns]    [Timestamp('2010-08-06 00:00:00'), Timesta ...
+Actual:     float64           [17508.41, 15536.4, 15740.13, 15793.87, 16 ...
+Predicted:  float64           [nan, nan, nan, nan, nan, nan, nan, nan, n ...
+Fold:       int64             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+
+
+
+

Preparing Data for Visualization

+

To make the data easier to plot, we use pd.melt() to transform the Actual and Predicted columns into a long format.

+
+
# Melt the Actual and Predicted columns
+melted_df = pd.melt(
+    full_forecast_df,
+    id_vars=['id', 'Date', 'Fold'],  # Columns to keep
+    value_vars=['Actual', 'Predicted'],  # Columns to melt
+    var_name='Type',  # Name for the new column indicating 'Actual' or 'Predicted'
+    value_name='Value'  # Name for the new column with the values
+)
+
+melted_df["unique_id"] = "ID_" + melted_df['id'] + "-Fold_" + melted_df["Fold"].astype(str)
+
+melted_df.glimpse()
+
+
<class 'pandas.core.frame.DataFrame'>: 8120 rows of 6 columns
+id:         object            ['1_1', '1_1', '1_1', '1_1', '1_1', '1_1', ...
+Date:       datetime64[ns]    [Timestamp('2010-08-06 00:00:00'), Timesta ...
+Fold:       int64             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Type:       object            ['Actual', 'Actual', 'Actual', 'Actual', ' ...
+Value:      float64           [17508.41, 15536.4, 15740.13, 15793.87, 16 ...
+unique_id:  object            ['ID_1_1-Fold_1', 'ID_1_1-Fold_1', 'ID_1_1 ...
+
+
+
+
+

Plotting the Forecasts

+

Finally, we use plot_timeseries() to visualize the forecasts, comparing the actual and predicted values for each fold.

+
+
melted_df \
+    .groupby('unique_id') \
+    .plot_timeseries(
+        "Date", "Value",
+        color_column = "Type",
+        smooth=False, 
+        plotly_dropdown=True
+    )
+
+ +
+
+
+
+
+
+
+

4 Conclusion

+

This guide demonstrated how to implement time series cross-validation, engineer features, train a random forest model, and visualize the results in Python. By following this process, you can ensure a robust evaluation of your time series models and gain insights into their predictive performance. Happy modeling!

+
+
+

5 More Coming Soon…

+

We are in the early stages of development, but the potential of pytimetk in Python is already clear. 🐍

+ + + +
+ +
+ + +
+ + + + \ No newline at end of file diff --git a/docs/_site/index.html b/docs/_site/index.html index a4b7fc51..f0b5c7e0 100644 --- a/docs/_site/index.html +++ b/docs/_site/index.html @@ -263,6 +263,12 @@ Anomaly Detection + + diff --git a/docs/_site/performance/01_speed_comparisons.html b/docs/_site/performance/01_speed_comparisons.html index 04b2790f..3fa05677 100644 --- a/docs/_site/performance/01_speed_comparisons.html +++ b/docs/_site/performance/01_speed_comparisons.html @@ -245,6 +245,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/TimeSeriesCV.html b/docs/_site/reference/TimeSeriesCV.html index 1d389095..fdaab411 100644 --- a/docs/_site/reference/TimeSeriesCV.html +++ b/docs/_site/reference/TimeSeriesCV.html @@ -251,6 +251,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/TimeSeriesCVSplitter.html b/docs/_site/reference/TimeSeriesCVSplitter.html index 4ea270c9..25b007f8 100644 --- a/docs/_site/reference/TimeSeriesCVSplitter.html +++ b/docs/_site/reference/TimeSeriesCVSplitter.html @@ -251,6 +251,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/anomalize.html b/docs/_site/reference/anomalize.html index 3c53f047..4cb30d41 100644 --- a/docs/_site/reference/anomalize.html +++ b/docs/_site/reference/anomalize.html @@ -252,6 +252,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/apply_by_time.html b/docs/_site/reference/apply_by_time.html index a7a46d36..58141fab 100644 --- a/docs/_site/reference/apply_by_time.html +++ b/docs/_site/reference/apply_by_time.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_atr.html b/docs/_site/reference/augment_atr.html index 5d7b94b9..6ea1141d 100644 --- a/docs/_site/reference/augment_atr.html +++ b/docs/_site/reference/augment_atr.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_bbands.html b/docs/_site/reference/augment_bbands.html index 612153ca..f6dac719 100644 --- a/docs/_site/reference/augment_bbands.html +++ b/docs/_site/reference/augment_bbands.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_cmo.html b/docs/_site/reference/augment_cmo.html index 2f45c771..7d2e1027 100644 --- a/docs/_site/reference/augment_cmo.html +++ b/docs/_site/reference/augment_cmo.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_diffs.html b/docs/_site/reference/augment_diffs.html index 7a74155f..2d897179 100644 --- a/docs/_site/reference/augment_diffs.html +++ b/docs/_site/reference/augment_diffs.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_ewm.html b/docs/_site/reference/augment_ewm.html index d45e064e..d2b6d3b4 100644 --- a/docs/_site/reference/augment_ewm.html +++ b/docs/_site/reference/augment_ewm.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_expanding.html b/docs/_site/reference/augment_expanding.html index 451b5a79..3d51afe3 100644 --- a/docs/_site/reference/augment_expanding.html +++ b/docs/_site/reference/augment_expanding.html @@ -236,6 +236,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_expanding_apply.html b/docs/_site/reference/augment_expanding_apply.html index a9f23799..ce7ceb30 100644 --- a/docs/_site/reference/augment_expanding_apply.html +++ b/docs/_site/reference/augment_expanding_apply.html @@ -236,6 +236,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_fourier.html b/docs/_site/reference/augment_fourier.html index 
87293b2d..fb7bb4ae 100644 --- a/docs/_site/reference/augment_fourier.html +++ b/docs/_site/reference/augment_fourier.html @@ -251,6 +251,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_hilbert.html b/docs/_site/reference/augment_hilbert.html index a1e83c3e..1ab87cb2 100644 --- a/docs/_site/reference/augment_hilbert.html +++ b/docs/_site/reference/augment_hilbert.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_holiday_signature.html b/docs/_site/reference/augment_holiday_signature.html index a6e821b2..817a7859 100644 --- a/docs/_site/reference/augment_holiday_signature.html +++ b/docs/_site/reference/augment_holiday_signature.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_lags.html b/docs/_site/reference/augment_lags.html index e1d688f3..1dc99fe0 100644 --- a/docs/_site/reference/augment_lags.html +++ b/docs/_site/reference/augment_lags.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_leads.html b/docs/_site/reference/augment_leads.html index f5c97382..8af645e7 100644 --- a/docs/_site/reference/augment_leads.html +++ b/docs/_site/reference/augment_leads.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_macd.html b/docs/_site/reference/augment_macd.html index b07b755d..cdbcba5a 100644 --- a/docs/_site/reference/augment_macd.html +++ b/docs/_site/reference/augment_macd.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_pct_change.html b/docs/_site/reference/augment_pct_change.html index 0f24a459..928d6713 100644 --- a/docs/_site/reference/augment_pct_change.html +++ b/docs/_site/reference/augment_pct_change.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_ppo.html b/docs/_site/reference/augment_ppo.html index 58e98c87..e12dd7d7 100644 --- a/docs/_site/reference/augment_ppo.html +++ b/docs/_site/reference/augment_ppo.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_qsmomentum.html b/docs/_site/reference/augment_qsmomentum.html index 96a381cd..e0e8bf87 100644 --- a/docs/_site/reference/augment_qsmomentum.html +++ b/docs/_site/reference/augment_qsmomentum.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_roc.html b/docs/_site/reference/augment_roc.html index bb62e27f..36259e17 100644 --- a/docs/_site/reference/augment_roc.html +++ b/docs/_site/reference/augment_roc.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_rolling.html b/docs/_site/reference/augment_rolling.html index fe324b75..cb13b0ff 100644 --- a/docs/_site/reference/augment_rolling.html +++ b/docs/_site/reference/augment_rolling.html @@ -236,6 +236,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_rolling_apply.html b/docs/_site/reference/augment_rolling_apply.html index 80466799..6e811968 100644 --- a/docs/_site/reference/augment_rolling_apply.html +++ b/docs/_site/reference/augment_rolling_apply.html @@ -236,6 +236,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_rsi.html b/docs/_site/reference/augment_rsi.html index 982a08d8..d1d215fe 100644 --- a/docs/_site/reference/augment_rsi.html +++ b/docs/_site/reference/augment_rsi.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_timeseries_signature.html b/docs/_site/reference/augment_timeseries_signature.html index 
d082bc45..8ddeff51 100644 --- a/docs/_site/reference/augment_timeseries_signature.html +++ b/docs/_site/reference/augment_timeseries_signature.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/augment_wavelet.html b/docs/_site/reference/augment_wavelet.html index 6e997eb2..2ee2cb73 100644 --- a/docs/_site/reference/augment_wavelet.html +++ b/docs/_site/reference/augment_wavelet.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/binarize.html b/docs/_site/reference/binarize.html index a3585b96..c8589d9a 100644 --- a/docs/_site/reference/binarize.html +++ b/docs/_site/reference/binarize.html @@ -251,6 +251,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/ceil_date.html b/docs/_site/reference/ceil_date.html index 078d3d36..24622811 100644 --- a/docs/_site/reference/ceil_date.html +++ b/docs/_site/reference/ceil_date.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/correlate.html b/docs/_site/reference/correlate.html index b000f571..139146df 100644 --- a/docs/_site/reference/correlate.html +++ b/docs/_site/reference/correlate.html @@ -251,6 +251,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/drop_zero_variance.html b/docs/_site/reference/drop_zero_variance.html index 015e420f..545a39d7 100644 --- a/docs/_site/reference/drop_zero_variance.html +++ b/docs/_site/reference/drop_zero_variance.html @@ -198,6 +198,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/filter_by_time.html b/docs/_site/reference/filter_by_time.html index 82d8c422..62291247 100644 --- a/docs/_site/reference/filter_by_time.html +++ b/docs/_site/reference/filter_by_time.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/flatten_multiindex_column_names.html b/docs/_site/reference/flatten_multiindex_column_names.html index edc722d1..cb2f9fab 100644 --- a/docs/_site/reference/flatten_multiindex_column_names.html +++ b/docs/_site/reference/flatten_multiindex_column_names.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/floor_date.html b/docs/_site/reference/floor_date.html index a7349b55..e885e120 100644 --- a/docs/_site/reference/floor_date.html +++ b/docs/_site/reference/floor_date.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/future_frame.html b/docs/_site/reference/future_frame.html index f3a92130..4a66c1f4 100644 --- a/docs/_site/reference/future_frame.html +++ b/docs/_site/reference/future_frame.html @@ -236,6 +236,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_available_datasets.html b/docs/_site/reference/get_available_datasets.html index ca2bc720..7186ded9 100644 --- a/docs/_site/reference/get_available_datasets.html +++ b/docs/_site/reference/get_available_datasets.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_date_summary.html b/docs/_site/reference/get_date_summary.html index 17a7ab1e..2ecff050 100644 --- a/docs/_site/reference/get_date_summary.html +++ b/docs/_site/reference/get_date_summary.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_diff_summary.html b/docs/_site/reference/get_diff_summary.html index b03348d9..2ece5068 100644 --- a/docs/_site/reference/get_diff_summary.html +++ b/docs/_site/reference/get_diff_summary.html @@ -198,6 +198,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_frequency.html b/docs/_site/reference/get_frequency.html index b56a6c4e..864f1f35 
100644 --- a/docs/_site/reference/get_frequency.html +++ b/docs/_site/reference/get_frequency.html @@ -198,6 +198,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_frequency_summary.html b/docs/_site/reference/get_frequency_summary.html index 561ef4fe..54f56b8e 100644 --- a/docs/_site/reference/get_frequency_summary.html +++ b/docs/_site/reference/get_frequency_summary.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_holiday_signature.html b/docs/_site/reference/get_holiday_signature.html index 1f114644..62a22661 100644 --- a/docs/_site/reference/get_holiday_signature.html +++ b/docs/_site/reference/get_holiday_signature.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_pandas_frequency.html b/docs/_site/reference/get_pandas_frequency.html index a16d8da7..6562bb07 100644 --- a/docs/_site/reference/get_pandas_frequency.html +++ b/docs/_site/reference/get_pandas_frequency.html @@ -198,6 +198,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_seasonal_frequency.html b/docs/_site/reference/get_seasonal_frequency.html index e59dda1b..95fbd890 100644 --- a/docs/_site/reference/get_seasonal_frequency.html +++ b/docs/_site/reference/get_seasonal_frequency.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_timeseries_signature.html b/docs/_site/reference/get_timeseries_signature.html index 21c079bc..fde01682 100644 --- a/docs/_site/reference/get_timeseries_signature.html +++ b/docs/_site/reference/get_timeseries_signature.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/get_trend_frequency.html b/docs/_site/reference/get_trend_frequency.html index 6eb6b965..0da1f6d3 100644 --- a/docs/_site/reference/get_trend_frequency.html +++ b/docs/_site/reference/get_trend_frequency.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/glimpse.html b/docs/_site/reference/glimpse.html index d38c009b..1d66e49e 100644 --- a/docs/_site/reference/glimpse.html +++ b/docs/_site/reference/glimpse.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/index.html b/docs/_site/reference/index.html index 600cdb05..64365421 100644 --- a/docs/_site/reference/index.html +++ b/docs/_site/reference/index.html @@ -198,6 +198,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/is_holiday.html b/docs/_site/reference/is_holiday.html index b120ca5e..a0786a9d 100644 --- a/docs/_site/reference/is_holiday.html +++ b/docs/_site/reference/is_holiday.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/load_dataset.html b/docs/_site/reference/load_dataset.html index 6824d816..a3610311 100644 --- a/docs/_site/reference/load_dataset.html +++ b/docs/_site/reference/load_dataset.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/make_future_timeseries.html b/docs/_site/reference/make_future_timeseries.html index 348f4dd3..1ec23777 100644 --- a/docs/_site/reference/make_future_timeseries.html +++ b/docs/_site/reference/make_future_timeseries.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/make_weekday_sequence.html b/docs/_site/reference/make_weekday_sequence.html index bd6e3324..6a177704 100644 --- a/docs/_site/reference/make_weekday_sequence.html +++ b/docs/_site/reference/make_weekday_sequence.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/make_weekend_sequence.html 
b/docs/_site/reference/make_weekend_sequence.html index 08b0bf9f..7ae118be 100644 --- a/docs/_site/reference/make_weekend_sequence.html +++ b/docs/_site/reference/make_weekend_sequence.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/pad_by_time.html b/docs/_site/reference/pad_by_time.html index 4ecafce2..5149e52b 100644 --- a/docs/_site/reference/pad_by_time.html +++ b/docs/_site/reference/pad_by_time.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/palette_timetk.html b/docs/_site/reference/palette_timetk.html index 0e0f2d09..bc5d1197 100644 --- a/docs/_site/reference/palette_timetk.html +++ b/docs/_site/reference/palette_timetk.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/parallel_apply.html b/docs/_site/reference/parallel_apply.html index b37e029d..69d33f90 100644 --- a/docs/_site/reference/parallel_apply.html +++ b/docs/_site/reference/parallel_apply.html @@ -236,6 +236,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/plot_anomalies.html b/docs/_site/reference/plot_anomalies.html index c21c8f5d..3e1f5d35 100644 --- a/docs/_site/reference/plot_anomalies.html +++ b/docs/_site/reference/plot_anomalies.html @@ -252,6 +252,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/plot_anomalies_cleaned.html b/docs/_site/reference/plot_anomalies_cleaned.html index 4c04fc3b..2efe63b8 100644 --- a/docs/_site/reference/plot_anomalies_cleaned.html +++ b/docs/_site/reference/plot_anomalies_cleaned.html @@ -252,6 +252,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/plot_anomalies_decomp.html b/docs/_site/reference/plot_anomalies_decomp.html index 0b91373c..9f8fc805 100644 --- a/docs/_site/reference/plot_anomalies_decomp.html +++ b/docs/_site/reference/plot_anomalies_decomp.html @@ -252,6 +252,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/plot_anomaly_decomp.html b/docs/_site/reference/plot_anomaly_decomp.html index 4e77cfab..44e82f52 100644 --- a/docs/_site/reference/plot_anomaly_decomp.html +++ b/docs/_site/reference/plot_anomaly_decomp.html @@ -252,6 +252,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/plot_correlation_funnel.html b/docs/_site/reference/plot_correlation_funnel.html index 004c5a94..18ab1705 100644 --- a/docs/_site/reference/plot_correlation_funnel.html +++ b/docs/_site/reference/plot_correlation_funnel.html @@ -251,6 +251,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/plot_timeseries.html b/docs/_site/reference/plot_timeseries.html index 500e3b90..0986252c 100644 --- a/docs/_site/reference/plot_timeseries.html +++ b/docs/_site/reference/plot_timeseries.html @@ -251,6 +251,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/progress_apply.html b/docs/_site/reference/progress_apply.html index 31b27c8b..e0a2294d 100644 --- a/docs/_site/reference/progress_apply.html +++ b/docs/_site/reference/progress_apply.html @@ -236,6 +236,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/summarize_by_time.html b/docs/_site/reference/summarize_by_time.html index 746de401..e5ff5341 100644 --- a/docs/_site/reference/summarize_by_time.html +++ b/docs/_site/reference/summarize_by_time.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/theme_timetk.html b/docs/_site/reference/theme_timetk.html index 2aa84568..55e997d7 100644 --- a/docs/_site/reference/theme_timetk.html +++ b/docs/_site/reference/theme_timetk.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git 
a/docs/_site/reference/time_scale_template.html b/docs/_site/reference/time_scale_template.html index 2a7131bc..1cebf816 100644 --- a/docs/_site/reference/time_scale_template.html +++ b/docs/_site/reference/time_scale_template.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/timeseries_unit_frequency_table.html b/docs/_site/reference/timeseries_unit_frequency_table.html index 3d0301b6..92ba2578 100644 --- a/docs/_site/reference/timeseries_unit_frequency_table.html +++ b/docs/_site/reference/timeseries_unit_frequency_table.html @@ -235,6 +235,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/transform_columns.html b/docs/_site/reference/transform_columns.html index c578b4db..1c378911 100644 --- a/docs/_site/reference/transform_columns.html +++ b/docs/_site/reference/transform_columns.html @@ -198,6 +198,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/ts_features.html b/docs/_site/reference/ts_features.html index 0d36f831..30b9d5fc 100644 --- a/docs/_site/reference/ts_features.html +++ b/docs/_site/reference/ts_features.html @@ -236,6 +236,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/ts_summary.html b/docs/_site/reference/ts_summary.html index c3a6c901..ddeafa77 100644 --- a/docs/_site/reference/ts_summary.html +++ b/docs/_site/reference/ts_summary.html @@ -236,6 +236,12 @@ Anomaly Detection + + diff --git a/docs/_site/reference/week_of_month.html b/docs/_site/reference/week_of_month.html index 4be92dc5..6e576d2b 100644 --- a/docs/_site/reference/week_of_month.html +++ b/docs/_site/reference/week_of_month.html @@ -232,6 +232,12 @@ Anomaly Detection + + diff --git a/docs/_site/search.json b/docs/_site/search.json index 83e50c12..7949ee54 100644 --- a/docs/_site/search.json +++ b/docs/_site/search.json @@ -1400,102 +1400,67 @@ "text": "3.2 Pad by Time with Grouped Time Series\npad_by_time() can also be used with grouped time series. 
Letā€™s use the stocks_daily dataset to showcase an example:\n\n\nCode\n# load dataset\nstocks_df = tk.load_dataset('stocks_daily', parse_dates = ['date'])\n\n# pad by time\nstocks_df \\\n .groupby('symbol') \\\n .pad_by_time(\n date_column = 'date',\n freq = 'D'\n ) \\\n .assign(id = lambda x: x['symbol'].ffill())\n\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nid\n\n\n\n\n0\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\nAAPL\n\n\n1\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\nAAPL\n\n\n2\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\nAAPL\n\n\n3\nAAPL\n2013-01-05\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nAAPL\n\n\n4\nAAPL\n2013-01-06\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nAAPL\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n23485\nNVDA\n2023-09-17\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNVDA\n\n\n23486\nNVDA\n2023-09-18\n427.480011\n442.420013\n420.000000\n439.660004\n50027100.0\n439.660004\nNVDA\n\n\n23487\nNVDA\n2023-09-19\n438.329987\n439.660004\n430.019989\n435.200012\n37306400.0\n435.200012\nNVDA\n\n\n23488\nNVDA\n2023-09-20\n436.000000\n439.029999\n422.230011\n422.390015\n36710800.0\n422.390015\nNVDA\n\n\n23489\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000.0\n410.170013\nNVDA\n\n\n\n\n23490 rows Ɨ 9 columns\n\n\n\nTo replace NaN with 0 in a dataframe with multiple columns:\n\n\nCode\nfrom functools import partial\n\n# columns to replace NaN with 0\ncols_to_fill = ['open', 'high', 'low', 'close', 'volume', 'adjusted']\n\n# define a function to fillna\ndef fill_na_col(df, col):\n return df[col].fillna(0)\n\n# pad by time and replace NaN with 0\nstocks_df \\\n .groupby('symbol') \\\n .pad_by_time(\n date_column = 'date',\n freq = 'D'\n ) \\\n .assign(id = lambda x: x['symbol'].ffill()) \\\n .assign(**{col: partial(fill_na_col, col=col) for col in cols_to_fill})\n\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nid\n\n\n\n\n0\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\nAAPL\n\n\n1\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\nAAPL\n\n\n2\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\nAAPL\n\n\n3\nAAPL\n2013-01-05\n0.000000\n0.000000\n0.000000\n0.000000\n0.0\n0.000000\nAAPL\n\n\n4\nAAPL\n2013-01-06\n0.000000\n0.000000\n0.000000\n0.000000\n0.0\n0.000000\nAAPL\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n23485\nNVDA\n2023-09-17\n0.000000\n0.000000\n0.000000\n0.000000\n0.0\n0.000000\nNVDA\n\n\n23486\nNVDA\n2023-09-18\n427.480011\n442.420013\n420.000000\n439.660004\n50027100.0\n439.660004\nNVDA\n\n\n23487\nNVDA\n2023-09-19\n438.329987\n439.660004\n430.019989\n435.200012\n37306400.0\n435.200012\nNVDA\n\n\n23488\nNVDA\n2023-09-20\n436.000000\n439.029999\n422.230011\n422.390015\n36710800.0\n422.390015\nNVDA\n\n\n23489\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000.0\n410.170013\nNVDA\n\n\n\n\n23490 rows Ɨ 9 columns" }, { - "objectID": "guides/01_visualization.html", - "href": "guides/01_visualization.html", - "title": "Data Visualization", + "objectID": "guides/07_timeseries_crossvalidation.html", + "href": "guides/07_timeseries_crossvalidation.html", + "title": "Time Series Cross Validation", "section": "", - "text": "How this guide benefits you\n\n\n\n\n\nThis guide covers how to use the plot_timeseries() for data visualization. 
Once you understand how it works, you can apply explore time series data easier than ever.\nThis tutorial focuses on, plot_timeseries(), a workhorse time-series plotting function that:" + "text": "In this tutorial, youā€™ll learn how to use the TimeSeriesCV and TimeSeriesCVSplitter classes from pytimetk for time series cross-validation, using the walmart_sales_df dataset as an example, which contains 7 time series groups.\n\nIn Part 1, weā€™ll start with exploring the data and move on to creating and visualizing time-based cross-validation splits. This will prepare you for the next section with Scikit Learn.\nIn Part 2, weā€™ll implement time series cross-validation with Scikit-Learn, engineer features, train a random forest model, and visualize the results in Python. By following this process, you can ensure a robust evaluation of your time series models and gain insights into their predictive performance." }, { - "objectID": "guides/01_visualization.html#plotting-a-single-time-series", - "href": "guides/01_visualization.html#plotting-a-single-time-series", - "title": "Data Visualization", - "section": "2.1 Plotting a Single Time Series", - "text": "2.1 Plotting a Single Time Series\nLetā€™s start with a popular time series, taylor_30_min, which includes energy demand in megawatts at a sampling interval of 30-minutes. This is a single time series.\n\n\nCode\n# Import a Time Series Data Set\ntaylor_30_min = tk.load_dataset(\"taylor_30_min\", parse_dates = ['date'])\ntaylor_30_min\n\n\n\n\n\n\n\n\n\ndate\nvalue\n\n\n\n\n0\n2000-06-05 00:00:00+00:00\n22262\n\n\n1\n2000-06-05 00:30:00+00:00\n21756\n\n\n2\n2000-06-05 01:00:00+00:00\n22247\n\n\n3\n2000-06-05 01:30:00+00:00\n22759\n\n\n4\n2000-06-05 02:00:00+00:00\n22549\n\n\n...\n...\n...\n\n\n4027\n2000-08-27 21:30:00+00:00\n27946\n\n\n4028\n2000-08-27 22:00:00+00:00\n27133\n\n\n4029\n2000-08-27 22:30:00+00:00\n25996\n\n\n4030\n2000-08-27 23:00:00+00:00\n24610\n\n\n4031\n2000-08-27 23:30:00+00:00\n23132\n\n\n\n\n4032 rows Ɨ 2 columns\n\n\n\nThe plot_timeseries() function generates an interactive plotly chart by default.\n\nSimply provide the date variable (time-based column, date_column) and the numeric variable (value_column) that changes over time as the first 2 arguments.\nBy default, the plotting engine is plotly, which is interactive and excellent for data exploration and apps. 
However, if you require static plots for reports, you can set the engine to engine = ā€˜plotnineā€™ or engine = ā€˜matplotlibā€™.\n\nInteractive plot\n\n\nCode\ntaylor_30_min.plot_timeseries('date', 'value')\n\n\n\n \n\n\nStatic plot\n\n\nCode\ntaylor_30_min.plot_timeseries(\n 'date', 'value',\n engine = 'plotnine'\n)\n\n\n\n\n\n<Figure Size: (700 x 500)>" + "objectID": "guides/07_timeseries_crossvalidation.html#step-1-load-and-explore-the-data", + "href": "guides/07_timeseries_crossvalidation.html#step-1-load-and-explore-the-data", + "title": "Time Series Cross Validation", + "section": "2.1 Step 1: Load and Explore the Data", + "text": "2.1 Step 1: Load and Explore the Data\nFirst, letā€™s load the Walmart sales dataset and explore its structure:\n\n# libraries\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Import Data\nwalmart_sales_df = tk.load_dataset('walmart_sales_weekly')\n\nwalmart_sales_df['Date'] = pd.to_datetime(walmart_sales_df['Date'])\n\nwalmart_sales_df = walmart_sales_df[['id', 'Date', 'Weekly_Sales']]\n\nwalmart_sales_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 1001 rows of 3 columns\nid: object ['1_1', '1_1', '1_1', '1_1', '1_1', '1_ ...\nDate: datetime64[ns] [Timestamp('2010-02-05 00:00:00'), Time ...\nWeekly_Sales: float64 [24924.5, 46039.49, 41595.55, 19403.54, ..." }, { - "objectID": "guides/01_visualization.html#plotting-groups", - "href": "guides/01_visualization.html#plotting-groups", - "title": "Data Visualization", - "section": "2.2 Plotting Groups", - "text": "2.2 Plotting Groups\nNext, letā€™s move on to a dataset with time series groups, m4_monthly, which is a sample of 4 time series from the M4 competition that are sampled at a monthly frequency.\n\n\nCode\n# Import a Time Series Data Set\nm4_monthly = tk.load_dataset(\"m4_monthly\", parse_dates = ['date'])\nm4_monthly\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nM1\n1976-06-01\n8000\n\n\n1\nM1\n1976-07-01\n8350\n\n\n2\nM1\n1976-08-01\n8570\n\n\n3\nM1\n1976-09-01\n7700\n\n\n4\nM1\n1976-10-01\n7080\n\n\n...\n...\n...\n...\n\n\n1569\nM1000\n2015-02-01\n880\n\n\n1570\nM1000\n2015-03-01\n800\n\n\n1571\nM1000\n2015-04-01\n1140\n\n\n1572\nM1000\n2015-05-01\n970\n\n\n1573\nM1000\n2015-06-01\n1430\n\n\n\n\n1574 rows Ɨ 3 columns\n\n\n\nVisualizing grouped data is as simple as grouping the data set with groupby() before run it into the plot_timeseries() function. There are 2 methods:\n\nFacets\nPlotly Dropdown\n\n\nFacets (Subgroups on one plot)\nThis is great to see all time series in one plot. Here are the key points:\n\nGroups can be added using the pandas groupby().\nThese groups are then converted into facets.\nUsing facet_ncol = 2 returns a 2-column faceted plot.\nSetting facet_scales = \"free\" allows the x and y-axes of each plot to scale independently of the other plots.\n\n\n\nCode\nm4_monthly.groupby('id').plot_timeseries(\n 'date', 'value', \n facet_ncol = 2, \n facet_scales = \"free\"\n)\n\n\n\n \n\n\n\n\nPlotly Dropdown\nSometimes you have many groups and would prefer to see one plot per group. This can be accomplished with plotly_dropdown. You can adjust the x and y position as follows:\n\n\nCode\nm4_monthly.groupby('id').plot_timeseries(\n 'date', 'value', \n plotly_dropdown=True,\n plotly_dropdown_x=0,\n plotly_dropdown_y=1\n)\n\n\n\n \n\n\nThe groups can also be vizualized in the same plot using color_column paramenter. 
Letā€™s come back to taylor_30_min dataframe.\n\n\nCode\n# load data\ntaylor_30_min = tk.load_dataset(\"taylor_30_min\", parse_dates = ['date'])\n\n# extract the month using pandas\ntaylor_30_min['month'] = pd.to_datetime(taylor_30_min['date']).dt.month\n\n# plot groups\ntaylor_30_min.plot_timeseries(\n 'date', 'value', \n color_column = 'month'\n)" + "objectID": "guides/07_timeseries_crossvalidation.html#step-2-visualize-the-time-series-data", + "href": "guides/07_timeseries_crossvalidation.html#step-2-visualize-the-time-series-data", + "title": "Time Series Cross Validation", + "section": "2.2 Step 2: Visualize the Time Series Data", + "text": "2.2 Step 2: Visualize the Time Series Data\nWe can visualize the weekly sales data for different store IDs using the plot_timeseries method from pytimetk:\n\nwalmart_sales_df \\\n .groupby('id') \\\n .plot_timeseries(\n \"Date\", \"Weekly_Sales\",\n plotly_dropdown = True,\n )\n\n\n \n\n\nThis will generate an interactive time series plot, allowing you to explore sales data for different stores using a dropdown." }, { - "objectID": "guides/02_timetk_concepts.html", - "href": "guides/02_timetk_concepts.html", - "title": "PyTimeTK Basics", - "section": "", - "text": "PyTimeTK has one mission: To make time series analysis simpler, easier, and faster in Python. This goal requires some opinionated ways of treating time series in Python. We will conceptually lay out how pytimetk can help.\nLetā€™s first start with how to think about time series data conceptually. Time series data has 3 core properties." + "objectID": "guides/07_timeseries_crossvalidation.html#step-3-set-up-timeseriescv-for-cross-validation", + "href": "guides/07_timeseries_crossvalidation.html#step-3-set-up-timeseriescv-for-cross-validation", + "title": "Time Series Cross Validation", + "section": "2.3 Step 3: Set Up TimeSeriesCV for Cross-Validation", + "text": "2.3 Step 3: Set Up TimeSeriesCV for Cross-Validation\nNow, letā€™s set up a time-based cross-validation scheme using TimeSeriesCV:\n\nfrom pytimetk.crossvalidation import TimeSeriesCV\n\n# Define parameters for TimeSeriesCV\ntscv = TimeSeriesCV(\n frequency=\"weeks\",\n train_size=52, # Use 52 weeks for training\n forecast_horizon=12, # Forecast 12 weeks ahead\n gap=0, # No gap between training and forecast sets\n stride=4, # Move forward by 4 weeks after each split\n window=\"rolling\", # Use a rolling window\n mode=\"backward\" # Generate splits from end to start\n)\n\n# Glimpse the cross-validation splits\ntscv.glimpse(\n walmart_sales_df['Weekly_Sales'], \n time_series=walmart_sales_df['Date']\n)\n\nSplit Number: 1\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2011-08-05 00:00:00 to 2012-07-27 00:00:00\nForecast Period: 2012-08-03 00:00:00 to 2012-10-19 00:00:00\n\nSplit Number: 2\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2011-07-08 00:00:00 to 2012-06-29 00:00:00\nForecast Period: 2012-07-06 00:00:00 to 2012-09-21 00:00:00\n\nSplit Number: 3\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2011-06-10 00:00:00 to 2012-06-01 00:00:00\nForecast Period: 2012-06-08 00:00:00 to 2012-08-24 00:00:00\n\nSplit Number: 4\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2011-05-13 00:00:00 to 2012-05-04 00:00:00\nForecast Period: 2012-05-11 00:00:00 to 2012-07-27 00:00:00\n\nSplit Number: 5\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2011-04-15 00:00:00 to 2012-04-06 00:00:00\nForecast Period: 2012-04-13 00:00:00 to 2012-06-29 00:00:00\n\nSplit Number: 6\nTrain Shape: 
(364,), Forecast Shape: (84,)\nTrain Period: 2011-03-18 00:00:00 to 2012-03-09 00:00:00\nForecast Period: 2012-03-16 00:00:00 to 2012-06-01 00:00:00\n\nSplit Number: 7\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2011-02-18 00:00:00 to 2012-02-10 00:00:00\nForecast Period: 2012-02-17 00:00:00 to 2012-05-04 00:00:00\n\nSplit Number: 8\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2011-01-21 00:00:00 to 2012-01-13 00:00:00\nForecast Period: 2012-01-20 00:00:00 to 2012-04-06 00:00:00\n\nSplit Number: 9\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-12-24 00:00:00 to 2011-12-16 00:00:00\nForecast Period: 2011-12-23 00:00:00 to 2012-03-09 00:00:00\n\nSplit Number: 10\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-11-26 00:00:00 to 2011-11-18 00:00:00\nForecast Period: 2011-11-25 00:00:00 to 2012-02-10 00:00:00\n\nSplit Number: 11\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-10-29 00:00:00 to 2011-10-21 00:00:00\nForecast Period: 2011-10-28 00:00:00 to 2012-01-13 00:00:00\n\nSplit Number: 12\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-10-01 00:00:00 to 2011-09-23 00:00:00\nForecast Period: 2011-09-30 00:00:00 to 2011-12-16 00:00:00\n\nSplit Number: 13\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-09-03 00:00:00 to 2011-08-26 00:00:00\nForecast Period: 2011-09-02 00:00:00 to 2011-11-18 00:00:00\n\nSplit Number: 14\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-08-06 00:00:00 to 2011-07-29 00:00:00\nForecast Period: 2011-08-05 00:00:00 to 2011-10-21 00:00:00\n\nSplit Number: 15\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-07-09 00:00:00 to 2011-07-01 00:00:00\nForecast Period: 2011-07-08 00:00:00 to 2011-09-23 00:00:00\n\nSplit Number: 16\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-06-11 00:00:00 to 2011-06-03 00:00:00\nForecast Period: 2011-06-10 00:00:00 to 2011-08-26 00:00:00\n\nSplit Number: 17\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-05-14 00:00:00 to 2011-05-06 00:00:00\nForecast Period: 2011-05-13 00:00:00 to 2011-07-29 00:00:00\n\nSplit Number: 18\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-04-16 00:00:00 to 2011-04-08 00:00:00\nForecast Period: 2011-04-15 00:00:00 to 2011-07-01 00:00:00\n\nSplit Number: 19\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-03-19 00:00:00 to 2011-03-11 00:00:00\nForecast Period: 2011-03-18 00:00:00 to 2011-06-03 00:00:00\n\nSplit Number: 20\nTrain Shape: (364,), Forecast Shape: (84,)\nTrain Period: 2010-02-19 00:00:00 to 2011-02-11 00:00:00\nForecast Period: 2011-02-18 00:00:00 to 2011-05-06 00:00:00\n\n\n\nThe glimpse method provides a summary of each cross-validation fold, including the start and end dates of the training and forecast periods." 
}, { - "objectID": "guides/02_timetk_concepts.html#type-1-pandas-dataframe-operations", - "href": "guides/02_timetk_concepts.html#type-1-pandas-dataframe-operations", - "title": "PyTimeTK Basics", - "section": "2.1 Type 1: Pandas DataFrame Operations", - "text": "2.1 Type 1: Pandas DataFrame Operations\nBefore we start using pytimetk, let’s make sure our data is set up properly.\n\nTimetk Data Format Compliance\n\n\n\n\n\n\n3 Core Properties Must Be Upheld\n\n\n\n\n\nA pytimetk-Compliant Pandas DataFrame must have:\n\nTime Series Index: A Time Stamp column containing datetime64 values\nValue Column(s): The value column(s) containing float or int values\nGroup Column(s): Optionally for grouped time series analysis, one or more columns containing str or categorical values (shown as an object)\n\nIf these are NOT upheld, this will impact your ability to use pytimetk DataFrame operations.\n\n\n\n\n\n\n\n\n\nInspect the DataFrame\n\n\n\n\n\nUse the tk.glimpse() method to check compliance.\n\n\n\nUsing the pytimetk glimpse() method, we can see that we have a compliant data frame with a date column containing datetime64 and a value column containing float64. For grouped analysis we have the id column containing object dtype.\n\n\nCode\n# Tip: Inspect for compliance with glimpse()\nm4_daily_df.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 9743 rows of 3 columns\nid: object ['D10', 'D10', 'D10', 'D10', 'D10', 'D10', 'D1 ...\ndate: datetime64[ns] [Timestamp('2014-07-03 00:00:00'), Timestamp(' ...\nvalue: float64 [2076.2, 2073.4, 2048.7, 2048.9, 2006.4, 2017. ...\n\n\n\n\nGrouped Time Series Analysis with Summarize By Time\nFirst, inspect how the summarize_by_time function works by calling help().\n\n\nCode\n# Review the summarize_by_time documentation (output not shown)\nhelp(tk.summarize_by_time)\n\n\n\n\n\n\n\n\nHelp Doc Info: summarize_by_time()\n\n\n\n\n\n\nThe first parameter is data, indicating this is a DataFrame operation.\nThe Examples show different use cases for how to apply the function on a DataFrame.\n\n\n\n\nLet’s test the summarize_by_time() DataFrame operation out using the grouped approach with method chaining.
DataFrame operations can be used as Pandas methods with method-chaining, which allows us to more succinctly apply time series operations.\n\n\nCode\n# Grouped Summarize By Time with Method Chaining\ndf_summarized = (\n m4_daily_df\n .groupby('id')\n .summarize_by_time(\n date_column = 'date',\n value_column = 'value',\n freq = 'QS', # QS = Quarter Start\n agg_func = [\n 'mean', \n 'median', \n 'min',\n ('q25', lambda x: np.quantile(x, 0.25)),\n ('q75', lambda x: np.quantile(x, 0.75)),\n 'max',\n ('range',lambda x: x.max() - x.min()),\n ],\n )\n)\n\ndf_summarized\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue_mean\nvalue_median\nvalue_min\nvalue_q25\nvalue_q75\nvalue_max\nvalue_range\n\n\n\n\n0\nD10\n2014-07-01\n1960.078889\n1979.90\n1781.6\n1915.225\n2002.575\n2076.2\n294.6\n\n\n1\nD10\n2014-10-01\n2184.586957\n2154.05\n2022.8\n2125.075\n2274.150\n2344.9\n322.1\n\n\n2\nD10\n2015-01-01\n2309.830000\n2312.30\n2209.6\n2284.575\n2342.150\n2392.4\n182.8\n\n\n3\nD10\n2015-04-01\n2344.481319\n2333.00\n2185.1\n2301.750\n2391.000\n2499.8\n314.7\n\n\n4\nD10\n2015-07-01\n2156.754348\n2186.70\n1856.6\n1997.250\n2289.425\n2368.1\n511.5\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n105\nD500\n2011-07-01\n9727.321739\n9745.55\n8964.5\n9534.125\n10003.900\n10463.9\n1499.4\n\n\n106\nD500\n2011-10-01\n8175.565217\n7897.00\n6755.0\n7669.875\n8592.575\n9860.0\n3105.0\n\n\n107\nD500\n2012-01-01\n8291.317582\n8412.60\n7471.5\n7814.800\n8677.850\n8980.7\n1509.2\n\n\n108\nD500\n2012-04-01\n8654.020879\n8471.10\n8245.6\n8389.850\n9017.250\n9349.2\n1103.6\n\n\n109\nD500\n2012-07-01\n8770.502353\n8690.50\n8348.1\n8604.400\n8846.000\n9545.3\n1197.2\n\n\n\n\n110 rows Ɨ 9 columns\n\n\n\n\n\n\n\n\n\nKey Takeaways: summarize_by_time()\n\n\n\n\n\n\nThe data must comply with the 3 core properties (date column, value column(s), and group column(s))\nThe aggregation functions were applied by combination of group (id) and resample (Quarter Start)\nThe result was a pandas DataFrame with group column, resampled date column, and summary values (mean, median, min, 25th-quantile, etc)\n\n\n\n\n\n\nAnother DataFrame Example: Creating 29 Engineered Features\nLetā€™s examine another DataFrame function, tk.augment_timeseries_signature(). 
Feel free to inspect the documentation with help(tk.augment_timeseries_signature).\n\n\nCode\n# Creating 29 engineered features from the date column\n# Not run: help(tk.augment_timeseries_signature)\ndf_augmented = (\n m4_daily_df\n .augment_timeseries_signature(date_column = 'date')\n)\n\ndf_augmented.head()\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n5 rows Ɨ 32 columns\n\n\n\n\n\n\n\n\n\nKey Takeaways: augment_timeseries_signature()\n\n\n\n\n\n\nThe data must comply with the 1 of the 3 core properties (date column)\nThe result was a pandas DataFrame with 29 time series features that can be used for Machine Learning and Forecasting\n\n\n\n\n\n\nMaking Future Dates with Future Frame\nA common time series task before forecasting with machine learning models is to make a future DataFrame some length_out into the future. You can do this with tk.future_frame(). Hereā€™s how.\n\n\nCode\n# Preparing a time series data set for Machine Learning Forecasting\nfull_augmented_df = (\n m4_daily_df \n .groupby('id')\n .future_frame('date', length_out = 365)\n .augment_timeseries_signature('date')\n)\nfull_augmented_df\n\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n11198\nD500\n2013-09-19\nNaN\n1379548800\n2013\n2013\n0\n0\n0\n2\n...\n19\n81\n262\n0\n0\n0\n0\n0\n0\nam\n\n\n11199\nD500\n2013-09-20\nNaN\n1379635200\n2013\n2013\n0\n0\n0\n2\n...\n20\n82\n263\n0\n0\n0\n0\n0\n0\nam\n\n\n11200\nD500\n2013-09-21\nNaN\n1379721600\n2013\n2013\n0\n0\n0\n2\n...\n21\n83\n264\n0\n0\n0\n0\n0\n0\nam\n\n\n11201\nD500\n2013-09-22\nNaN\n1379808000\n2013\n2013\n0\n0\n0\n2\n...\n22\n84\n265\n1\n0\n0\n0\n0\n0\nam\n\n\n11202\nD500\n2013-09-23\nNaN\n1379894400\n2013\n2013\n0\n0\n0\n2\n...\n23\n85\n266\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n11203 rows Ɨ 32 columns\n\n\n\nWe can then get the future data by keying in on the data with value column that is missing (np.nan).\n\n\nCode\n# Get the future data (just the observations that 
haven't happened yet)\nfuture_df = (\n full_augmented_df\n .query('value.isna()')\n)\nfuture_df\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n9743\nD10\n2016-05-07\nNaN\n1462579200\n2016\n2016\n0\n0\n1\n1\n...\n7\n37\n128\n0\n0\n0\n0\n0\n0\nam\n\n\n9744\nD10\n2016-05-08\nNaN\n1462665600\n2016\n2016\n0\n0\n1\n1\n...\n8\n38\n129\n1\n0\n0\n0\n0\n0\nam\n\n\n9745\nD10\n2016-05-09\nNaN\n1462752000\n2016\n2016\n0\n0\n1\n1\n...\n9\n39\n130\n0\n0\n0\n0\n0\n0\nam\n\n\n9746\nD10\n2016-05-10\nNaN\n1462838400\n2016\n2016\n0\n0\n1\n1\n...\n10\n40\n131\n0\n0\n0\n0\n0\n0\nam\n\n\n9747\nD10\n2016-05-11\nNaN\n1462924800\n2016\n2016\n0\n0\n1\n1\n...\n11\n41\n132\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n11198\nD500\n2013-09-19\nNaN\n1379548800\n2013\n2013\n0\n0\n0\n2\n...\n19\n81\n262\n0\n0\n0\n0\n0\n0\nam\n\n\n11199\nD500\n2013-09-20\nNaN\n1379635200\n2013\n2013\n0\n0\n0\n2\n...\n20\n82\n263\n0\n0\n0\n0\n0\n0\nam\n\n\n11200\nD500\n2013-09-21\nNaN\n1379721600\n2013\n2013\n0\n0\n0\n2\n...\n21\n83\n264\n0\n0\n0\n0\n0\n0\nam\n\n\n11201\nD500\n2013-09-22\nNaN\n1379808000\n2013\n2013\n0\n0\n0\n2\n...\n22\n84\n265\n1\n0\n0\n0\n0\n0\nam\n\n\n11202\nD500\n2013-09-23\nNaN\n1379894400\n2013\n2013\n0\n0\n0\n2\n...\n23\n85\n266\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n1460 rows Ɨ 32 columns" + "objectID": "guides/07_timeseries_crossvalidation.html#step-4-plot-the-cross-validation-splits", + "href": "guides/07_timeseries_crossvalidation.html#step-4-plot-the-cross-validation-splits", + "title": "Time Series Cross Validation", + "section": "2.4 Step 4: Plot the Cross-Validation Splits", + "text": "2.4 Step 4: Plot the Cross-Validation Splits\nYou can visualize how the data is split for training and testing:\n\n# Plot the cross-validation splits\ntscv.plot(\n walmart_sales_df['Weekly_Sales'], \n time_series=walmart_sales_df['Date']\n)\n\n\n \n\n\nThis plot will show each fold, illustrating which weeks are used for training and which weeks are used for forecasting." }, { - "objectID": "guides/02_timetk_concepts.html#type-2-pandas-series-operations", - "href": "guides/02_timetk_concepts.html#type-2-pandas-series-operations", - "title": "PyTimeTK Basics", - "section": "2.2 Type 2: Pandas Series Operations", - "text": "2.2 Type 2: Pandas Series Operations\nThe main difference between a DataFrame operation and a Series operation is that we are operating on an array of values from typically one of the following dtypes:\n\nTimestamps (datetime64)\nNumeric (float64 or int64)\n\nThe first argument of Series operations that operate on Timestamps will always be idx.\nLetā€™s take a look at one shall we? Weā€™ll start with a common action: Making future time series from an existing time series with a regular frequency.\n\nThe Make Future Time Series Function\nSay we have a monthly sequence of timestamps. What if we want to create a forecast where we predict 12 months into the future? Well, we will need to create 12 future timestamps. 
Hereā€™s how.\nFirst create a pd.date_range() with dates starting at the beginning of each month.\n\n\nCode\n# Make a monthly date range\ndates_dt = pd.date_range(\"2023-01\", \"2024-01\", freq=\"MS\")\ndates_dt\n\n\nDatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',\n '2023-05-01', '2023-06-01', '2023-07-01', '2023-08-01',\n '2023-09-01', '2023-10-01', '2023-11-01', '2023-12-01',\n '2024-01-01'],\n dtype='datetime64[ns]', freq='MS')\n\n\nNext, use tk.make_future_timeseries() to create the next 12 timestamps in the sequence.\n\nPandas SeriesDateTimeIndex\n\n\n\n\nCode\n# Pandas Series: Future Dates\nfuture_series = pd.Series(dates_dt).make_future_timeseries(12)\nfuture_series\n\n\n0 2024-02-01\n1 2024-03-01\n2 2024-04-01\n3 2024-05-01\n4 2024-06-01\n5 2024-07-01\n6 2024-08-01\n7 2024-09-01\n8 2024-10-01\n9 2024-11-01\n10 2024-12-01\n11 2025-01-01\ndtype: datetime64[ns]\n\n\n\n\n\n\nCode\n# DateTimeIndex: Future Dates\nfuture_dt = tk.make_future_timeseries(\n idx = dates_dt,\n length_out = 12\n)\nfuture_dt\n\n\n0 2024-02-01\n1 2024-03-01\n2 2024-04-01\n3 2024-05-01\n4 2024-06-01\n5 2024-07-01\n6 2024-08-01\n7 2024-09-01\n8 2024-10-01\n9 2024-11-01\n10 2024-12-01\n11 2025-01-01\ndtype: datetime64[ns]\n\n\n\n\n\nWe can combine the actual and future timestamps into one combined timeseries.\n\n\nCode\n# Combining the 2 series and resetting the index\ncombined_timeseries = (\n pd.concat(\n [pd.Series(dates_dt), pd.Series(future_dt)],\n axis=0\n )\n .reset_index(drop = True)\n)\n\ncombined_timeseries\n\n\n0 2023-01-01\n1 2023-02-01\n2 2023-03-01\n3 2023-04-01\n4 2023-05-01\n5 2023-06-01\n6 2023-07-01\n7 2023-08-01\n8 2023-09-01\n9 2023-10-01\n10 2023-11-01\n11 2023-12-01\n12 2024-01-01\n13 2024-02-01\n14 2024-03-01\n15 2024-04-01\n16 2024-05-01\n17 2024-06-01\n18 2024-07-01\n19 2024-08-01\n20 2024-09-01\n21 2024-10-01\n22 2024-11-01\n23 2024-12-01\n24 2025-01-01\ndtype: datetime64[ns]\n\n\nNext, weā€™ll take a look at how to go from an irregular time series to a regular time series.\n\n\nFlooring Dates\nAn example is tk.floor_date, which is used to round down dates. See help(tk.floor_date).\nFlooring dates is often used as part of a strategy to go from an irregular time series to regular by combining with an aggregation. Often summarize_by_time() is used (Iā€™ll share why shortly). But conceptually, date flooring is the secret.\n\nWith FlooringWithout Flooring\n\n\n\n\nCode\n# Monthly flooring rounds dates down to 1st of the month\nm4_daily_df['date'].floor_date(unit = \"M\")\n\n\n0 2014-07-01\n1 2014-07-01\n2 2014-07-01\n3 2014-07-01\n4 2014-07-01\n ... \n9738 2014-07-01\n9739 2014-07-01\n9740 2014-07-01\n9741 2014-07-01\n9742 2014-07-01\nName: date, Length: 9743, dtype: datetime64[ns]\n\n\n\n\n\n\nCode\n# Before Flooring\nm4_daily_df['date']\n\n\n0 2014-07-03\n1 2014-07-03\n2 2014-07-03\n3 2014-07-03\n4 2014-07-03\n ... 
\n9738 2014-07-03\n9739 2014-07-03\n9740 2014-07-03\n9741 2014-07-03\n9742 2014-07-03\nName: date, Length: 9743, dtype: datetime64[ns]\n\n\n\n\n\nThis ā€œdate flooringā€ operation can be useful for creating date groupings.\n\n\nCode\n# Adding a date group with floor_date()\ndates_grouped_by_month = (\n m4_daily_df\n .assign(date_group = lambda x: x['date'].floor_date(\"M\"))\n)\n\ndates_grouped_by_month\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_group\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2014-07-01\n\n\n1\nD10\n2014-07-03\n2073.4\n2014-07-01\n\n\n2\nD10\n2014-07-03\n2048.7\n2014-07-01\n\n\n3\nD10\n2014-07-03\n2048.9\n2014-07-01\n\n\n4\nD10\n2014-07-03\n2006.4\n2014-07-01\n\n\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2014-07-03\n9418.8\n2014-07-01\n\n\n9739\nD500\n2014-07-03\n9365.7\n2014-07-01\n\n\n9740\nD500\n2014-07-03\n9445.9\n2014-07-01\n\n\n9741\nD500\n2014-07-03\n9497.9\n2014-07-01\n\n\n9742\nD500\n2014-07-03\n9545.3\n2014-07-01\n\n\n\n\n9743 rows Ɨ 4 columns\n\n\n\nWe can then do grouped operations.\n\n\nCode\n# Example of a grouped operation with floored dates\nsummary_df = (\n dates_grouped_by_month\n .drop('date', axis=1) \\\n .groupby(['id', 'date_group'])\n .mean() \\\n .reset_index()\n)\n\nsummary_df\n\n\n\n\n\n\n\n\n\nid\ndate_group\nvalue\n\n\n\n\n0\nD10\n2014-07-01\n2261.606825\n\n\n1\nD160\n2014-07-01\n9243.155254\n\n\n2\nD410\n2014-07-01\n8259.786346\n\n\n3\nD500\n2014-07-01\n8287.728789\n\n\n\n\n\n\n\nOf course for this operation, we can do it faster with summarize_by_time() (and itā€™s much more flexible).\n\n\nCode\n# Summarize by time is less code and more flexible\n(\n m4_daily_df \n .groupby('id')\n .summarize_by_time(\n 'date', 'value', \n freq = \"MS\",\n agg_func = ['mean', 'median', 'min', 'max']\n )\n)\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue_mean\nvalue_median\nvalue_min\nvalue_max\n\n\n\n\n0\nD10\n2014-07-01\n2261.606825\n2302.30\n1781.60\n2649.30\n\n\n1\nD160\n2014-07-01\n9243.155254\n10097.30\n1734.90\n19432.50\n\n\n2\nD410\n2014-07-01\n8259.786346\n8382.81\n6309.38\n9540.62\n\n\n3\nD500\n2014-07-01\n8287.728789\n7662.10\n4172.10\n14954.10\n\n\n\n\n\n\n\nAnd thatā€™s the core idea behind pytimetk, writing less code and getting more.\nNext, letā€™s do one more function. The brother of augment_timeseries_signature()ā€¦\n\n\nThe Get Time Series Signature Function\nThis function takes a pandas Series or DateTimeIndex and returns a DataFrame containing the 29 engineered features.\nStart with either a DateTimeIndexā€¦\n\n\nCode\ntimestamps_dt = pd.date_range(\"2023\", \"2024\", freq = \"D\")\ntimestamps_dt\n\n\nDatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',\n '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',\n '2023-01-09', '2023-01-10',\n ...\n '2023-12-23', '2023-12-24', '2023-12-25', '2023-12-26',\n '2023-12-27', '2023-12-28', '2023-12-29', '2023-12-30',\n '2023-12-31', '2024-01-01'],\n dtype='datetime64[ns]', length=366, freq='D')\n\n\nā€¦ Or a Pandas Series.\n\n\nCode\ntimestamps_series = pd.Series(timestamps_dt)\ntimestamps_series\n\n\n0 2023-01-01\n1 2023-01-02\n2 2023-01-03\n3 2023-01-04\n4 2023-01-05\n ... 
\n361 2023-12-28\n362 2023-12-29\n363 2023-12-30\n364 2023-12-31\n365 2024-01-01\nLength: 366, dtype: datetime64[ns]\n\n\nAnd you can use the pandas Series function, tk.get_timeseries_signature() to create 29 features from the date sequence.\n\nPandas SeriesDateTimeIndex\n\n\n\n\nCode\n# Pandas series: get_timeseries_signature\ntimestamps_series.get_timeseries_signature()\n\n\n\n\n\n\n\n\n\nidx\nidx_index_num\nidx_year\nidx_year_iso\nidx_yearstart\nidx_yearend\nidx_leapyear\nidx_half\nidx_quarter\nidx_quarteryear\n...\nidx_mday\nidx_qday\nidx_yday\nidx_weekend\nidx_hour\nidx_minute\nidx_second\nidx_msecond\nidx_nsecond\nidx_am_pm\n\n\n\n\n0\n2023-01-01\n1672531200\n2023\n2022\n1\n0\n0\n1\n1\n2023Q1\n...\n1\n1\n1\n1\n0\n0\n0\n0\n0\nam\n\n\n1\n2023-01-02\n1672617600\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n2\n2\n2\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n2023-01-03\n1672704000\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n3\n3\n3\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n2023-01-04\n1672790400\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n4\n4\n4\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n2023-01-05\n1672876800\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n5\n5\n5\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n361\n2023-12-28\n1703721600\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n28\n89\n362\n0\n0\n0\n0\n0\n0\nam\n\n\n362\n2023-12-29\n1703808000\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n29\n90\n363\n0\n0\n0\n0\n0\n0\nam\n\n\n363\n2023-12-30\n1703894400\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n30\n91\n364\n0\n0\n0\n0\n0\n0\nam\n\n\n364\n2023-12-31\n1703980800\n2023\n2023\n0\n1\n0\n2\n4\n2023Q4\n...\n31\n92\n365\n1\n0\n0\n0\n0\n0\nam\n\n\n365\n2024-01-01\n1704067200\n2024\n2024\n1\n0\n1\n1\n1\n2024Q1\n...\n1\n1\n1\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n366 rows Ɨ 30 columns\n\n\n\n\n\n\n\nCode\n# DateTimeIndex: get_timeseries_signature\ntk.get_timeseries_signature(timestamps_dt)\n\n\n\n\n\n\n\n\n\nidx\nidx_index_num\nidx_year\nidx_year_iso\nidx_yearstart\nidx_yearend\nidx_leapyear\nidx_half\nidx_quarter\nidx_quarteryear\n...\nidx_mday\nidx_qday\nidx_yday\nidx_weekend\nidx_hour\nidx_minute\nidx_second\nidx_msecond\nidx_nsecond\nidx_am_pm\n\n\n\n\n0\n2023-01-01\n1672531200\n2023\n2022\n1\n0\n0\n1\n1\n2023Q1\n...\n1\n1\n1\n1\n0\n0\n0\n0\n0\nam\n\n\n1\n2023-01-02\n1672617600\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n2\n2\n2\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n2023-01-03\n1672704000\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n3\n3\n3\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n2023-01-04\n1672790400\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n4\n4\n4\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n2023-01-05\n1672876800\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n5\n5\n5\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n361\n2023-12-28\n1703721600\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n28\n89\n362\n0\n0\n0\n0\n0\n0\nam\n\n\n362\n2023-12-29\n1703808000\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n29\n90\n363\n0\n0\n0\n0\n0\n0\nam\n\n\n363\n2023-12-30\n1703894400\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n30\n91\n364\n0\n0\n0\n0\n0\n0\nam\n\n\n364\n2023-12-31\n1703980800\n2023\n2023\n0\n1\n0\n2\n4\n2023Q4\n...\n31\n92\n365\n1\n0\n0\n0\n0\n0\nam\n\n\n365\n2024-01-01\n1704067200\n2024\n2024\n1\n0\n1\n1\n1\n2024Q1\n...\n1\n1\n1\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n366 rows Ɨ 30 columns" + "objectID": "guides/07_timeseries_crossvalidation.html#step-1-setting-up-the-timeseriescvsplitter", + "href": 
"guides/07_timeseries_crossvalidation.html#step-1-setting-up-the-timeseriescvsplitter", + "title": "Time Series Cross Validation", + "section": "3.1 Step 1: Setting Up the TimeSeriesCVSplitter", + "text": "3.1 Step 1: Setting Up the TimeSeriesCVSplitter\nThe TimeSeriesCVSplitter helps us divide our dataset into training and forecast sets in a rolling window fashion. Hereā€™s how we configure it:\n\nfrom pytimetk.crossvalidation import TimeSeriesCVSplitter\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import cross_val_score\n\n# Set up TimeSeriesCVSplitter\ncv_splitter = TimeSeriesCVSplitter(\n time_series=walmart_sales_df['Date'],\n frequency=\"weeks\",\n train_size=52*2,\n forecast_horizon=12,\n gap=0,\n stride=4,\n window=\"rolling\",\n mode=\"backward\",\n split_limit = 5\n)\n\n# Visualize the TSCV Strategy\ncv_splitter.splitter.plot(walmart_sales_df['Weekly_Sales'], walmart_sales_df['Date'])\n\n\n \n\n\nThe TimeSeriesCVSplitter creates multiple splits of the time series data, allowing us to validate the model across different periods. By visualizing the cross-validation strategy, we can see how the training and forecast sets are structured." }, { - "objectID": "getting-started/02_quick_start.html", - "href": "getting-started/02_quick_start.html", - "title": "Quick Start", - "section": "", - "text": "This is a simple exercise to showcase the power of our 2 most popular function:\n\nsummarize_by_time()\nplot_timeseries()\n\n\n\nFirst, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the ā€œbike_sales_sampleā€ dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains ā€œorderlinesā€ for orders recieved. The order_date column contains timestamps. 
We can use this column to peform sales aggregations (e.g.Ā total revenue).\n\n\n\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows Ɨ 13 columns\n\n\n\n\n\n\nYour company might be interested in sales patterns for various categories of bicycles. We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandasā€™s groupby() method to group the DataFrame on category_1\nNext, use timetkā€™s summarize_by_time() method to apply the sum function my month start (ā€œMSā€) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default).\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\n\nCode\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False\n )\n\n# First 5 rows shown\nsummary_category_1_df.head()\n\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n\n\n1\nMountain\n2011-02-01\n660555\n\n\n2\nMountain\n2011-03-01\n358855\n\n\n3\nMountain\n2011-04-01\n1075975\n\n\n4\nMountain\n2011-05-01\n450440\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNow available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. 
This is useful in time series where we often deal with 100s of time series groups.\n\n\nCode\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price',\n smooth_frac = 0.8\n )" }, { - "objectID": "getting-started/02_quick_start.html#import-libraries-data", - "href": "getting-started/02_quick_start.html#import-libraries-data", - "title": "Quick Start", - "section": "", - "text": "First, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the “bike_sales_sample” dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains “orderlines” for orders received. The order_date column contains timestamps.
We can use this column to peform sales aggregations (e.g.Ā total revenue).\n\n\n\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows Ɨ 13 columns" + "objectID": "guides/07_timeseries_crossvalidation.html#step-3-model-training-and-evaluation-with-random-forest", + "href": "guides/07_timeseries_crossvalidation.html#step-3-model-training-and-evaluation-with-random-forest", + "title": "Time Series Cross Validation", + "section": "3.3 Step 3: Model Training and Evaluation with Random Forest", + "text": "3.3 Step 3: Model Training and Evaluation with Random Forest\nFor this example, we use RandomForestRegressor from scikit-learn to model the time series data. A random forest is a robust, ensemble-based model that can handle a wide range of regression tasks.\n\n# Initialize the RandomForestRegressor model\nmodel = RandomForestRegressor(\n n_estimators=100, # Number of trees in the forest\n max_depth=None, # Maximum depth of the trees (None means nodes are expanded until all leaves are pure)\n random_state=42 # Set a random state for reproducibility\n)\n\n# Evaluate the model using cross-validation scores\nscores = cross_val_score(model, X, y, cv=cv_splitter, scoring='neg_mean_squared_error')\n\n# Print cross-validation scores\nprint(\"Cross-Validation Scores (Negative MSE):\", scores)\n\nCross-Validation Scores (Negative MSE): [-23761708.80112538 -23107644.58461143 -21728878.18790144\n -25113860.93913386 -86192034.48953015]" }, { - "objectID": "getting-started/02_quick_start.html#using-summarize_by_time-for-a-sales-analysis", - "href": "getting-started/02_quick_start.html#using-summarize_by_time-for-a-sales-analysis", - "title": "Quick Start", - "section": "", - "text": "Your company might be interested in sales patterns for various categories of bicycles. 
We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandasā€™s groupby() method to group the DataFrame on category_1\nNext, use timetkā€™s summarize_by_time() method to apply the sum function my month start (ā€œMSā€) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default).\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\n\nCode\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False\n )\n\n# First 5 rows shown\nsummary_category_1_df.head()\n\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n\n\n1\nMountain\n2011-02-01\n660555\n\n\n2\nMountain\n2011-03-01\n358855\n\n\n3\nMountain\n2011-04-01\n1075975\n\n\n4\nMountain\n2011-05-01\n450440" - }, - { - "objectID": "getting-started/02_quick_start.html#visualizing-sales-patterns", - "href": "getting-started/02_quick_start.html#visualizing-sales-patterns", - "title": "Quick Start", - "section": "", - "text": "Now available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. This is useful in time series where we often deal with 100s of time series groups.\n\n\nCode\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price',\n smooth_frac = 0.8\n )" - }, - { - "objectID": "index.html", - "href": "index.html", - "title": "PyTimeTK ", - "section": "", - "text": "Time series easier, faster, more fun. Pytimetk.\n\nPyTimetkā€™s Mission: To make time series analysis easier, faster, and more enjoyable in Python.\nPlease ā­ us on GitHub (it takes 2-seconds and means a lot).\n\n1 Introducing pytimetk: Simplifying Time Series Analysis for Everyone\nTime series analysis is fundamental in many fields, from business forecasting to scientific research. While the Python ecosystem offers tools like pandas, they sometimes can be verbose and not optimized for all operations, especially for complex time-based aggregations and visualizations.\nEnter pytimetk. Crafted with a blend of ease-of-use and computational efficiency, pytimetk significantly simplifies the process of time series manipulation and visualization. By leveraging the polars backend, you can experience speed improvements ranging from 3X to a whopping 3500X. 
Letā€™s dive into a comparative analysis.\n\n\n\n\n\n\n\n\nFeatures/Properties\npytimetk\npandas (+matplotlib)\n\n\n\n\nSpeed\nšŸš€ 3X to 500X Faster\nšŸ¢ Standard\n\n\nCode Simplicity\nšŸŽ‰ Concise, readable syntax\nšŸ“œ Often verbose\n\n\nplot_timeseries()\nšŸŽØ 2 lines, no customization\nšŸŽØ 16 lines, customization needed\n\n\nsummarize_by_time()\nšŸ• 2 lines, 13.4X faster\nšŸ• 6 lines, 2 for-loops\n\n\npad_by_time()\nā›³ 2 lines, fills gaps in timeseries\nāŒ No equivalent\n\n\nanomalize()\nšŸ“ˆ 2 lines, detects and corrects anomalies\nāŒ No equivalent\n\n\naugment_timeseries_signature()\nšŸ“… 1 line, all calendar features\nšŸ• 30 lines of dt extractors\n\n\naugment_rolling()\nšŸŽļø 10X to 3500X faster\nšŸ¢ Slow Rolling Operations\n\n\n\nAs evident from the table, pytimetk is not just about speed; it also simplifies your codebase. For example, summarize_by_time(), converts a 6-line, double for-loop routine in pandas into a concise 2-line operation. And with the polars engine, get results 13.4X faster than pandas!\nSimilarly, plot_timeseries() dramatically streamlines the plotting process, encapsulating what would typically require 16 lines of matplotlib code into a mere 2-line command in pytimetk, without sacrificing customization or quality. And with plotly and plotnine engines, you can create interactive plots and beautiful static visualizations with just a few lines of code.\nFor calendar features, pytimetk offers augment_timeseries_signature() which cuts down on over 30 lines of pandas dt extractions. For rolling features, pytimetk offers augment_rolling(), which is 10X to 3500X faster than pandas. It also offers pad_by_time() to fill gaps in your time series data, and anomalize() to detect and correct anomalies in your time series data.\nJoin the revolution in time series analysis. Reduce your code complexity, increase your productivity, and harness the speed that pytimetk brings to your workflows.\nExplore more at our pytimetk homepage.\n\n\n2 šŸš€ Installation\nInstall the Latest Stable Version:\npip install pytimetk\nAlternatively, install the Development GitHub Version:\npip install git+https://github.com/business-science/pytimetk.git\n\n\n3 šŸ Quick Start: A Monthly Sales Analysis\nThis is a simple exercise to showcase the power of summarize_by_time():\n\nImport Libraries & Data\nFirst, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the ā€œbike_sales_sampleā€ dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains ā€œorderlinesā€ for orders recieved. The order_date column contains timestamps. 
We can use this column to peform sales aggregations (e.g.Ā total revenue).\n\n\n\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows Ɨ 13 columns\n\n\n\n\n\nUsing summarize_by_time() for a Sales Analysis\nYour company might be interested in sales patterns for various categories of bicycles. We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandasā€™s groupby() method to group the DataFrame on category_1\nNext, use timetkā€™s summarize_by_time() method to apply the sum function my month start (ā€œMSā€) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default). The default engine is \"pandas\". Selecting engine = \"polars\" allows us to improve the speed of the function.\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False,\n engine = \"polars\"\n )\n\n# Quickly examine each column\nsummary_category_1_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 24 rows of 3 columns\ncategory_1: object ['Mountain', 'Mountain', 'Mountain', ...\norder_date: datetime64[ns] [Timestamp('2011-01-01 00:00:00'), T ...\ntotal_price_sum: int64 [221490, 660555, 358855, 1075975, 45 ...\n\n\n\n\nVisualizing Sales Patterns\n\n\n\n\n\n\nNow available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. This is useful in time series where we often deal with 100s of time series groups.\nThe default engine in ā€œplotnineā€ for static plotting. 
Setting the engine = \"plotly\" returns an interactive plot.\n\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n smooth_frac = 0.8,\n engine = \"plotly\"\n )\n\n\n \n\n\n\n\n\n4 šŸ“š Documentation\nNext step? Learn more with the pytimetk documentation\n\nšŸ“ˆ Overview\nšŸ Getting Started\nšŸ—ŗļø Beginner Guides\nšŸ“˜Applied Data Science Tutorials with PyTimeTK\nšŸŽļøSpeed Comparisons\nšŸ“„ API Reference\n\n\n\n5 šŸ» Contributing\nInterested in helping us make this the best Python package for time series analysis? Weā€™d love your help.\nFollow these instructions to Contribute.\n\n\n6 šŸ† More Coming Soonā€¦\nWe are in the early stages of development. But itā€™s obvious the potential for pytimetk now in Python. šŸ\n\nPlease ā­ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." - }, - { - "objectID": "contributing.html", - "href": "contributing.html", - "title": "Contributing (Developer Setup)", - "section": "", - "text": "Interested in contributing?\n\n\n\n\n\nMake sure to Fork the GitHub Repo. Clone your fork. Then use poetry to install the pytimetk package.\n\n\n\n\n1 GitHub\nTo contribute, youā€™ll need to have a GitHub account. Then:\n\n1. Fork our pytimetk repository\nHead to our GitHub Repo and select ā€œforkā€. This makes a copied version of pytimetk for your personal use.\n\n\n2. Clone your forked version\nCloning will put your own personal version of pytimetk on your local machine. Make sure to replace [your_user_name] with your user name.\ngit clone https://github.com/[your_user_name]/pytimetk\n\n\n\n2 Poetry Environment Setup\nTo install pytimetk using Poetry, follow these steps:\n\n1. Prerequisites\nMake sure you have Python 3.9 or later installed on your system.\n\n\n2. Install Poetry\nTo install Poetry, you can use the official installer provided by Poetry. Do not use pip.\n\n\n3. Install Dependencies\nUse Poetry to install the package and its dependencies:\npoetry install\nor you can create a virtualenv with poetry and install the dependencies\npoetry shell\npoetry install\n\n\n\n3 Submit a Pull Request\n\n1. Make changes on a Branch\nMake changes in your local version on a branch where my-feature-branch is a branch youā€™d like to create that contains modifications.\ngit checkout -b my-feature-branch\n\n\n2. Push to your forked version of pytimetk\ngit push origin my-feature-branch\n\n\n3. Create a Pull Request\n\nGo to your forked repository on GitHub and switch to your branch.\nClick on ā€œNew pull requestā€ and compare the changes you made with the original repository.\nFill out the pull request template with the necessary information, explaining your changes, the reason for them, and any other relevant information.\n\n\n\n4. Submit the Pull Request\n\nReview your changes and submit the pull request.\n\n\n\n\n4 Next Steps šŸ»\nWe will review your PR. If all goes well, weā€™ll merge! And then youā€™ve just helped the community. 
šŸ»" - }, - { - "objectID": "getting-started/01_installation.html", - "href": "getting-started/01_installation.html", - "title": "Install", - "section": "", - "text": "1 Quick Install\nLetā€™s get you up and running with pytimetk fast with the latest stable release.\npip install pytimetk\nYou can install from GitHub with this code.\npip install git+https://github.com/business-science/pytimetk.git\n\n\n2 Next steps\nCheck out the Quick Start Guide Next.\n\n\n3 More Coming Soonā€¦\nWe are in the early stages of development. But itā€™s obvious the potential for pytimetk now in Python. šŸ\n\nPlease ā­ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." - }, - { - "objectID": "guides/03_pandas_frequency.html", - "href": "guides/03_pandas_frequency.html", - "title": "Pandas Frequencies", - "section": "", - "text": "How this guide benefits you\n\n\n\n\n\nThis guide covers how to use the pandas frequency strings within pytimetk. Once you understand key frequencies, you can apply them to manipulate time series data like a pro.\n\n\n\n\n1 Pandas Frequencies\nPandas offers a variety of frequency strings, also known as offset aliases, to define the frequency of a time series. Here are some common frequency strings used in pandas:\n\nā€˜Bā€™: Business Day\nā€˜Dā€™: Calendar day\nā€˜Wā€™: Weekly\nā€˜Mā€™: Month end\nā€˜BMā€™: Business month end\nā€˜MSā€™: Month start\nā€˜BMSā€™: Business month start\nā€˜Qā€™: Quarter end\nā€˜BQā€™: Business quarter end\nā€˜QSā€™: Quarter start\nā€˜BQSā€™: Business quarter start\nā€˜Aā€™ or ā€˜Yā€™: Year end\nā€˜BAā€™ or ā€˜BYā€™: Business year end\nā€˜ASā€™ or ā€˜YSā€™: Year start\nā€˜BASā€™ or ā€˜BYSā€™: Business year start\nā€˜Hā€™: Hourly\nā€˜Tā€™ or ā€˜minā€™: Minutely\nā€˜Sā€™: Secondly\nā€˜Lā€™ or ā€˜msā€™: Milliseconds\nā€˜Uā€™: Microseconds\nā€˜Nā€™: Nanoseconds\n\n\nCustom Frequencies:\n\nYou can also create custom frequencies by combining base frequencies, like:\n\nā€˜2Dā€™: Every 2 days\nā€˜3Wā€™: Every 3 weeks\nā€˜4Hā€™: Every 4 hours\nā€˜1H30Tā€™: Every 1 hour and 30 minutes\n\n\n\n\nCompound Frequencies:\n\nYou can combine multiple frequencies by adding them together.\n\nā€˜1D1Hā€™: 1 day and 1 hour\nā€˜1H30Tā€™: 1 hour and 30 minutes\n\n\n\n\nExample:\n\n\nCode\nimport pandas as pd\n\n# Creating a date range with daily frequency\ndate_range_daily = pd.date_range(start='2023-01-01', end='2023-01-10', freq='D')\n\ndate_range_daily\n\n\nDatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',\n '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',\n '2023-01-09', '2023-01-10'],\n dtype='datetime64[ns]', freq='D')\n\n\n\n\nCode\n# Creating a date range with 2 days frequency\ndate_range_two_days = pd.date_range(start='2023-01-01', end='2023-01-10', freq='2D')\n\ndate_range_two_days\n\n\nDatetimeIndex(['2023-01-01', '2023-01-03', '2023-01-05', '2023-01-07',\n '2023-01-09'],\n dtype='datetime64[ns]', freq='2D')\n\n\nThese frequency strings help in resampling, creating date ranges, and handling time-series data efficiently in pandas.\n\n\n\n2 Timetk Incorporates Pandas Frequencies\nNow that youā€™ve seen pandas frequencies, youā€™ll see them pop up in many of the pytimetk functions.\n\nExample: Padding Dates\nThis example shows how to use Pandas frequencies inside of pytimetk functions.\nWeā€™ll use pad_by_time to show how to use freq to fill in missing dates.\n\n\nCode\n# DataFrame with missing 
dates\nimport pandas as pd\n\ndata = {\n # '2023-09-05' is missing\n 'datetime': ['2023-09-01', '2023-09-02', '2023-09-03', '2023-09-04', '2023-09-06'], \n 'value': [10, 30, 40, 50, 60]\n}\n\ndf = pd.DataFrame(data)\ndf['datetime'] = pd.to_datetime(df['datetime'])\ndf\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01\n10\n\n\n1\n2023-09-02\n30\n\n\n2\n2023-09-03\n40\n\n\n3\n2023-09-04\n50\n\n\n4\n2023-09-06\n60\n\n\n\n\n\n\n\nWe can resample to fill in the missing day using pad_by_time with freq = 'D'.\n\n\nCode\nimport pytimetk as tk\n\ndf.pad_by_time('datetime', freq = 'D')\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01\n10.0\n\n\n1\n2023-09-02\n30.0\n\n\n2\n2023-09-03\n40.0\n\n\n3\n2023-09-04\n50.0\n\n\n4\n2023-09-05\nNaN\n\n\n5\n2023-09-06\n60.0\n\n\n\n\n\n\n\nWhat about resampling every 12 hours? Just set `freq = ā€˜12Hā€™.\n\n\nCode\nimport pytimetk as tk\n\ndf.pad_by_time('datetime', freq = '12H')\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01 00:00:00\n10.0\n\n\n1\n2023-09-01 12:00:00\nNaN\n\n\n2\n2023-09-02 00:00:00\n30.0\n\n\n3\n2023-09-02 12:00:00\nNaN\n\n\n4\n2023-09-03 00:00:00\n40.0\n\n\n5\n2023-09-03 12:00:00\nNaN\n\n\n6\n2023-09-04 00:00:00\n50.0\n\n\n7\n2023-09-04 12:00:00\nNaN\n\n\n8\n2023-09-05 00:00:00\nNaN\n\n\n9\n2023-09-05 12:00:00\nNaN\n\n\n10\n2023-09-06 00:00:00\n60.0\n\n\n\n\n\n\n\nYouā€™ll see these pandas frequencies come up as the parameter freq in many pytimetk functions.\n\n\n\n3 Next Steps\nCheck out the Data Wrangling Guide next.\n\n\n4 More Coming Soonā€¦\nWe are in the early stages of development. But itā€™s obvious the potential for pytimetk now in Python. šŸ\n\nPlease ā­ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." + "objectID": "guides/07_timeseries_crossvalidation.html#step-4-visualizing-the-forecast", + "href": "guides/07_timeseries_crossvalidation.html#step-4-visualizing-the-forecast", + "title": "Time Series Cross Validation", + "section": "3.4 Step 4: Visualizing the Forecast", + "text": "3.4 Step 4: Visualizing the Forecast\nVisualization is crucial to understand how well the model predicts future values. 
We collect the actual and predicted values for each fold and combine them for easy plotting.\n\n# Lists to store the combined data\ncombined_data = []\n\n# Iterate through each fold and collect the data\nfor i, (train_index, test_index) in enumerate(cv_splitter.split(X, y), start=1):\n # Get the training and forecast data from the original DataFrame\n train_df = walmart_sales_df.iloc[train_index].copy()\n test_df = walmart_sales_df.iloc[test_index].copy()\n \n # Fit the model on the training data\n model.fit(X.iloc[train_index], y[train_index])\n \n # Predict on the test set\n y_pred = model.predict(X.iloc[test_index])\n \n # Add the actual and predicted values\n train_df['Actual'] = y[train_index]\n train_df['Predicted'] = None # No predictions for training data\n train_df['Fold'] = i # Indicate the current fold\n \n test_df['Actual'] = y[test_index]\n test_df['Predicted'] = y_pred # Predictions for the test data\n test_df['Fold'] = i # Indicate the current fold\n \n # Append both the training and forecast DataFrames to the combined data list\n combined_data.extend([train_df, test_df])\n\n# Combine all the data into a single DataFrame\nfull_forecast_df = pd.concat(combined_data, ignore_index=True)\n\nfull_forecast_df = full_forecast_df[['id', 'Date', 'Actual', 'Predicted', 'Fold']]\n\nfull_forecast_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 4060 rows of 5 columns\nid: object ['1_1', '1_1', '1_1', '1_1', '1_1', '1_1', ...\nDate: datetime64[ns] [Timestamp('2010-08-06 00:00:00'), Timesta ...\nActual: float64 [17508.41, 15536.4, 15740.13, 15793.87, 16 ...\nPredicted: float64 [nan, nan, nan, nan, nan, nan, nan, nan, n ...\nFold: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n\n\n\nPreparing Data for Visualization\nTo make the data easier to plot, we use pd.melt() to transform the Actual and Predicted columns into a long format.\n\n# Melt the Actual and Predicted columns\nmelted_df = pd.melt(\n full_forecast_df,\n id_vars=['id', 'Date', 'Fold'], # Columns to keep\n value_vars=['Actual', 'Predicted'], # Columns to melt\n var_name='Type', # Name for the new column indicating 'Actual' or 'Predicted'\n value_name='Value' # Name for the new column with the values\n)\n\nmelted_df[\"unique_id\"] = \"ID_\" + melted_df['id'] + \"-Fold_\" + melted_df[\"Fold\"].astype(str)\n\nmelted_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 8120 rows of 6 columns\nid: object ['1_1', '1_1', '1_1', '1_1', '1_1', '1_1', ...\nDate: datetime64[ns] [Timestamp('2010-08-06 00:00:00'), Timesta ...\nFold: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nType: object ['Actual', 'Actual', 'Actual', 'Actual', ' ...\nValue: float64 [17508.41, 15536.4, 15740.13, 15793.87, 16 ...\nunique_id: object ['ID_1_1-Fold_1', 'ID_1_1-Fold_1', 'ID_1_1 ...\n\n\n\n\nPlotting the Forecasts\nFinally, we use plot_timeseries() to visualize the forecasts, comparing the actual and predicted values for each fold.\n\nmelted_df \\\n .groupby('unique_id') \\\n .plot_timeseries(\n \"Date\", \"Value\",\n color_column = \"Type\",\n smooth=False, \n plotly_dropdown=True\n )" }, { "objectID": "guides/06_anomalize.html", @@ -1546,6 +1511,104 @@ "section": "1.6 Changing Parameters", "text": "1.6 Changing Parameters\nSome important parameters to hightlight in the anomalize() function include iqr_alpha.\n\n\n\n\n\n\nImportant\n\n\n\n\n\niqr_alpha controls the threshold for detecting outliers. It is the significance level used in the interquartile range (IQR) method for outlier detection. 
The default value is 0.05, which corresponds to a 5% significance level. A lower significance level will result in a higher threshold, which means fewer outliers will be detected. A higher significance level will result in a lower threshold, which means more outliers will be detected.\n\n\n\nLets visualize the effect of changing the iqr_alpha parameter;\n\nChanging iqr_alpha\nFirst, lets get a dataframe with multiple values for iqr_alpha;\n\n# Anomalized data with multiple iqr_alpha values\n\n# - Alpha values\niqr_alpha_values = [0.05, 0.10, 0.15, 0.20]\n\n# - Empty dataframes list\ndfs = []\n\nfor alpha in iqr_alpha_values:\n\n # - Run anomalize function\n anomalize_df = tk.anomalize(\n data = df,\n date_column = 'date',\n value_column = 'value',\n period = 7,\n iqr_alpha = alpha\n )\n\n # - Add the iqr_alpha column\n anomalize_df['iqr_alpha'] = f'iqr_alpha value of {alpha}'\n\n # - Append to the list\n dfs.append(anomalize_df)\n\n# - Concatenate all dataframes\nfinal_df = pd.concat(dfs)\n\nNow we can visualize the anomalies:\n\n\nVisualizing Grouped Anomalies (Facets)\n\n# Visualize\n(\n final_df\n .groupby('iqr_alpha')\n .plot_anomalies(\n date_column = 'date',\n engine = 'plotly',\n facet_ncol = 2\n )\n)\n\n\n \n\n\n\n\nVisualizing Grouped Anomalies (Plotly Dropdown)\n\n# Visualize\n(\n final_df\n .groupby('iqr_alpha')\n .plot_anomalies(\n date_column = 'date',\n engine = 'plotly',\n plotly_dropdown = True,\n plotly_dropdown_x = 1,\n plotly_dropdown_y = 0.60\n )\n)" }, + { + "objectID": "guides/03_pandas_frequency.html", + "href": "guides/03_pandas_frequency.html", + "title": "Pandas Frequencies", + "section": "", + "text": "How this guide benefits you\n\n\n\n\n\nThis guide covers how to use the pandas frequency strings within pytimetk. Once you understand key frequencies, you can apply them to manipulate time series data like a pro.\n\n\n\n\n1 Pandas Frequencies\nPandas offers a variety of frequency strings, also known as offset aliases, to define the frequency of a time series. 
Here are some common frequency strings used in pandas:\n\nā€˜Bā€™: Business Day\nā€˜Dā€™: Calendar day\nā€˜Wā€™: Weekly\nā€˜Mā€™: Month end\nā€˜BMā€™: Business month end\nā€˜MSā€™: Month start\nā€˜BMSā€™: Business month start\nā€˜Qā€™: Quarter end\nā€˜BQā€™: Business quarter end\nā€˜QSā€™: Quarter start\nā€˜BQSā€™: Business quarter start\nā€˜Aā€™ or ā€˜Yā€™: Year end\nā€˜BAā€™ or ā€˜BYā€™: Business year end\nā€˜ASā€™ or ā€˜YSā€™: Year start\nā€˜BASā€™ or ā€˜BYSā€™: Business year start\nā€˜Hā€™: Hourly\nā€˜Tā€™ or ā€˜minā€™: Minutely\nā€˜Sā€™: Secondly\nā€˜Lā€™ or ā€˜msā€™: Milliseconds\nā€˜Uā€™: Microseconds\nā€˜Nā€™: Nanoseconds\n\n\nCustom Frequencies:\n\nYou can also create custom frequencies by combining base frequencies, like:\n\nā€˜2Dā€™: Every 2 days\nā€˜3Wā€™: Every 3 weeks\nā€˜4Hā€™: Every 4 hours\nā€˜1H30Tā€™: Every 1 hour and 30 minutes\n\n\n\n\nCompound Frequencies:\n\nYou can combine multiple frequencies by adding them together.\n\nā€˜1D1Hā€™: 1 day and 1 hour\nā€˜1H30Tā€™: 1 hour and 30 minutes\n\n\n\n\nExample:\n\n\nCode\nimport pandas as pd\n\n# Creating a date range with daily frequency\ndate_range_daily = pd.date_range(start='2023-01-01', end='2023-01-10', freq='D')\n\ndate_range_daily\n\n\nDatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',\n '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',\n '2023-01-09', '2023-01-10'],\n dtype='datetime64[ns]', freq='D')\n\n\n\n\nCode\n# Creating a date range with 2 days frequency\ndate_range_two_days = pd.date_range(start='2023-01-01', end='2023-01-10', freq='2D')\n\ndate_range_two_days\n\n\nDatetimeIndex(['2023-01-01', '2023-01-03', '2023-01-05', '2023-01-07',\n '2023-01-09'],\n dtype='datetime64[ns]', freq='2D')\n\n\nThese frequency strings help in resampling, creating date ranges, and handling time-series data efficiently in pandas.\n\n\n\n2 Timetk Incorporates Pandas Frequencies\nNow that youā€™ve seen pandas frequencies, youā€™ll see them pop up in many of the pytimetk functions.\n\nExample: Padding Dates\nThis example shows how to use Pandas frequencies inside of pytimetk functions.\nWeā€™ll use pad_by_time to show how to use freq to fill in missing dates.\n\n\nCode\n# DataFrame with missing dates\nimport pandas as pd\n\ndata = {\n # '2023-09-05' is missing\n 'datetime': ['2023-09-01', '2023-09-02', '2023-09-03', '2023-09-04', '2023-09-06'], \n 'value': [10, 30, 40, 50, 60]\n}\n\ndf = pd.DataFrame(data)\ndf['datetime'] = pd.to_datetime(df['datetime'])\ndf\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01\n10\n\n\n1\n2023-09-02\n30\n\n\n2\n2023-09-03\n40\n\n\n3\n2023-09-04\n50\n\n\n4\n2023-09-06\n60\n\n\n\n\n\n\n\nWe can resample to fill in the missing day using pad_by_time with freq = 'D'.\n\n\nCode\nimport pytimetk as tk\n\ndf.pad_by_time('datetime', freq = 'D')\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01\n10.0\n\n\n1\n2023-09-02\n30.0\n\n\n2\n2023-09-03\n40.0\n\n\n3\n2023-09-04\n50.0\n\n\n4\n2023-09-05\nNaN\n\n\n5\n2023-09-06\n60.0\n\n\n\n\n\n\n\nWhat about resampling every 12 hours? 
Just set `freq = ā€˜12Hā€™.\n\n\nCode\nimport pytimetk as tk\n\ndf.pad_by_time('datetime', freq = '12H')\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01 00:00:00\n10.0\n\n\n1\n2023-09-01 12:00:00\nNaN\n\n\n2\n2023-09-02 00:00:00\n30.0\n\n\n3\n2023-09-02 12:00:00\nNaN\n\n\n4\n2023-09-03 00:00:00\n40.0\n\n\n5\n2023-09-03 12:00:00\nNaN\n\n\n6\n2023-09-04 00:00:00\n50.0\n\n\n7\n2023-09-04 12:00:00\nNaN\n\n\n8\n2023-09-05 00:00:00\nNaN\n\n\n9\n2023-09-05 12:00:00\nNaN\n\n\n10\n2023-09-06 00:00:00\n60.0\n\n\n\n\n\n\n\nYouā€™ll see these pandas frequencies come up as the parameter freq in many pytimetk functions.\n\n\n\n3 Next Steps\nCheck out the Data Wrangling Guide next.\n\n\n4 More Coming Soonā€¦\nWe are in the early stages of development. But itā€™s obvious the potential for pytimetk now in Python. šŸ\n\nPlease ā­ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." + }, + { + "objectID": "getting-started/01_installation.html", + "href": "getting-started/01_installation.html", + "title": "Install", + "section": "", + "text": "1 Quick Install\nLetā€™s get you up and running with pytimetk fast with the latest stable release.\npip install pytimetk\nYou can install from GitHub with this code.\npip install git+https://github.com/business-science/pytimetk.git\n\n\n2 Next steps\nCheck out the Quick Start Guide Next.\n\n\n3 More Coming Soonā€¦\nWe are in the early stages of development. But itā€™s obvious the potential for pytimetk now in Python. šŸ\n\nPlease ā­ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." + }, + { + "objectID": "contributing.html", + "href": "contributing.html", + "title": "Contributing (Developer Setup)", + "section": "", + "text": "Interested in contributing?\n\n\n\n\n\nMake sure to Fork the GitHub Repo. Clone your fork. Then use poetry to install the pytimetk package.\n\n\n\n\n1 GitHub\nTo contribute, youā€™ll need to have a GitHub account. Then:\n\n1. Fork our pytimetk repository\nHead to our GitHub Repo and select ā€œforkā€. This makes a copied version of pytimetk for your personal use.\n\n\n2. Clone your forked version\nCloning will put your own personal version of pytimetk on your local machine. Make sure to replace [your_user_name] with your user name.\ngit clone https://github.com/[your_user_name]/pytimetk\n\n\n\n2 Poetry Environment Setup\nTo install pytimetk using Poetry, follow these steps:\n\n1. Prerequisites\nMake sure you have Python 3.9 or later installed on your system.\n\n\n2. Install Poetry\nTo install Poetry, you can use the official installer provided by Poetry. Do not use pip.\n\n\n3. Install Dependencies\nUse Poetry to install the package and its dependencies:\npoetry install\nor you can create a virtualenv with poetry and install the dependencies\npoetry shell\npoetry install\n\n\n\n3 Submit a Pull Request\n\n1. Make changes on a Branch\nMake changes in your local version on a branch where my-feature-branch is a branch youā€™d like to create that contains modifications.\ngit checkout -b my-feature-branch\n\n\n2. Push to your forked version of pytimetk\ngit push origin my-feature-branch\n\n\n3. 
Create a Pull Request\n\nGo to your forked repository on GitHub and switch to your branch.\nClick on ā€œNew pull requestā€ and compare the changes you made with the original repository.\nFill out the pull request template with the necessary information, explaining your changes, the reason for them, and any other relevant information.\n\n\n\n4. Submit the Pull Request\n\nReview your changes and submit the pull request.\n\n\n\n\n4 Next Steps šŸ»\nWe will review your PR. If all goes well, weā€™ll merge! And then youā€™ve just helped the community. šŸ»" + }, + { + "objectID": "index.html", + "href": "index.html", + "title": "PyTimeTK ", + "section": "", + "text": "Time series easier, faster, more fun. Pytimetk.\n\nPyTimetkā€™s Mission: To make time series analysis easier, faster, and more enjoyable in Python.\nPlease ā­ us on GitHub (it takes 2-seconds and means a lot).\n\n1 Introducing pytimetk: Simplifying Time Series Analysis for Everyone\nTime series analysis is fundamental in many fields, from business forecasting to scientific research. While the Python ecosystem offers tools like pandas, they sometimes can be verbose and not optimized for all operations, especially for complex time-based aggregations and visualizations.\nEnter pytimetk. Crafted with a blend of ease-of-use and computational efficiency, pytimetk significantly simplifies the process of time series manipulation and visualization. By leveraging the polars backend, you can experience speed improvements ranging from 3X to a whopping 3500X. Letā€™s dive into a comparative analysis.\n\n\n\n\n\n\n\n\nFeatures/Properties\npytimetk\npandas (+matplotlib)\n\n\n\n\nSpeed\nšŸš€ 3X to 500X Faster\nšŸ¢ Standard\n\n\nCode Simplicity\nšŸŽ‰ Concise, readable syntax\nšŸ“œ Often verbose\n\n\nplot_timeseries()\nšŸŽØ 2 lines, no customization\nšŸŽØ 16 lines, customization needed\n\n\nsummarize_by_time()\nšŸ• 2 lines, 13.4X faster\nšŸ• 6 lines, 2 for-loops\n\n\npad_by_time()\nā›³ 2 lines, fills gaps in timeseries\nāŒ No equivalent\n\n\nanomalize()\nšŸ“ˆ 2 lines, detects and corrects anomalies\nāŒ No equivalent\n\n\naugment_timeseries_signature()\nšŸ“… 1 line, all calendar features\nšŸ• 30 lines of dt extractors\n\n\naugment_rolling()\nšŸŽļø 10X to 3500X faster\nšŸ¢ Slow Rolling Operations\n\n\n\nAs evident from the table, pytimetk is not just about speed; it also simplifies your codebase. For example, summarize_by_time(), converts a 6-line, double for-loop routine in pandas into a concise 2-line operation. And with the polars engine, get results 13.4X faster than pandas!\nSimilarly, plot_timeseries() dramatically streamlines the plotting process, encapsulating what would typically require 16 lines of matplotlib code into a mere 2-line command in pytimetk, without sacrificing customization or quality. And with plotly and plotnine engines, you can create interactive plots and beautiful static visualizations with just a few lines of code.\nFor calendar features, pytimetk offers augment_timeseries_signature() which cuts down on over 30 lines of pandas dt extractions. For rolling features, pytimetk offers augment_rolling(), which is 10X to 3500X faster than pandas. It also offers pad_by_time() to fill gaps in your time series data, and anomalize() to detect and correct anomalies in your time series data.\nJoin the revolution in time series analysis. 
Reduce your code complexity, increase your productivity, and harness the speed that pytimetk brings to your workflows.\nExplore more at our pytimetk homepage.\n\n\n2 šŸš€ Installation\nInstall the Latest Stable Version:\npip install pytimetk\nAlternatively, install the Development GitHub Version:\npip install git+https://github.com/business-science/pytimetk.git\n\n\n3 šŸ Quick Start: A Monthly Sales Analysis\nThis is a simple exercise to showcase the power of summarize_by_time():\n\nImport Libraries & Data\nFirst, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the ā€œbike_sales_sampleā€ dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains ā€œorderlinesā€ for orders recieved. The order_date column contains timestamps. We can use this column to peform sales aggregations (e.g.Ā total revenue).\n\n\n\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows Ɨ 13 columns\n\n\n\n\n\nUsing summarize_by_time() for a Sales Analysis\nYour company might be interested in sales patterns for various categories of bicycles. We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandasā€™s groupby() method to group the DataFrame on category_1\nNext, use timetkā€™s summarize_by_time() method to apply the sum function my month start (ā€œMSā€) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default). The default engine is \"pandas\". 
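As a side note (this variant is not in the original guide): a hedged sketch of the same aggregation with wide_format = True, which presumably pivots each category_1 group into its own column instead of stacking rows in long format. The parameter is presumably a boolean flag, so True/False rather than a quoted string:

```python
# Hedged sketch: wide-format variant of the grouped monthly aggregation.
# Assumes wide_format=True spreads the category_1 groups across columns.
summary_category_1_wide_df = (
    df
        .groupby("category_1")
        .summarize_by_time(
            date_column  = 'order_date',
            value_column = 'total_price',
            freq         = "MS",
            agg_func     = 'sum',
            wide_format  = True
        )
)

summary_category_1_wide_df.head()
```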
Selecting engine = \"polars\" allows us to improve the speed of the function.\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False,\n engine = \"polars\"\n )\n\n# Quickly examine each column\nsummary_category_1_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 24 rows of 3 columns\ncategory_1: object ['Mountain', 'Mountain', 'Mountain', ...\norder_date: datetime64[ns] [Timestamp('2011-01-01 00:00:00'), T ...\ntotal_price_sum: int64 [221490, 660555, 358855, 1075975, 45 ...\n\n\n\n\nVisualizing Sales Patterns\n\n\n\n\n\n\nNow available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. This is useful in time series where we often deal with 100s of time series groups.\nThe default engine in ā€œplotnineā€ for static plotting. Setting the engine = \"plotly\" returns an interactive plot.\n\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n smooth_frac = 0.8,\n engine = \"plotly\"\n )\n\n\n \n\n\n\n\n\n4 šŸ“š Documentation\nNext step? Learn more with the pytimetk documentation\n\nšŸ“ˆ Overview\nšŸ Getting Started\nšŸ—ŗļø Beginner Guides\nšŸ“˜Applied Data Science Tutorials with PyTimeTK\nšŸŽļøSpeed Comparisons\nšŸ“„ API Reference\n\n\n\n5 šŸ» Contributing\nInterested in helping us make this the best Python package for time series analysis? Weā€™d love your help.\nFollow these instructions to Contribute.\n\n\n6 šŸ† More Coming Soonā€¦\nWe are in the early stages of development. But itā€™s obvious the potential for pytimetk now in Python. šŸ\n\nPlease ā­ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." + }, + { + "objectID": "getting-started/02_quick_start.html", + "href": "getting-started/02_quick_start.html", + "title": "Quick Start", + "section": "", + "text": "This is a simple exercise to showcase the power of our 2 most popular function:\n\nsummarize_by_time()\nplot_timeseries()\n\n\n\nFirst, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the ā€œbike_sales_sampleā€ dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains ā€œorderlinesā€ for orders recieved. The order_date column contains timestamps. 
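(Optional shortcut, and an assumption on our part: tk.load_dataset() accepts a parse_dates argument for the datasets loaded later in these docs, so it likely works here too and saves the separate pd.to_datetime() step shown just below.)

```python
import pytimetk as tk

# Assumed shortcut: parse order_date while loading. If this dataset does not
# support parse_dates, use the explicit pd.to_datetime() conversion below.
df = tk.load_dataset('bike_sales_sample', parse_dates=['order_date'])
```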
We can use this column to peform sales aggregations (e.g.Ā total revenue).\n\n\n\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows Ɨ 13 columns\n\n\n\n\n\n\nYour company might be interested in sales patterns for various categories of bicycles. We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandasā€™s groupby() method to group the DataFrame on category_1\nNext, use timetkā€™s summarize_by_time() method to apply the sum function my month start (ā€œMSā€) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default).\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\n\nCode\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False\n )\n\n# First 5 rows shown\nsummary_category_1_df.head()\n\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n\n\n1\nMountain\n2011-02-01\n660555\n\n\n2\nMountain\n2011-03-01\n358855\n\n\n3\nMountain\n2011-04-01\n1075975\n\n\n4\nMountain\n2011-05-01\n450440\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNow available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. 
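To make the groupby-awareness concrete: the same method call works on a plain DataFrame or on a grouped one. A minimal sketch (illustrative only) using the df and the summarize_by_time() call from this page:

```python
# Ungrouped: one monthly total across all categories
df.summarize_by_time('order_date', 'total_price', freq='MS', agg_func='sum')

# Grouped: the identical call, but one monthly total per category_1
(
    df
        .groupby('category_1')
        .summarize_by_time('order_date', 'total_price', freq='MS', agg_func='sum')
)
```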
This is useful in time series where we often deal with 100s of time series groups.\n\n\nCode\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price',\n smooth_frac = 0.8\n )" + }, + { + "objectID": "getting-started/02_quick_start.html#import-libraries-data", + "href": "getting-started/02_quick_start.html#import-libraries-data", + "title": "Quick Start", + "section": "", + "text": "First, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the ā€œbike_sales_sampleā€ dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains ā€œorderlinesā€ for orders recieved. The order_date column contains timestamps. We can use this column to peform sales aggregations (e.g.Ā total revenue).\n\n\n\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows Ɨ 13 columns" + }, + { + "objectID": "getting-started/02_quick_start.html#using-summarize_by_time-for-a-sales-analysis", + "href": "getting-started/02_quick_start.html#using-summarize_by_time-for-a-sales-analysis", + "title": "Quick Start", + "section": "", + "text": "Your company might be interested in sales patterns for various categories of bicycles. 
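(A quick plain-pandas check, not part of the original walkthrough, to see which categories are available before aggregating:)

```python
# For this dataset this should list the two categories summarized below:
# 'Mountain' and 'Road'.
df['category_1'].unique()
```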
We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandasā€™s groupby() method to group the DataFrame on category_1\nNext, use timetkā€™s summarize_by_time() method to apply the sum function my month start (ā€œMSā€) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default).\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\n\nCode\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False\n )\n\n# First 5 rows shown\nsummary_category_1_df.head()\n\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n\n\n1\nMountain\n2011-02-01\n660555\n\n\n2\nMountain\n2011-03-01\n358855\n\n\n3\nMountain\n2011-04-01\n1075975\n\n\n4\nMountain\n2011-05-01\n450440" + }, + { + "objectID": "getting-started/02_quick_start.html#visualizing-sales-patterns", + "href": "getting-started/02_quick_start.html#visualizing-sales-patterns", + "title": "Quick Start", + "section": "", + "text": "Now available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. This is useful in time series where we often deal with 100s of time series groups.\n\n\nCode\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price',\n smooth_frac = 0.8\n )" + }, + { + "objectID": "guides/02_timetk_concepts.html", + "href": "guides/02_timetk_concepts.html", + "title": "PyTimeTK Basics", + "section": "", + "text": "PyTimeTK has one mission: To make time series analysis simpler, easier, and faster in Python. This goal requires some opinionated ways of treating time series in Python. We will conceptually lay out how pytimetk can help.\nLetā€™s first start with how to think about time series data conceptually. Time series data has 3 core properties." + }, + { + "objectID": "guides/02_timetk_concepts.html#type-1-pandas-dataframe-operations", + "href": "guides/02_timetk_concepts.html#type-1-pandas-dataframe-operations", + "title": "PyTimeTK Basics", + "section": "2.1 Type 1: Pandas DataFrame Operations", + "text": "2.1 Type 1: Pandas DataFrame Operations\nBefore we start using pytimetk, letā€™s make sure our data is set up properly.\n\nTimetk Data Format Compliance\n\n\n\n\n\n\n3 Core Properties Must Be Upheald\n\n\n\n\n\nA pytimetk-Compliant Pandas DataFrame must have:\n\nTime Series Index: A Time Stamp column containing datetime64 values\nValue Column(s): The value column(s) containing float or int values\nGroup Column(s): Optionally for grouped time series analysis, one or more columns containg str or categorical values (shown as an object)\n\nIf these are NOT upheld, this will impact your ability to use pytimetk DataFrame operations.\n\n\n\n\n\n\n\n\n\nInspect the DataFrame\n\n\n\n\n\nUse the tk.glimpse() method to check compliance.\n\n\n\nUsing pytimetk glimpse() method, we can see that we have a compliant data frame with a date column containing datetime64 and a value column containing float64. 
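To see the three core properties in isolation, here is a toy compliant frame built from scratch with plain pandas (the column names 'id', 'date', and 'value' are illustrative, not required):

```python
import pandas as pd

toy_df = pd.DataFrame({
    'id':    ['A', 'A', 'B', 'B'],                          # group column -> object
    'date':  pd.to_datetime(['2023-01-01', '2023-01-02',
                             '2023-01-01', '2023-01-02']),  # datetime64[ns]
    'value': [10.0, 12.5, 7.0, 8.25],                       # float64
})

toy_df.dtypes  # confirms the dtypes match the three core properties
```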
For grouped analysis we have the id column containing object dtype.\n\n\nCode\n# Tip: Inspect for compliance with glimpse()\nm4_daily_df.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 9743 rows of 3 columns\nid: object ['D10', 'D10', 'D10', 'D10', 'D10', 'D10', 'D1 ...\ndate: datetime64[ns] [Timestamp('2014-07-03 00:00:00'), Timestamp(' ...\nvalue: float64 [2076.2, 2073.4, 2048.7, 2048.9, 2006.4, 2017. ...\n\n\n\n\nGrouped Time Series Analysis with Summarize By Time\nFirst, inspect how the summarize_by_time function works by calling help().\n\n\nCode\n# Review the summarize_by_time documentation (output not shown)\nhelp(tk.summarize_by_time)\n\n\n\n\n\n\n\n\nHelp Doc Info: summarize_by_time()\n\n\n\n\n\n\nThe first parameter is data, indicating this is a DataFrame operation.\nThe Examples show different use cases for how to apply the function on a DataFrame\n\n\n\n\nLetā€™s test the summarize_by_time() DataFrame operation out using the grouped approach with method chaining. DataFrame operations can be used as Pandas methods with method-chaining, which allows us to more succinctly apply time series operations.\n\n\nCode\n# Grouped Summarize By Time with Method Chaining\ndf_summarized = (\n m4_daily_df\n .groupby('id')\n .summarize_by_time(\n date_column = 'date',\n value_column = 'value',\n freq = 'QS', # QS = Quarter Start\n agg_func = [\n 'mean', \n 'median', \n 'min',\n ('q25', lambda x: np.quantile(x, 0.25)),\n ('q75', lambda x: np.quantile(x, 0.75)),\n 'max',\n ('range',lambda x: x.max() - x.min()),\n ],\n )\n)\n\ndf_summarized\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue_mean\nvalue_median\nvalue_min\nvalue_q25\nvalue_q75\nvalue_max\nvalue_range\n\n\n\n\n0\nD10\n2014-07-01\n1960.078889\n1979.90\n1781.6\n1915.225\n2002.575\n2076.2\n294.6\n\n\n1\nD10\n2014-10-01\n2184.586957\n2154.05\n2022.8\n2125.075\n2274.150\n2344.9\n322.1\n\n\n2\nD10\n2015-01-01\n2309.830000\n2312.30\n2209.6\n2284.575\n2342.150\n2392.4\n182.8\n\n\n3\nD10\n2015-04-01\n2344.481319\n2333.00\n2185.1\n2301.750\n2391.000\n2499.8\n314.7\n\n\n4\nD10\n2015-07-01\n2156.754348\n2186.70\n1856.6\n1997.250\n2289.425\n2368.1\n511.5\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n105\nD500\n2011-07-01\n9727.321739\n9745.55\n8964.5\n9534.125\n10003.900\n10463.9\n1499.4\n\n\n106\nD500\n2011-10-01\n8175.565217\n7897.00\n6755.0\n7669.875\n8592.575\n9860.0\n3105.0\n\n\n107\nD500\n2012-01-01\n8291.317582\n8412.60\n7471.5\n7814.800\n8677.850\n8980.7\n1509.2\n\n\n108\nD500\n2012-04-01\n8654.020879\n8471.10\n8245.6\n8389.850\n9017.250\n9349.2\n1103.6\n\n\n109\nD500\n2012-07-01\n8770.502353\n8690.50\n8348.1\n8604.400\n8846.000\n9545.3\n1197.2\n\n\n\n\n110 rows Ɨ 9 columns\n\n\n\n\n\n\n\n\n\nKey Takeaways: summarize_by_time()\n\n\n\n\n\n\nThe data must comply with the 3 core properties (date column, value column(s), and group column(s))\nThe aggregation functions were applied by combination of group (id) and resample (Quarter Start)\nThe result was a pandas DataFrame with group column, resampled date column, and summary values (mean, median, min, 25th-quantile, etc)\n\n\n\n\n\n\nAnother DataFrame Example: Creating 29 Engineered Features\nLetā€™s examine another DataFrame function, tk.augment_timeseries_signature(). 
Feel free to inspect the documentation with help(tk.augment_timeseries_signature).\n\n\nCode\n# Creating 29 engineered features from the date column\n# Not run: help(tk.augment_timeseries_signature)\ndf_augmented = (\n m4_daily_df\n .augment_timeseries_signature(date_column = 'date')\n)\n\ndf_augmented.head()\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n5 rows Ɨ 32 columns\n\n\n\n\n\n\n\n\n\nKey Takeaways: augment_timeseries_signature()\n\n\n\n\n\n\nThe data must comply with the 1 of the 3 core properties (date column)\nThe result was a pandas DataFrame with 29 time series features that can be used for Machine Learning and Forecasting\n\n\n\n\n\n\nMaking Future Dates with Future Frame\nA common time series task before forecasting with machine learning models is to make a future DataFrame some length_out into the future. You can do this with tk.future_frame(). Hereā€™s how.\n\n\nCode\n# Preparing a time series data set for Machine Learning Forecasting\nfull_augmented_df = (\n m4_daily_df \n .groupby('id')\n .future_frame('date', length_out = 365)\n .augment_timeseries_signature('date')\n)\nfull_augmented_df\n\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n11198\nD500\n2013-09-19\nNaN\n1379548800\n2013\n2013\n0\n0\n0\n2\n...\n19\n81\n262\n0\n0\n0\n0\n0\n0\nam\n\n\n11199\nD500\n2013-09-20\nNaN\n1379635200\n2013\n2013\n0\n0\n0\n2\n...\n20\n82\n263\n0\n0\n0\n0\n0\n0\nam\n\n\n11200\nD500\n2013-09-21\nNaN\n1379721600\n2013\n2013\n0\n0\n0\n2\n...\n21\n83\n264\n0\n0\n0\n0\n0\n0\nam\n\n\n11201\nD500\n2013-09-22\nNaN\n1379808000\n2013\n2013\n0\n0\n0\n2\n...\n22\n84\n265\n1\n0\n0\n0\n0\n0\nam\n\n\n11202\nD500\n2013-09-23\nNaN\n1379894400\n2013\n2013\n0\n0\n0\n2\n...\n23\n85\n266\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n11203 rows Ɨ 32 columns\n\n\n\nWe can then get the future data by keying in on the data with value column that is missing (np.nan).\n\n\nCode\n# Get the future data (just the observations that 
haven't happened yet)\nfuture_df = (\n full_augmented_df\n .query('value.isna()')\n)\nfuture_df\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n9743\nD10\n2016-05-07\nNaN\n1462579200\n2016\n2016\n0\n0\n1\n1\n...\n7\n37\n128\n0\n0\n0\n0\n0\n0\nam\n\n\n9744\nD10\n2016-05-08\nNaN\n1462665600\n2016\n2016\n0\n0\n1\n1\n...\n8\n38\n129\n1\n0\n0\n0\n0\n0\nam\n\n\n9745\nD10\n2016-05-09\nNaN\n1462752000\n2016\n2016\n0\n0\n1\n1\n...\n9\n39\n130\n0\n0\n0\n0\n0\n0\nam\n\n\n9746\nD10\n2016-05-10\nNaN\n1462838400\n2016\n2016\n0\n0\n1\n1\n...\n10\n40\n131\n0\n0\n0\n0\n0\n0\nam\n\n\n9747\nD10\n2016-05-11\nNaN\n1462924800\n2016\n2016\n0\n0\n1\n1\n...\n11\n41\n132\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n11198\nD500\n2013-09-19\nNaN\n1379548800\n2013\n2013\n0\n0\n0\n2\n...\n19\n81\n262\n0\n0\n0\n0\n0\n0\nam\n\n\n11199\nD500\n2013-09-20\nNaN\n1379635200\n2013\n2013\n0\n0\n0\n2\n...\n20\n82\n263\n0\n0\n0\n0\n0\n0\nam\n\n\n11200\nD500\n2013-09-21\nNaN\n1379721600\n2013\n2013\n0\n0\n0\n2\n...\n21\n83\n264\n0\n0\n0\n0\n0\n0\nam\n\n\n11201\nD500\n2013-09-22\nNaN\n1379808000\n2013\n2013\n0\n0\n0\n2\n...\n22\n84\n265\n1\n0\n0\n0\n0\n0\nam\n\n\n11202\nD500\n2013-09-23\nNaN\n1379894400\n2013\n2013\n0\n0\n0\n2\n...\n23\n85\n266\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n1460 rows Ɨ 32 columns" + }, + { + "objectID": "guides/02_timetk_concepts.html#type-2-pandas-series-operations", + "href": "guides/02_timetk_concepts.html#type-2-pandas-series-operations", + "title": "PyTimeTK Basics", + "section": "2.2 Type 2: Pandas Series Operations", + "text": "2.2 Type 2: Pandas Series Operations\nThe main difference between a DataFrame operation and a Series operation is that we are operating on an array of values from typically one of the following dtypes:\n\nTimestamps (datetime64)\nNumeric (float64 or int64)\n\nThe first argument of Series operations that operate on Timestamps will always be idx.\nLetā€™s take a look at one shall we? Weā€™ll start with a common action: Making future time series from an existing time series with a regular frequency.\n\nThe Make Future Time Series Function\nSay we have a monthly sequence of timestamps. What if we want to create a forecast where we predict 12 months into the future? Well, we will need to create 12 future timestamps. 
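(For contrast, a plain-pandas sketch of the same idea: anchor a new date_range() on the last observed stamp and drop the overlap. The pytimetk helper shown next is handier because it infers the frequency from the index instead of requiring it to be restated.)

```python
import pandas as pd

dates_dt = pd.date_range("2023-01", "2024-01", freq="MS")

# Next 12 monthly stamps after the last observation (plain pandas)
future_12 = pd.date_range(dates_dt[-1], periods=13, freq="MS")[1:]
future_12
```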
Hereā€™s how.\nFirst create a pd.date_range() with dates starting at the beginning of each month.\n\n\nCode\n# Make a monthly date range\ndates_dt = pd.date_range(\"2023-01\", \"2024-01\", freq=\"MS\")\ndates_dt\n\n\nDatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',\n '2023-05-01', '2023-06-01', '2023-07-01', '2023-08-01',\n '2023-09-01', '2023-10-01', '2023-11-01', '2023-12-01',\n '2024-01-01'],\n dtype='datetime64[ns]', freq='MS')\n\n\nNext, use tk.make_future_timeseries() to create the next 12 timestamps in the sequence.\n\nPandas SeriesDateTimeIndex\n\n\n\n\nCode\n# Pandas Series: Future Dates\nfuture_series = pd.Series(dates_dt).make_future_timeseries(12)\nfuture_series\n\n\n0 2024-02-01\n1 2024-03-01\n2 2024-04-01\n3 2024-05-01\n4 2024-06-01\n5 2024-07-01\n6 2024-08-01\n7 2024-09-01\n8 2024-10-01\n9 2024-11-01\n10 2024-12-01\n11 2025-01-01\ndtype: datetime64[ns]\n\n\n\n\n\n\nCode\n# DateTimeIndex: Future Dates\nfuture_dt = tk.make_future_timeseries(\n idx = dates_dt,\n length_out = 12\n)\nfuture_dt\n\n\n0 2024-02-01\n1 2024-03-01\n2 2024-04-01\n3 2024-05-01\n4 2024-06-01\n5 2024-07-01\n6 2024-08-01\n7 2024-09-01\n8 2024-10-01\n9 2024-11-01\n10 2024-12-01\n11 2025-01-01\ndtype: datetime64[ns]\n\n\n\n\n\nWe can combine the actual and future timestamps into one combined timeseries.\n\n\nCode\n# Combining the 2 series and resetting the index\ncombined_timeseries = (\n pd.concat(\n [pd.Series(dates_dt), pd.Series(future_dt)],\n axis=0\n )\n .reset_index(drop = True)\n)\n\ncombined_timeseries\n\n\n0 2023-01-01\n1 2023-02-01\n2 2023-03-01\n3 2023-04-01\n4 2023-05-01\n5 2023-06-01\n6 2023-07-01\n7 2023-08-01\n8 2023-09-01\n9 2023-10-01\n10 2023-11-01\n11 2023-12-01\n12 2024-01-01\n13 2024-02-01\n14 2024-03-01\n15 2024-04-01\n16 2024-05-01\n17 2024-06-01\n18 2024-07-01\n19 2024-08-01\n20 2024-09-01\n21 2024-10-01\n22 2024-11-01\n23 2024-12-01\n24 2025-01-01\ndtype: datetime64[ns]\n\n\nNext, weā€™ll take a look at how to go from an irregular time series to a regular time series.\n\n\nFlooring Dates\nAn example is tk.floor_date, which is used to round down dates. See help(tk.floor_date).\nFlooring dates is often used as part of a strategy to go from an irregular time series to regular by combining with an aggregation. Often summarize_by_time() is used (Iā€™ll share why shortly). But conceptually, date flooring is the secret.\n\nWith FlooringWithout Flooring\n\n\n\n\nCode\n# Monthly flooring rounds dates down to 1st of the month\nm4_daily_df['date'].floor_date(unit = \"M\")\n\n\n0 2014-07-01\n1 2014-07-01\n2 2014-07-01\n3 2014-07-01\n4 2014-07-01\n ... \n9738 2014-07-01\n9739 2014-07-01\n9740 2014-07-01\n9741 2014-07-01\n9742 2014-07-01\nName: date, Length: 9743, dtype: datetime64[ns]\n\n\n\n\n\n\nCode\n# Before Flooring\nm4_daily_df['date']\n\n\n0 2014-07-03\n1 2014-07-03\n2 2014-07-03\n3 2014-07-03\n4 2014-07-03\n ... 
\n9738 2014-07-03\n9739 2014-07-03\n9740 2014-07-03\n9741 2014-07-03\n9742 2014-07-03\nName: date, Length: 9743, dtype: datetime64[ns]\n\n\n\n\n\nThis ā€œdate flooringā€ operation can be useful for creating date groupings.\n\n\nCode\n# Adding a date group with floor_date()\ndates_grouped_by_month = (\n m4_daily_df\n .assign(date_group = lambda x: x['date'].floor_date(\"M\"))\n)\n\ndates_grouped_by_month\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_group\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2014-07-01\n\n\n1\nD10\n2014-07-03\n2073.4\n2014-07-01\n\n\n2\nD10\n2014-07-03\n2048.7\n2014-07-01\n\n\n3\nD10\n2014-07-03\n2048.9\n2014-07-01\n\n\n4\nD10\n2014-07-03\n2006.4\n2014-07-01\n\n\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2014-07-03\n9418.8\n2014-07-01\n\n\n9739\nD500\n2014-07-03\n9365.7\n2014-07-01\n\n\n9740\nD500\n2014-07-03\n9445.9\n2014-07-01\n\n\n9741\nD500\n2014-07-03\n9497.9\n2014-07-01\n\n\n9742\nD500\n2014-07-03\n9545.3\n2014-07-01\n\n\n\n\n9743 rows Ɨ 4 columns\n\n\n\nWe can then do grouped operations.\n\n\nCode\n# Example of a grouped operation with floored dates\nsummary_df = (\n dates_grouped_by_month\n .drop('date', axis=1) \\\n .groupby(['id', 'date_group'])\n .mean() \\\n .reset_index()\n)\n\nsummary_df\n\n\n\n\n\n\n\n\n\nid\ndate_group\nvalue\n\n\n\n\n0\nD10\n2014-07-01\n2261.606825\n\n\n1\nD160\n2014-07-01\n9243.155254\n\n\n2\nD410\n2014-07-01\n8259.786346\n\n\n3\nD500\n2014-07-01\n8287.728789\n\n\n\n\n\n\n\nOf course for this operation, we can do it faster with summarize_by_time() (and itā€™s much more flexible).\n\n\nCode\n# Summarize by time is less code and more flexible\n(\n m4_daily_df \n .groupby('id')\n .summarize_by_time(\n 'date', 'value', \n freq = \"MS\",\n agg_func = ['mean', 'median', 'min', 'max']\n )\n)\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue_mean\nvalue_median\nvalue_min\nvalue_max\n\n\n\n\n0\nD10\n2014-07-01\n2261.606825\n2302.30\n1781.60\n2649.30\n\n\n1\nD160\n2014-07-01\n9243.155254\n10097.30\n1734.90\n19432.50\n\n\n2\nD410\n2014-07-01\n8259.786346\n8382.81\n6309.38\n9540.62\n\n\n3\nD500\n2014-07-01\n8287.728789\n7662.10\n4172.10\n14954.10\n\n\n\n\n\n\n\nAnd thatā€™s the core idea behind pytimetk, writing less code and getting more.\nNext, letā€™s do one more function. The brother of augment_timeseries_signature()ā€¦\n\n\nThe Get Time Series Signature Function\nThis function takes a pandas Series or DateTimeIndex and returns a DataFrame containing the 29 engineered features.\nStart with either a DateTimeIndexā€¦\n\n\nCode\ntimestamps_dt = pd.date_range(\"2023\", \"2024\", freq = \"D\")\ntimestamps_dt\n\n\nDatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',\n '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',\n '2023-01-09', '2023-01-10',\n ...\n '2023-12-23', '2023-12-24', '2023-12-25', '2023-12-26',\n '2023-12-27', '2023-12-28', '2023-12-29', '2023-12-30',\n '2023-12-31', '2024-01-01'],\n dtype='datetime64[ns]', length=366, freq='D')\n\n\nā€¦ Or a Pandas Series.\n\n\nCode\ntimestamps_series = pd.Series(timestamps_dt)\ntimestamps_series\n\n\n0 2023-01-01\n1 2023-01-02\n2 2023-01-03\n3 2023-01-04\n4 2023-01-05\n ... 
\n361 2023-12-28\n362 2023-12-29\n363 2023-12-30\n364 2023-12-31\n365 2024-01-01\nLength: 366, dtype: datetime64[ns]\n\n\nAnd you can use the pandas Series function, tk.get_timeseries_signature() to create 29 features from the date sequence.\n\nPandas SeriesDateTimeIndex\n\n\n\n\nCode\n# Pandas series: get_timeseries_signature\ntimestamps_series.get_timeseries_signature()\n\n\n\n\n\n\n\n\n\nidx\nidx_index_num\nidx_year\nidx_year_iso\nidx_yearstart\nidx_yearend\nidx_leapyear\nidx_half\nidx_quarter\nidx_quarteryear\n...\nidx_mday\nidx_qday\nidx_yday\nidx_weekend\nidx_hour\nidx_minute\nidx_second\nidx_msecond\nidx_nsecond\nidx_am_pm\n\n\n\n\n0\n2023-01-01\n1672531200\n2023\n2022\n1\n0\n0\n1\n1\n2023Q1\n...\n1\n1\n1\n1\n0\n0\n0\n0\n0\nam\n\n\n1\n2023-01-02\n1672617600\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n2\n2\n2\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n2023-01-03\n1672704000\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n3\n3\n3\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n2023-01-04\n1672790400\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n4\n4\n4\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n2023-01-05\n1672876800\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n5\n5\n5\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n361\n2023-12-28\n1703721600\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n28\n89\n362\n0\n0\n0\n0\n0\n0\nam\n\n\n362\n2023-12-29\n1703808000\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n29\n90\n363\n0\n0\n0\n0\n0\n0\nam\n\n\n363\n2023-12-30\n1703894400\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n30\n91\n364\n0\n0\n0\n0\n0\n0\nam\n\n\n364\n2023-12-31\n1703980800\n2023\n2023\n0\n1\n0\n2\n4\n2023Q4\n...\n31\n92\n365\n1\n0\n0\n0\n0\n0\nam\n\n\n365\n2024-01-01\n1704067200\n2024\n2024\n1\n0\n1\n1\n1\n2024Q1\n...\n1\n1\n1\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n366 rows Ɨ 30 columns\n\n\n\n\n\n\n\nCode\n# DateTimeIndex: get_timeseries_signature\ntk.get_timeseries_signature(timestamps_dt)\n\n\n\n\n\n\n\n\n\nidx\nidx_index_num\nidx_year\nidx_year_iso\nidx_yearstart\nidx_yearend\nidx_leapyear\nidx_half\nidx_quarter\nidx_quarteryear\n...\nidx_mday\nidx_qday\nidx_yday\nidx_weekend\nidx_hour\nidx_minute\nidx_second\nidx_msecond\nidx_nsecond\nidx_am_pm\n\n\n\n\n0\n2023-01-01\n1672531200\n2023\n2022\n1\n0\n0\n1\n1\n2023Q1\n...\n1\n1\n1\n1\n0\n0\n0\n0\n0\nam\n\n\n1\n2023-01-02\n1672617600\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n2\n2\n2\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n2023-01-03\n1672704000\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n3\n3\n3\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n2023-01-04\n1672790400\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n4\n4\n4\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n2023-01-05\n1672876800\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n5\n5\n5\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n361\n2023-12-28\n1703721600\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n28\n89\n362\n0\n0\n0\n0\n0\n0\nam\n\n\n362\n2023-12-29\n1703808000\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n29\n90\n363\n0\n0\n0\n0\n0\n0\nam\n\n\n363\n2023-12-30\n1703894400\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n30\n91\n364\n0\n0\n0\n0\n0\n0\nam\n\n\n364\n2023-12-31\n1703980800\n2023\n2023\n0\n1\n0\n2\n4\n2023Q4\n...\n31\n92\n365\n1\n0\n0\n0\n0\n0\nam\n\n\n365\n2024-01-01\n1704067200\n2024\n2024\n1\n0\n1\n1\n1\n2024Q1\n...\n1\n1\n1\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n366 rows Ɨ 30 columns" + }, + { + "objectID": "guides/01_visualization.html", + "href": "guides/01_visualization.html", + "title": "Data Visualization", + "section": "", + 
"text": "How this guide benefits you\n\n\n\n\n\nThis guide covers how to use the plot_timeseries() for data visualization. Once you understand how it works, you can apply explore time series data easier than ever.\nThis tutorial focuses on, plot_timeseries(), a workhorse time-series plotting function that:" + }, + { + "objectID": "guides/01_visualization.html#plotting-a-single-time-series", + "href": "guides/01_visualization.html#plotting-a-single-time-series", + "title": "Data Visualization", + "section": "2.1 Plotting a Single Time Series", + "text": "2.1 Plotting a Single Time Series\nLetā€™s start with a popular time series, taylor_30_min, which includes energy demand in megawatts at a sampling interval of 30-minutes. This is a single time series.\n\n\nCode\n# Import a Time Series Data Set\ntaylor_30_min = tk.load_dataset(\"taylor_30_min\", parse_dates = ['date'])\ntaylor_30_min\n\n\n\n\n\n\n\n\n\ndate\nvalue\n\n\n\n\n0\n2000-06-05 00:00:00+00:00\n22262\n\n\n1\n2000-06-05 00:30:00+00:00\n21756\n\n\n2\n2000-06-05 01:00:00+00:00\n22247\n\n\n3\n2000-06-05 01:30:00+00:00\n22759\n\n\n4\n2000-06-05 02:00:00+00:00\n22549\n\n\n...\n...\n...\n\n\n4027\n2000-08-27 21:30:00+00:00\n27946\n\n\n4028\n2000-08-27 22:00:00+00:00\n27133\n\n\n4029\n2000-08-27 22:30:00+00:00\n25996\n\n\n4030\n2000-08-27 23:00:00+00:00\n24610\n\n\n4031\n2000-08-27 23:30:00+00:00\n23132\n\n\n\n\n4032 rows Ɨ 2 columns\n\n\n\nThe plot_timeseries() function generates an interactive plotly chart by default.\n\nSimply provide the date variable (time-based column, date_column) and the numeric variable (value_column) that changes over time as the first 2 arguments.\nBy default, the plotting engine is plotly, which is interactive and excellent for data exploration and apps. However, if you require static plots for reports, you can set the engine to engine = ā€˜plotnineā€™ or engine = ā€˜matplotlibā€™.\n\nInteractive plot\n\n\nCode\ntaylor_30_min.plot_timeseries('date', 'value')\n\n\n\n \n\n\nStatic plot\n\n\nCode\ntaylor_30_min.plot_timeseries(\n 'date', 'value',\n engine = 'plotnine'\n)\n\n\n\n\n\n<Figure Size: (700 x 500)>" + }, + { + "objectID": "guides/01_visualization.html#plotting-groups", + "href": "guides/01_visualization.html#plotting-groups", + "title": "Data Visualization", + "section": "2.2 Plotting Groups", + "text": "2.2 Plotting Groups\nNext, letā€™s move on to a dataset with time series groups, m4_monthly, which is a sample of 4 time series from the M4 competition that are sampled at a monthly frequency.\n\n\nCode\n# Import a Time Series Data Set\nm4_monthly = tk.load_dataset(\"m4_monthly\", parse_dates = ['date'])\nm4_monthly\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nM1\n1976-06-01\n8000\n\n\n1\nM1\n1976-07-01\n8350\n\n\n2\nM1\n1976-08-01\n8570\n\n\n3\nM1\n1976-09-01\n7700\n\n\n4\nM1\n1976-10-01\n7080\n\n\n...\n...\n...\n...\n\n\n1569\nM1000\n2015-02-01\n880\n\n\n1570\nM1000\n2015-03-01\n800\n\n\n1571\nM1000\n2015-04-01\n1140\n\n\n1572\nM1000\n2015-05-01\n970\n\n\n1573\nM1000\n2015-06-01\n1430\n\n\n\n\n1574 rows Ɨ 3 columns\n\n\n\nVisualizing grouped data is as simple as grouping the data set with groupby() before run it into the plot_timeseries() function. There are 2 methods:\n\nFacets\nPlotly Dropdown\n\n\nFacets (Subgroups on one plot)\nThis is great to see all time series in one plot. 
Here are the key points:\n\nGroups can be added using the pandas groupby().\nThese groups are then converted into facets.\nUsing facet_ncol = 2 returns a 2-column faceted plot.\nSetting facet_scales = \"free\" allows the x and y-axes of each plot to scale independently of the other plots.\n\n\n\nCode\nm4_monthly.groupby('id').plot_timeseries(\n 'date', 'value', \n facet_ncol = 2, \n facet_scales = \"free\"\n)\n\n\n\n \n\n\n\n\nPlotly Dropdown\nSometimes you have many groups and would prefer to see one plot per group. This can be accomplished with plotly_dropdown. You can adjust the x and y position as follows:\n\n\nCode\nm4_monthly.groupby('id').plot_timeseries(\n 'date', 'value', \n plotly_dropdown=True,\n plotly_dropdown_x=0,\n plotly_dropdown_y=1\n)\n\n\n\n \n\n\nThe groups can also be vizualized in the same plot using color_column paramenter. Letā€™s come back to taylor_30_min dataframe.\n\n\nCode\n# load data\ntaylor_30_min = tk.load_dataset(\"taylor_30_min\", parse_dates = ['date'])\n\n# extract the month using pandas\ntaylor_30_min['month'] = pd.to_datetime(taylor_30_min['date']).dt.month\n\n# plot groups\ntaylor_30_min.plot_timeseries(\n 'date', 'value', \n color_column = 'month'\n)" + }, { "objectID": "guides/05_augmenting.html", "href": "guides/05_augmenting.html", diff --git a/docs/_site/sitemap.xml b/docs/_site/sitemap.xml index a62a7dea..2348c247 100644 --- a/docs/_site/sitemap.xml +++ b/docs/_site/sitemap.xml @@ -2,350 +2,354 @@ https://business-science.github.io/pytimetk/reference/ts_features.html - 2024-11-06T02:17:26.592Z + 2024-11-06T04:06:27.207Z https://business-science.github.io/pytimetk/reference/floor_date.html - 2024-11-06T02:17:23.752Z + 2024-11-06T04:06:24.296Z https://business-science.github.io/pytimetk/reference/ts_summary.html - 2024-11-06T02:17:21.443Z + 2024-11-06T04:06:21.750Z https://business-science.github.io/pytimetk/reference/timeseries_unit_frequency_table.html - 2024-11-06T02:17:18.802Z + 2024-11-06T04:06:18.937Z https://business-science.github.io/pytimetk/reference/transform_columns.html - 2024-11-06T02:17:17.103Z + 2024-11-06T04:06:17.208Z https://business-science.github.io/pytimetk/reference/augment_timeseries_signature.html - 2024-11-06T02:17:15.629Z + 2024-11-06T04:06:15.752Z https://business-science.github.io/pytimetk/reference/plot_anomalies.html - 2024-11-06T02:17:13.709Z + 2024-11-06T04:06:13.741Z https://business-science.github.io/pytimetk/reference/augment_ewm.html - 2024-11-06T02:17:11.023Z + 2024-11-06T04:06:10.896Z https://business-science.github.io/pytimetk/reference/make_future_timeseries.html - 2024-11-06T02:17:09.301Z + 2024-11-06T04:06:09.148Z https://business-science.github.io/pytimetk/reference/augment_roc.html - 2024-11-06T02:17:07.942Z + 2024-11-06T04:06:07.765Z https://business-science.github.io/pytimetk/reference/get_frequency_summary.html - 2024-11-06T02:17:06.337Z + 2024-11-06T04:06:06.111Z https://business-science.github.io/pytimetk/reference/augment_qsmomentum.html - 2024-11-06T02:17:04.634Z + 2024-11-06T04:06:04.379Z https://business-science.github.io/pytimetk/reference/augment_hilbert.html - 2024-11-06T02:17:02.496Z + 2024-11-06T04:06:02.148Z https://business-science.github.io/pytimetk/reference/progress_apply.html - 2024-11-06T02:16:59.750Z + 2024-11-06T04:05:59.291Z https://business-science.github.io/pytimetk/reference/flatten_multiindex_column_names.html - 2024-11-06T02:16:58.156Z + 2024-11-06T04:05:57.647Z https://business-science.github.io/pytimetk/reference/make_weekday_sequence.html - 2024-11-06T02:16:45.978Z + 
2024-11-06T04:05:55.699Z https://business-science.github.io/pytimetk/reference/augment_pct_change.html - 2024-11-06T02:16:44.613Z + 2024-11-06T04:05:54.385Z https://business-science.github.io/pytimetk/reference/get_trend_frequency.html - 2024-11-06T02:16:41.896Z + 2024-11-06T04:05:51.553Z https://business-science.github.io/pytimetk/reference/augment_fourier.html - 2024-11-06T02:16:40.338Z + 2024-11-06T04:05:49.970Z https://business-science.github.io/pytimetk/reference/correlate.html - 2024-11-06T02:16:16.482Z + 2024-11-06T04:05:48.009Z https://business-science.github.io/pytimetk/reference/filter_by_time.html - 2024-11-06T02:16:14.373Z + 2024-11-06T04:05:45.857Z https://business-science.github.io/pytimetk/reference/summarize_by_time.html - 2024-11-06T02:16:11.976Z + 2024-11-06T04:05:43.444Z https://business-science.github.io/pytimetk/reference/drop_zero_variance.html - 2024-11-06T02:16:08.761Z + 2024-11-06T04:05:40.163Z https://business-science.github.io/pytimetk/reference/index.html - 2024-11-06T02:16:07.024Z + 2024-11-06T04:05:38.350Z https://business-science.github.io/pytimetk/reference/get_pandas_frequency.html - 2024-11-06T02:16:05.021Z + 2024-11-06T04:05:36.219Z https://business-science.github.io/pytimetk/reference/palette_timetk.html - 2024-11-06T02:16:03.568Z + 2024-11-06T04:05:34.753Z https://business-science.github.io/pytimetk/reference/get_date_summary.html - 2024-11-06T02:16:02.070Z + 2024-11-06T04:05:33.247Z https://business-science.github.io/pytimetk/reference/is_holiday.html - 2024-11-06T02:16:00.398Z + 2024-11-06T04:05:31.473Z https://business-science.github.io/pytimetk/reference/get_available_datasets.html - 2024-11-06T02:15:58.808Z + 2024-11-06T04:05:29.878Z https://business-science.github.io/pytimetk/reference/augment_wavelet.html - 2024-11-06T02:15:57.063Z + 2024-11-06T04:05:28.145Z https://business-science.github.io/pytimetk/reference/augment_cmo.html - 2024-11-06T02:15:54.877Z + 2024-11-06T04:05:25.860Z https://business-science.github.io/pytimetk/reference/augment_expanding.html - 2024-11-06T02:15:51.885Z + 2024-11-06T04:05:22.709Z https://business-science.github.io/pytimetk/reference/ceil_date.html - 2024-11-06T02:15:49.821Z + 2024-11-06T04:05:20.598Z https://business-science.github.io/pytimetk/reference/augment_rolling_apply.html - 2024-11-06T02:15:47.928Z + 2024-11-06T04:05:18.669Z https://business-science.github.io/pytimetk/reference/augment_rolling.html - 2024-11-06T02:15:45.551Z + 2024-11-06T04:05:16.091Z https://business-science.github.io/pytimetk/performance/01_speed_comparisons.html - 2024-11-06T02:15:43.076Z + 2024-11-06T04:05:13.479Z https://business-science.github.io/pytimetk/tutorials/01_sales_crm.html - 2024-11-06T02:15:33.832Z + 2024-11-06T04:05:04.339Z https://business-science.github.io/pytimetk/tutorials/05_clustering.html - 2024-11-06T02:15:25.874Z + 2024-11-06T04:04:56.757Z https://business-science.github.io/pytimetk/tutorials/04_anomaly_detection.html - 2024-11-06T02:15:24.081Z + 2024-11-06T04:04:54.880Z https://business-science.github.io/pytimetk/guides/04_wrangling.html - 2024-11-06T02:15:22.036Z + 2024-11-06T04:04:52.761Z - https://business-science.github.io/pytimetk/guides/01_visualization.html - 2024-11-06T02:15:16.637Z + https://business-science.github.io/pytimetk/guides/07_timeseries_crossvalidation.html + 2024-11-06T04:04:46.853Z - https://business-science.github.io/pytimetk/guides/02_timetk_concepts.html - 2024-11-06T02:15:14.326Z + https://business-science.github.io/pytimetk/guides/06_anomalize.html + 2024-11-06T04:04:25.699Z - 
https://business-science.github.io/pytimetk/getting-started/02_quick_start.html - 2024-11-06T02:15:09.441Z + https://business-science.github.io/pytimetk/guides/03_pandas_frequency.html + 2024-11-06T04:04:20.180Z - https://business-science.github.io/pytimetk/index.html - 2024-11-06T02:15:07.644Z + https://business-science.github.io/pytimetk/getting-started/01_installation.html + 2024-11-06T04:04:17.628Z https://business-science.github.io/pytimetk/contributing.html - 2024-11-06T02:15:06.410Z + 2024-11-06T04:04:15.441Z - https://business-science.github.io/pytimetk/getting-started/01_installation.html - 2024-11-06T02:15:08.343Z + https://business-science.github.io/pytimetk/index.html + 2024-11-06T04:04:16.888Z - https://business-science.github.io/pytimetk/guides/03_pandas_frequency.html - 2024-11-06T02:15:10.423Z + https://business-science.github.io/pytimetk/getting-started/02_quick_start.html + 2024-11-06T04:04:18.984Z - https://business-science.github.io/pytimetk/guides/06_anomalize.html - 2024-11-06T02:15:15.418Z + https://business-science.github.io/pytimetk/guides/02_timetk_concepts.html + 2024-11-06T04:04:24.521Z + + + https://business-science.github.io/pytimetk/guides/01_visualization.html + 2024-11-06T04:04:26.961Z https://business-science.github.io/pytimetk/guides/05_augmenting.html - 2024-11-06T02:15:20.041Z + 2024-11-06T04:04:50.441Z https://business-science.github.io/pytimetk/tutorials/06_correlationfunnel.html - 2024-11-06T02:15:25.125Z + 2024-11-06T04:04:56.039Z https://business-science.github.io/pytimetk/tutorials/03_demand_forecasting.html - 2024-11-06T02:15:29.467Z + 2024-11-06T04:05:00.096Z https://business-science.github.io/pytimetk/tutorials/02_finance.html - 2024-11-06T02:15:41.916Z + 2024-11-06T04:05:12.252Z https://business-science.github.io/pytimetk/changelog-news.html - 2024-11-06T02:15:44.115Z + 2024-11-06T04:05:14.548Z https://business-science.github.io/pytimetk/reference/plot_timeseries.html - 2024-11-06T02:15:46.943Z + 2024-11-06T04:05:17.609Z https://business-science.github.io/pytimetk/reference/augment_diffs.html - 2024-11-06T02:15:49.038Z + 2024-11-06T04:05:19.815Z https://business-science.github.io/pytimetk/reference/time_scale_template.html - 2024-11-06T02:15:50.577Z + 2024-11-06T04:05:21.411Z https://business-science.github.io/pytimetk/reference/pad_by_time.html - 2024-11-06T02:15:53.354Z + 2024-11-06T04:05:24.274Z https://business-science.github.io/pytimetk/reference/plot_anomalies_cleaned.html - 2024-11-06T02:15:55.902Z + 2024-11-06T04:05:26.909Z https://business-science.github.io/pytimetk/reference/plot_anomaly_decomp.html - 2024-11-06T02:15:58.035Z + 2024-11-06T04:05:29.147Z https://business-science.github.io/pytimetk/reference/get_seasonal_frequency.html - 2024-11-06T02:15:59.583Z + 2024-11-06T04:05:30.665Z https://business-science.github.io/pytimetk/reference/plot_correlation_funnel.html - 2024-11-06T02:16:01.332Z + 2024-11-06T04:05:32.468Z https://business-science.github.io/pytimetk/reference/glimpse.html - 2024-11-06T02:16:02.842Z + 2024-11-06T04:05:34.014Z https://business-science.github.io/pytimetk/reference/augment_ppo.html - 2024-11-06T02:16:04.533Z + 2024-11-06T04:05:35.728Z https://business-science.github.io/pytimetk/reference/augment_bbands.html - 2024-11-06T02:16:06.008Z + 2024-11-06T04:05:37.328Z https://business-science.github.io/pytimetk/reference/apply_by_time.html - 2024-11-06T02:16:08.268Z + 2024-11-06T04:05:39.680Z https://business-science.github.io/pytimetk/reference/plot_anomalies_decomp.html - 2024-11-06T02:16:09.793Z + 
2024-11-06T04:05:41.206Z https://business-science.github.io/pytimetk/reference/parallel_apply.html - 2024-11-06T02:16:13.030Z + 2024-11-06T04:05:44.446Z https://business-science.github.io/pytimetk/reference/augment_expanding_apply.html - 2024-11-06T02:16:15.411Z + 2024-11-06T04:05:46.928Z https://business-science.github.io/pytimetk/reference/TimeSeriesCVSplitter.html - 2024-11-06T02:16:39.207Z + 2024-11-06T04:05:48.833Z https://business-science.github.io/pytimetk/reference/week_of_month.html - 2024-11-06T02:16:41.132Z + 2024-11-06T04:05:50.766Z https://business-science.github.io/pytimetk/reference/future_frame.html - 2024-11-06T02:16:43.539Z + 2024-11-06T04:05:53.261Z https://business-science.github.io/pytimetk/reference/get_frequency.html - 2024-11-06T02:16:45.155Z + 2024-11-06T04:05:54.890Z https://business-science.github.io/pytimetk/reference/TimeSeriesCV.html - 2024-11-06T02:16:57.347Z + 2024-11-06T04:05:56.874Z https://business-science.github.io/pytimetk/reference/theme_timetk.html - 2024-11-06T02:16:58.969Z + 2024-11-06T04:05:58.497Z https://business-science.github.io/pytimetk/reference/augment_rsi.html - 2024-11-06T02:17:01.235Z + 2024-11-06T04:06:00.808Z https://business-science.github.io/pytimetk/reference/augment_leads.html - 2024-11-06T02:17:03.796Z + 2024-11-06T04:06:03.533Z https://business-science.github.io/pytimetk/reference/binarize.html - 2024-11-06T02:17:05.527Z + 2024-11-06T04:06:05.313Z https://business-science.github.io/pytimetk/reference/load_dataset.html - 2024-11-06T02:17:06.885Z + 2024-11-06T04:06:06.686Z https://business-science.github.io/pytimetk/reference/get_diff_summary.html - 2024-11-06T02:17:08.489Z + 2024-11-06T04:06:08.293Z https://business-science.github.io/pytimetk/reference/make_weekend_sequence.html - 2024-11-06T02:17:10.120Z + 2024-11-06T04:06:09.972Z https://business-science.github.io/pytimetk/reference/get_holiday_signature.html - 2024-11-06T02:17:12.423Z + 2024-11-06T04:06:12.344Z https://business-science.github.io/pytimetk/reference/augment_atr.html - 2024-11-06T02:17:14.767Z + 2024-11-06T04:06:14.835Z https://business-science.github.io/pytimetk/reference/augment_macd.html - 2024-11-06T02:17:16.615Z + 2024-11-06T04:06:16.717Z https://business-science.github.io/pytimetk/reference/get_timeseries_signature.html - 2024-11-06T02:17:17.999Z + 2024-11-06T04:06:18.108Z https://business-science.github.io/pytimetk/reference/augment_lags.html - 2024-11-06T02:17:20.071Z + 2024-11-06T04:06:20.329Z https://business-science.github.io/pytimetk/reference/augment_holiday_signature.html - 2024-11-06T02:17:22.983Z + 2024-11-06T04:06:23.464Z https://business-science.github.io/pytimetk/reference/anomalize.html - 2024-11-06T02:17:25.624Z + 2024-11-06T04:06:26.246Z diff --git a/docs/_site/tutorials/01_sales_crm.html b/docs/_site/tutorials/01_sales_crm.html index a22436c6..7357a844 100644 --- a/docs/_site/tutorials/01_sales_crm.html +++ b/docs/_site/tutorials/01_sales_crm.html @@ -64,7 +64,7 @@ - + @@ -266,6 +266,12 @@ Anomaly Detection + + @@ -3464,8 +3470,8 @@

5 More Coming Soo