From fe80f108f4ab023fadd8da812c663fdd1a54f6ce Mon Sep 17 00:00:00 2001
From: Matt Dancho
Date: Tue, 5 Nov 2024 21:17:49 -0500
Subject: [PATCH] TimeSeriesCVSplitter: finalize

---
 docs/_sidebar.yml                              |    1 +
 docs/_site/reference/TimeSeriesCV.html         |   15 +-
 .../_site/reference/TimeSeriesCVSplitter.html  | 1326 +++++++++
 docs/_site/reference/index.html                |    4 +
 docs/_site/search.json                         | 2361 +++++++++--------
 docs/_site/sitemap.xml                         |  272 +-
 docs/objects.json                              |    2 +-
 docs/reference/TimeSeriesCV.qmd                |   14 +-
 docs/reference/TimeSeriesCVSplitter.qmd        |  137 +
 docs/reference/index.qmd                       |    1 +
 .../crossvalidation/time_series_cv.py          |   88 +-
 11 files changed, 2846 insertions(+), 1375 deletions(-)
 create mode 100644 docs/_site/reference/TimeSeriesCVSplitter.html
 create mode 100644 docs/reference/TimeSeriesCVSplitter.qmd

diff --git a/docs/_sidebar.yml b/docs/_sidebar.yml
index a17bcf0e..6ede687b 100644
--- a/docs/_sidebar.yml
+++ b/docs/_sidebar.yml
@@ -45,6 +45,7 @@ website:
      section: "\U0001F4CE TS Features"
    - contents:
      - reference/TimeSeriesCV.qmd
+     - reference/TimeSeriesCVSplitter.qmd
      section: "\U0001F4C8 Time Series Cross Validation (TSCV)"
    - contents:
      - reference/augment_macd.qmd
diff --git a/docs/_site/reference/TimeSeriesCV.html b/docs/_site/reference/TimeSeriesCV.html
index 4592b8dc..1d389095 100644
--- a/docs/_site/reference/TimeSeriesCV.html
+++ b/docs/_site/reference/TimeSeriesCV.html
@@ -484,8 +484,15 @@

Parameters

Raises:

-    ValueError: - If frequency is not one of “days”, “seconds”, “microseconds”, “milliseconds”, “minutes”, “hours”, “weeks”. - If window is not one of “rolling” or “expanding”. - If mode is not one of “forward” or “backward” - If train_size, forecast_horizon, gap or stride are not strictly positive.

-    TypeError: If train_size, forecast_horizon, gap or stride are not of type int.

+    ValueError:

+        - If frequency is not one of “days”, “seconds”, “microseconds”, “milliseconds”, “minutes”, “hours”, “weeks”.
+        - If window is not one of “rolling” or “expanding”.
+        - If mode is not one of “forward” or “backward”.
+        - If train_size, forecast_horizon, gap or stride are not strictly positive.

+    TypeError:

+        If train_size, forecast_horizon, gap or stride are not of type int.

Examples:

@@ -672,9 +679,9 @@

Examples:

tscv.plot(y)
[plotly figure output for tscv.plot(y) updated; embedded HTML omitted]

diff --git a/docs/_site/reference/TimeSeriesCVSplitter.html b/docs/_site/reference/TimeSeriesCVSplitter.html
new file mode 100644
--- /dev/null
+++ b/docs/_site/reference/TimeSeriesCVSplitter.html

TimeSeriesCVSplitter


TimeSeriesCVSplitter(self, *, frequency, train_size, forecast_horizon, time_series, gap=0, stride=None, window='rolling', mode='backward', start_dt=None, end_dt=None, split_limit=None)

+

The TimeSeriesCVSplitter is a scikit-learn compatible cross-validator using TimeSeriesCV.

+

This cross-validator generates splits based on time values, making it suitable for time series data.

+
+

Parameters:

+

frequency: str
    The frequency of the time series (e.g., “days”, “hours”).
train_size: int
    Minimum number of time units in the training set.
forecast_horizon: int
    Number of time units to forecast in each split.
time_series: pd.Series
    A pandas Series or Index representing the time values.
gap: int
    Number of time units to skip between training and testing sets.
stride: int
    Number of time units to move forward after each split.
window: str
    Type of window, either “rolling” or “expanding”.
mode: str
    Order of split generation, “forward” or “backward”.
start_dt: pd.Timestamp
    Start date for the time period.
end_dt: pd.Timestamp
    End date for the time period.
split_limit: int
    Maximum number of splits to generate. If None, all possible splits will be generated.
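The optional start_dt, end_dt, and split_limit arguments restrict the time period and the number of folds that are generated. A minimal sketch, assuming the same 31-day daily series used in the Examples below (the variable name cv_limited is illustrative only):

import pandas as pd
from pytimetk import TimeSeriesCVSplitter

# Same 31-day daily series as in the Examples below
time_series = pd.Series(pd.date_range("2023-01-01", "2023-01-31", freq="D"))

# Only consider the second half of January and cap the number of folds at 3
cv_limited = TimeSeriesCVSplitter(
    time_series=time_series,
    frequency="days",
    train_size=7,
    forecast_horizon=3,
    stride=1,
    window="expanding",
    start_dt=pd.Timestamp("2023-01-15"),
    end_dt=pd.Timestamp("2023-01-31"),
    split_limit=3,
)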

+
+
+

Raises:

+

ValueError: If the input arrays are incompatible in length with the time series.

+
+
+

Returns:

+

A generator of tuples of arrays containing the training and forecast data.

+
+
+

See Also:

+

TimeSeriesCV

+
+
+

Examples

+
+
import pandas as pd
+import numpy as np
+
+from pytimetk import TimeSeriesCVSplitter
+
+start_dt = pd.Timestamp(2023, 1, 1)
+end_dt = pd.Timestamp(2023, 1, 31)
+
+time_series = pd.Series(pd.date_range(start_dt, end_dt, freq="D"))
+size = len(time_series)
+
+df = pd.DataFrame(data=np.random.randn(size, 2), columns=["a", "b"])
+
+X, y = df[["a", "b"]], df[["a", "b"]].sum(axis=1)
+
+cv = TimeSeriesCVSplitter(
+    time_series=time_series,
+    frequency="days",
+    train_size=14,
+    forecast_horizon=7,
+    gap=0,
+    stride=1,
+    window="rolling",
+)
+
+cv
+
+
TimeSeriesCVSplitter(end_dt=None, forecast_horizon=None, frequency=None,
+           gap=None, mode=None, split_limit=None, start_dt=None,
+           stride=None, time_series=None, train_size=None, window=None)
+
+
+
+
+# Inspect the cross-validation splits
+cv.splitter.plot(y, time_series = time_series)
+
[plotly figure output; embedded HTML omitted]
+
+
+
+
# Using the TimeSeriesCVSplitter in a scikit-learn CV model
+
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import RandomizedSearchCV
+
+# Fit and get best estimator
+param_grid = {
+    "alpha": np.linspace(0.1, 2, 10),
+    "fit_intercept": [True, False],
+    "positive": [True, False],
+}
+
+random_search_cv = RandomizedSearchCV(
+    estimator=Ridge(),
+    param_distributions=param_grid,
+    cv=cv,
+    n_jobs=-1,
+).fit(X, y)
+
+random_search_cv.best_estimator_
+
+
Ridge(alpha=np.float64(0.1), fit_intercept=False)
+
+
+
+
+

Methods

Name            Description
get_n_splits    Returns the number of splits.
split           Generates train and test indices for cross-validation.
+
+

get_n_splits

+

TimeSeriesCVSplitter.get_n_splits(X=None, y=None, groups=None)

+

Returns the number of splits.
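A minimal, self-contained sketch that reuses the construction from the Examples above to check the fold count before handing the splitter to scikit-learn (X, y, and groups are accepted only for API compatibility and are ignored):

import pandas as pd
from pytimetk import TimeSeriesCVSplitter

time_series = pd.Series(pd.date_range("2023-01-01", "2023-01-31", freq="D"))

cv = TimeSeriesCVSplitter(
    time_series=time_series,
    frequency="days",
    train_size=14,
    forecast_horizon=7,
    gap=0,
    stride=1,
    window="rolling",
)

# The arguments are ignored; the count is derived from the time series and split settings
print(cv.get_n_splits())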

+
+
+

split

+

TimeSeriesCVSplitter.split(X=None, y=None, groups=None)

+

Generates train and test indices for cross-validation.

+
+

Parameters:

+

X: Optional input features (ignored, for compatibility with scikit-learn).
y: Optional target variable (ignored, for compatibility with scikit-learn).
groups: Optional group labels (ignored, for compatibility with scikit-learn).

+
+
+

Yields:

+

Tuple[np.ndarray, np.ndarray]: Tuples of train and test indices.
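A rough sketch of consuming the yielded index arrays directly, outside of scikit-learn's search utilities. It assumes the cv, X, and y objects from the Examples above and that the yielded arrays are positional indices into the original rows:

# Each iteration yields one (train_indices, test_indices) pair
for train_idx, test_idx in cv.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    # Fit and evaluate a model on each fold here
    print(len(train_idx), len(test_idx))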

\ No newline at end of file
diff --git a/docs/_site/reference/index.html b/docs/_site/reference/index.html
index 26c16bda..600cdb05 100644
--- a/docs/_site/reference/index.html
+++ b/docs/_site/reference/index.html
@@ -526,6 +526,10 @@

📈 Tim
 TimeSeriesCV
 TimeSeriesCV is a subclass of TimeBasedSplit with default mode set to ‘backward’
+
+TimeSeriesCVSplitter
+The TimeSeriesCVSplitter is a scikit-learn compatible cross-validator using TimeSeriesCV.
+

diff --git a/docs/_site/search.json b/docs/_site/search.json
index d0cd8ca9..83e50c12 100644
--- a/docs/_site/search.json
+++ b/docs/_site/search.json
@@ -588,900 +588,760 @@
+ "text": "correlate(data, target, method='pearson')\nThe correlate function calculates the correlation between a target variable and all other variables in a pandas DataFrame, and returns the results sorted by absolute correlation in descending order." }, { - "objectID": "reference/augment_expanding_apply.html#parameters", - "href": "reference/augment_expanding_apply.html#parameters", - "title": "augment_expanding_apply", + "objectID": "reference/correlate.html#parameters", + "href": "reference/correlate.html#parameters", + "title": "correlate", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nInput data to be processed. Can be a Pandas DataFrame or a GroupBy object.\nrequired\n\n\ndate_column\nstr\nName of the datetime column. Data is sorted by this column within each group.\nrequired\n\n\nwindow_func\nUnion[Tuple[str, Callable], List[Tuple[str, Callable]]]\nThe window_func parameter in the augment_expanding_apply function specifies the function(s) that operate on a expanding window with the consideration of multiple columns. The specification can be: - A tuple where the first element is a string representing the function’s name and the second element is the callable function itself. - A list of such tuples for multiple functions. Note: For functions targeting only a single value column without the need for contextual data from other columns, consider using the augment_expanding function in this library.\nrequired\n\n\nmin_periods\nint\nMinimum observations in the window to have a value. Defaults to the window size. If set, a value will be produced even if fewer observations are present than the window size.\nNone\n\n\nthreads\nint\nNumber of threads to use for parallel processing. If threads is set to 1, parallel processing will be disabled. Set to -1 to use all available CPU cores.\n1\n\n\nshow_progress\nbool\nIf True, a progress bar will be displayed during parallel processing.\nTrue\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is True.\nFalse" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe data parameter is the input data that you want to calculate correlations for. It can be either a pandas DataFrame or a grouped DataFrame obtained from a groupby operation.\nrequired\n\n\ntarget\nstr\nThe target parameter is a string that represents the column name in the DataFrame for which you want to calculate the correlation with other columns.\nrequired\n\n\nmethod\nstr\nThe method parameter in the correlate function is used to specify the method for calculating the correlation coefficient. 
The available options for the method parameter are: * pearson : standard correlation coefficient * kendall : Kendall Tau correlation coefficient * spearman : Spearman rank correlation\n= 'pearson'" }, { - "objectID": "reference/augment_expanding_apply.html#returns", - "href": "reference/augment_expanding_apply.html#returns", - "title": "augment_expanding_apply", + "objectID": "reference/correlate.html#returns", + "href": "reference/correlate.html#returns", + "title": "correlate", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe augment_expanding function returns a DataFrame with new columns for each applied function, window size, and value column." + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function correlate returns a DataFrame with two columns: ‘feature’ and ‘correlation’. The\n‘feature’ column contains the names of the features in the input data, and the ‘correlation’ column contains the correlation coefficients between each feature and the target variable. The DataFrame is sorted in descending order based on the absolute correlation values." }, { - "objectID": "reference/augment_expanding_apply.html#examples", - "href": "reference/augment_expanding_apply.html#examples", - "title": "augment_expanding_apply", + "objectID": "reference/correlate.html#see-also", + "href": "reference/correlate.html#see-also", + "title": "correlate", + "section": "See Also", + "text": "See Also\n\nbinarize() : Prepares data for correlate, which is used for analyzing correlationfunnel plots." + }, + { + "objectID": "reference/correlate.html#examples", + "href": "reference/correlate.html#examples", + "title": "correlate", "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n\n# Example showcasing the expanding correlation between two columns (`value1` and \n# `value2`).\n# The correlation requires both columns as input.\n \n# Sample DataFrame with id, date, value1, and value2 columns.\ndf = pd.DataFrame({\n 'id': [1, 1, 1, 2, 2, 2],\n 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),\n 'value1': [10, 20, 29, 42, 53, 59],\n 'value2': [2, 16, 20, 40, 41, 50],\n})\n \n# Compute the expanding correlation for each group of 'id'\nexpanding_df = (\n df.groupby('id')\n .augment_expanding_apply(\n date_column='date',\n window_func=[('corr', lambda x: x['value1'].corr(x['value2']))], # Lambda function for correlation\n threads = 1, # Disable parallel processing\n )\n)\ndisplay(expanding_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue1\nvalue2\nexpanding_corr\n\n\n\n\n0\n1\n2023-01-01\n10\n2\nNaN\n\n\n1\n1\n2023-01-02\n20\n16\n1.000000\n\n\n2\n1\n2023-01-03\n29\n20\n0.961054\n\n\n3\n2\n2023-01-04\n42\n40\nNaN\n\n\n4\n2\n2023-01-05\n53\n41\n1.000000\n\n\n5\n2\n2023-01-06\n59\n50\n0.824831\n\n\n\n\n\n\n\n\n# expanding Regression Example: Using `value1` as the dependent variable and \n# `value2` and `value3` as the independent variables.\n# This example demonstrates how to perform a expanding regression using two \n# independent variables.\n\n# Sample DataFrame with `id`, `date`, `value1`, `value2`, and `value3` columns.\ndf = pd.DataFrame({\n 'id': [1, 1, 1, 2, 2, 2],\n 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),\n 'value1': [10, 20, 29, 42, 53, 59],\n 'value2': [5, 16, 24, 35, 45, 58],\n 'value3': [2, 3, 6, 9, 10, 13]\n})\n \n# Define Regression Function to be applied on 
the expanding window.\ndef regression(df):\n \n # Required module (scikit-learn) for regression.\n from sklearn.linear_model import LinearRegression\n \n model = LinearRegression()\n X = df[['value2', 'value3']] # Independent variables\n y = df['value1'] # Dependent variable\n model.fit(X, y)\n ret = pd.Series([model.intercept_, model.coef_[0]], index=['Intercept', 'Slope'])\n \n return ret # Return intercept and slope as a Series\n \n# Compute the expanding regression for each group of `id`\nresult_df = (\n df.groupby('id')\n .augment_expanding_apply(\n date_column='date',\n window_func=[('regression', regression)],\n threads = 1\n )\n .dropna()\n)\n\n# Format the results to have each regression output (slope and intercept) in \n# separate columns.\nregression_wide_df = pd.concat(result_df['expanding_regression'].to_list(), axis=1).T\nregression_wide_df = pd.concat([result_df.reset_index(drop = True), regression_wide_df], axis=1)\ndisplay(regression_wide_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue1\nvalue2\nvalue3\nexpanding_regression\nIntercept\nSlope\n\n\n\n\n0\n1\n2023-01-01\n10\n5\n2\nIntercept 10.0 Slope 0.0 dtype: flo...\n10.000000\n0.000000\n\n\n1\n1\n2023-01-02\n20\n16\n3\nIntercept 5.327869 Slope 0.901639 dt...\n5.327869\n0.901639\n\n\n2\n1\n2023-01-03\n29\n24\n6\nIntercept 4.28 Slope 0.84 dtype: flo...\n4.280000\n0.840000\n\n\n3\n2\n2023-01-04\n42\n35\n9\nIntercept 42.0 Slope 0.0 dtype: flo...\n42.000000\n0.000000\n\n\n4\n2\n2023-01-05\n53\n45\n10\nIntercept 2.900990 Slope 1.089109 dt...\n2.900990\n1.089109\n\n\n5\n2\n2023-01-06\n59\n58\n13\nIntercept 30.352941 Slope 1.588235 ...\n30.352941\n1.588235" + "text": "Examples\n\n# NON-TIMESERIES EXAMPLE ----\n\nimport pandas as pd\nimport numpy as np\nimport pytimetk as tk\n\n# Set a random seed for reproducibility\nnp.random.seed(0)\n\n# Define the number of rows for your DataFrame\nnum_rows = 200\n\n# Create fake data for the columns\ndata = {\n 'Age': np.random.randint(18, 65, size=num_rows),\n 'Gender': np.random.choice(['Male', 'Female'], size=num_rows),\n 'Marital_Status': np.random.choice(['Single', 'Married', 'Divorced'], size=num_rows),\n 'City': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'], size=num_rows),\n 'Years_Playing': np.random.randint(0, 30, size=num_rows),\n 'Average_Income': np.random.randint(20000, 100000, size=num_rows),\n 'Member_Status': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], size=num_rows),\n 'Number_Children': np.random.randint(0, 5, size=num_rows),\n 'Own_House_Flag': np.random.choice([True, False], size=num_rows),\n 'Own_Car_Count': np.random.randint(0, 3, size=num_rows),\n 'PersonId': range(1, num_rows + 1), # Add a PersonId column as a row count\n 'Client': np.random.choice(['A', 'B'], size=num_rows) # Add a Client column with random values 'A' or 'B'\n}\n\n# Create a DataFrame\ndf = pd.DataFrame(data)\n\n# Binarize the data\ndf_binarized = df.binarize(n_bins=4, thresh_infreq=0.01, name_infreq=\"-OTHER\", one_hot=True)\n\ndf_binarized.glimpse() \n\n<class 'pandas.core.frame.DataFrame'>: 200 rows of 42 columns\nAge__18.0_29.0: uint8 [0, 1, 1, 1, 0, 1, 0 ...\nAge__29.0_39.0: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nAge__39.0_53.0: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nAge__53.0_64.0: uint8 [1, 0, 0, 0, 1, 0, 0 ...\nYears_Playing__0.0_7.0: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nYears_Playing__7.0_15.0: uint8 [0, 0, 1, 0, 1, 0, 1 ...\nYears_Playing__15.0_22.0: uint8 [1, 0, 0, 0, 0, 1, 0 ...\nYears_Playing__22.0_29.0: uint8 [0, 0, 0, 1, 0, 0, 0 
...\nAverage_Income__20131.0_40110.2: uint8 [0, 0, 1, 0, 0, 0, 0 ...\nAverage_Income__40110.2_60649.5: uint8 [0, 0, 0, 1, 1, 0, 1 ...\nAverage_Income__60649.5_79904.8: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nAverage_Income__79904.8_99131.0: uint8 [1, 0, 0, 0, 0, 1, 0 ...\nPersonId__1.0_50.8: uint8 [1, 1, 1, 1, 1, 1, 1 ...\nPersonId__50.8_100.5: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nPersonId__100.5_150.2: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nPersonId__150.2_200.0: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nGender__Female: uint8 [1, 0, 0, 0, 1, 0, 1 ...\nGender__Male: uint8 [0, 1, 1, 1, 0, 1, 0 ...\nMarital_Status__Divorced: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nMarital_Status__Married: uint8 [1, 1, 0, 0, 1, 0, 0 ...\nMarital_Status__Single: uint8 [0, 0, 1, 1, 0, 1, 1 ...\nCity__Chicago: uint8 [0, 0, 1, 0, 0, 1, 0 ...\nCity__Houston: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nCity__Los Angeles: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nCity__Miami: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nCity__New York: uint8 [1, 0, 0, 1, 1, 0, 0 ...\nMember_Status__Bronze: uint8 [1, 0, 1, 0, 0, 0, 0 ...\nMember_Status__Gold: uint8 [0, 0, 0, 0, 0, 1, 1 ...\nMember_Status__Platinum: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nMember_Status__Silver: uint8 [0, 1, 0, 0, 1, 0, 0 ...\nNumber_Children__0: uint8 [0, 0, 1, 0, 0, 0, 0 ...\nNumber_Children__1: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nNumber_Children__2: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nNumber_Children__3: uint8 [0, 1, 0, 0, 0, 1, 0 ...\nNumber_Children__4: uint8 [1, 0, 0, 0, 1, 0, 0 ...\nOwn_House_Flag__0: uint8 [1, 1, 0, 0, 1, 0, 1 ...\nOwn_House_Flag__1: uint8 [0, 0, 1, 1, 0, 1, 0 ...\nOwn_Car_Count__0: uint8 [0, 1, 0, 0, 1, 0, 0 ...\nOwn_Car_Count__1: uint8 [0, 0, 0, 1, 0, 1, 1 ...\nOwn_Car_Count__2: uint8 [1, 0, 1, 0, 0, 0, 0 ...\nClient__A: uint8 [1, 1, 1, 1, 1, 1, 1 ...\nClient__B: uint8 [0, 0, 0, 0, 0, 0, 0 ...\n\n\n\ndf_correlated = df_binarized.correlate(target='Member_Status__Platinum')\ndf_correlated\n\n\n\n\n\n\n\n\nfeature\nbin\ncorrelation\n\n\n\n\n28\nMember_Status\nPlatinum\n1.000000\n\n\n26\nMember_Status\nBronze\n-0.341351\n\n\n29\nMember_Status\nSilver\n-0.332799\n\n\n27\nMember_Status\nGold\n-0.298637\n\n\n30\nNumber_Children\n0\n0.205230\n\n\n8\nAverage_Income\n20131.0_40110.2\n-0.156593\n\n\n0\nAge\n18.0_29.0\n-0.135522\n\n\n11\nAverage_Income\n79904.8_99131.0\n0.115743\n\n\n33\nNumber_Children\n3\n-0.112216\n\n\n7\nYears_Playing\n22.0_29.0\n-0.106763\n\n\n19\nMarital_Status\nMarried\n-0.104562\n\n\n41\nClient\nB\n0.103842\n\n\n40\nClient\nA\n-0.103842\n\n\n9\nAverage_Income\n40110.2_60649.5\n0.088509\n\n\n12\nPersonId\n1.0_50.8\n0.088509\n\n\n38\nOwn_Car_Count\n1\n0.087769\n\n\n22\nCity\nHouston\n0.086124\n\n\n13\nPersonId\n50.8_100.5\n-0.074892\n\n\n2\nAge\n39.0_53.0\n0.074739\n\n\n39\nOwn_Car_Count\n2\n-0.071738\n\n\n31\nNumber_Children\n1\n-0.069054\n\n\n25\nCity\nNew York\n-0.055757\n\n\n18\nMarital_Status\nDivorced\n0.055724\n\n\n1\nAge\n29.0_39.0\n0.054374\n\n\n20\nMarital_Status\nSingle\n0.050286\n\n\n34\nNumber_Children\n4\n-0.047760\n\n\n15\nPersonId\n150.2_200.0\n-0.047659\n\n\n10\nAverage_Income\n60649.5_79904.8\n-0.047659\n\n\n5\nYears_Playing\n7.0_15.0\n0.040717\n\n\n14\nPersonId\n100.5_150.2\n0.034042\n\n\n6\nYears_Playing\n15.0_22.0\n0.034042\n\n\n21\nCity\nChicago\n-0.032799\n\n\n4\nYears_Playing\n0.0_7.0\n0.028391\n\n\n16\nGender\nFemale\n0.020215\n\n\n17\nGender\nMale\n-0.020215\n\n\n35\nOwn_House_Flag\n0\n0.017336\n\n\n36\nOwn_House_Flag\n1\n-0.017336\n\n\n37\nOwn_Car_Count\n0\n-0.016373\n\n\n3\nAge\n53.0_64.0\n0.012002\n\n\n24\nCity\nMiami\n0.010662\n\n\n23\nCity\nLos 
Angeles\n-0.004911\n\n\n32\nNumber_Children\n2\n0.002104\n\n\n\n\n\n\n\n\n# Interactive\ndf_correlated.plot_correlation_funnel(\n engine='plotly', \n height=400\n)\n\n\n \n\n\n\n# Static\nfig = df_correlated.plot_correlation_funnel(\n engine ='plotnine', \n height = 600\n)\nfig\n\n\n\n\n<Figure Size: (700 x 600)>" }, { - "objectID": "reference/parallel_apply.html", - "href": "reference/parallel_apply.html", - "title": "parallel_apply", + "objectID": "reference/filter_by_time.html", + "href": "reference/filter_by_time.html", + "title": "filter_by_time", "section": "", - "text": "parallel_apply(data, func, show_progress=True, threads=None, desc='Processing...', **kwargs)\nThe parallel_apply function parallelizes the application of a function on grouped dataframes using concurrent.futures." + "text": "filter_by_time(data, date_column, start_date='start', end_date='end', engine='pandas')\nFilters a DataFrame or GroupBy object based on a specified date range.\nThis function filters data in a pandas DataFrame or a pandas GroupBy object by a given date range. It supports various date formats and can handle both DataFrame and GroupBy objects." }, { - "objectID": "reference/parallel_apply.html#parameters", - "href": "reference/parallel_apply.html#parameters", - "title": "parallel_apply", + "objectID": "reference/filter_by_time.html#parameters", + "href": "reference/filter_by_time.html#parameters", + "title": "filter_by_time", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.core.groupby.generic.DataFrameGroupBy\nThe data parameter is a Pandas DataFrameGroupBy object, which is the result of grouping a DataFrame by one or more columns. It represents the grouped data that you want to apply the function to.\nrequired\n\n\nfunc\nCallable\nThe func parameter is the function that you want to apply to each group in the grouped dataframe. This function should take a single argument, which is a dataframe representing a group, and return a result. The result can be a scalar value, a pandas Series, or a pandas DataFrame.\nrequired\n\n\nshow_progress\nbool\nA boolean parameter that determines whether to display progress using tqdm. If set to True, progress will be displayed. If set to False, progress will not be displayed.\nTrue\n\n\nthreads\nint\nThe threads parameter specifies the number of threads to use for parallel processing. If threads is set to None, it will use all available processors. If threads is set to -1, it will use all available processors as well.\nNone\n\n\n**kwargs\n\nThe **kwargs parameter is a dictionary of keyword arguments that are passed to the func function.\n{}" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nThe data to be filtered. It can be a pandas DataFrame or a pandas GroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in data that contains date information. This column is used for filtering the data based on the date range.\nrequired\n\n\nstart_date\nstr\nThe start date of the filtering range. The format of the date can be YYYY, YYYY-MM, YYYY-MM-DD, YYYY-MM-DD HH, YYYY-MM-DD HH:SS, or YYYY-MM-DD HH:MM:SS. Default: ‘start’, which will filter from the earliest date in the data.\n'start'\n\n\nend_date\nstr\nThe end date of the filtering range. It supports the same formats as start_date. 
Default: ‘end’, which will filter until the latest date in the data.\n'end'\n\n\nengine\nstr\nThe engine to be used for filtering the data. Currently, only ‘pandas’.\n= 'pandas'" }, { - "objectID": "reference/parallel_apply.html#returns", - "href": "reference/parallel_apply.html#returns", - "title": "parallel_apply", + "objectID": "reference/filter_by_time.html#returns", + "href": "reference/filter_by_time.html#returns", + "title": "filter_by_time", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe parallel_apply function returns a combined result after applying the specified function on all groups in the grouped dataframe. The result can be a pandas DataFrame or a pandas Series, depending on the function applied." - }, - { - "objectID": "reference/parallel_apply.html#examples", - "href": "reference/parallel_apply.html#examples", - "title": "parallel_apply", - "section": "Examples:", - "text": "Examples:\n\n# Example 1 - Single argument returns Series\n\nimport pytimetk as tk\nimport pandas as pd \n\ndf = pd.DataFrame({\n 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar'],\n 'B': [1, 2, 3, 4, 5, 6]\n})\n\ngrouped = df.groupby('A')\n\nresult = grouped.apply(lambda df: df['B'].sum())\nresult\n\nresult = tk.parallel_apply(grouped, lambda df: df['B'].sum(), show_progress=True, threads=2)\nresult\n\n\n\n\nA\nbar 12\nfoo 9\ndtype: int64\n\n\n\n# Example 2 - Multiple arguments returns MultiIndex DataFrame\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = pd.DataFrame({\n 'A': ['foo', 'foo', 'bar', 'bar', 'foo', 'bar', 'foo', 'foo'],\n 'B': ['one', 'one', 'one', 'two', 'two', 'two', 'one', 'two'],\n 'C': [1, 3, 5, 7, 9, 2, 4, 6]\n})\n\ndef calculate(group):\n return pd.DataFrame({\n 'sum': [group['C'].sum()],\n 'mean': [group['C'].mean()]\n })\n\ngrouped = df.groupby(['A', 'B'])\n\nresult = grouped.apply(calculate)\nresult\n\nresult = tk.parallel_apply(grouped, calculate, show_progress=True)\nresult\n\n\n\n\n\n\n\n\n\n\n\n\n\nsum\nmean\n\n\nA\nB\n\n\n\n\n\n\n\nbar\none\n0\n5\n5.000000\n\n\ntwo\n0\n9\n4.500000\n\n\nfoo\none\n0\n8\n2.666667\n\n\ntwo\n0\n15\n7.500000\n\n\n\n\n\n\n\n\n# Example 3 - Multiple arguments returns MultiIndex DataFrame\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = pd.DataFrame({\n 'A': ['foo', 'foo', 'bar', 'bar', 'foo', 'bar', 'foo', 'foo'],\n 'B': ['one', 'one', 'one', 'two', 'two', 'two', 'one', 'two'],\n 'C': [1, 3, 5, 7, 9, 2, 4, 6]\n})\n\ndef calculate(group):\n return group.head(2)\n\ngrouped = df.groupby(['A', 'B'])\n\nresult = grouped.apply(calculate)\nresult\n\nresult = tk.parallel_apply(grouped, calculate, show_progress=True)\nresult\n\n\n\n\n\n\n\n\n\n\n\n\n\nA\nB\nC\n\n\nA\nB\n\n\n\n\n\n\n\n\nbar\none\n2\nbar\none\n5\n\n\ntwo\n3\nbar\ntwo\n7\n\n\n5\nbar\ntwo\n2\n\n\nfoo\none\n0\nfoo\none\n1\n\n\n1\nfoo\none\n3\n\n\ntwo\n4\nfoo\ntwo\n9\n\n\n7\nfoo\ntwo\n6\n\n\n\n\n\n\n\n\n# Example 4 - Single Grouping Column Returns DataFrame\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = pd.DataFrame({\n 'A': ['foo', 'foo', 'bar', 'bar', 'foo', 'bar', 'foo', 'foo'],\n 'B': [1, 3, 5, 7, 9, 2, 4, 6]\n})\n\ndef calculate(group):\n return pd.DataFrame({\n 'sum': [group['B'].sum()],\n 'mean': [group['B'].mean()]\n })\n\ngrouped = df.groupby(['A'])\n\nresult = grouped.apply(calculate)\nresult\n\nresult = tk.parallel_apply(grouped, calculate, show_progress=True)\nresult\n\n\n\n\n\n\n\n\n\n\n\n\nsum\nmean\n\n\nA\n\n\n\n\n\n\n\nbar\n0\n14\n4.666667\n\n\nfoo\n0\n23\n4.600000" - }, - { - "objectID": 
"reference/plot_anomalies_decomp.html", - "href": "reference/plot_anomalies_decomp.html", - "title": "plot_anomalies_decomp", - "section": "", - "text": "plot_anomalies_decomp(data, date_column, line_color='#2c3e50', line_size=None, line_type='solid', line_alpha=1.0, y_intercept=None, y_intercept_color='#2c3e50', x_intercept=None, x_intercept_color='#2c3e50', title='Anomaly Decomposition Plot', x_lab='', y_lab='', x_axis_date_labels='%b %Y', base_size=11, width=None, height=None, engine='plotly')\nThe plot_anomalies_decomp function takes in data from the anomalize() function, and returns a plot of the anomaly decomposition." - }, - { - "objectID": "reference/plot_anomalies_decomp.html#parameters", - "href": "reference/plot_anomalies_decomp.html#parameters", - "title": "plot_anomalies_decomp", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe input data for the plot from anomalize. It can be either a pandas DataFrame or a pandas DataFrameGroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data that contains the dates.\nrequired\n\n\nline_color\nstr\nThe color of the line in the plot. It is specified as a hexadecimal color code. The default value is “#2c3e50”.\n'#2c3e50'\n\n\nline_size\nOptional[float]\nThe line_size parameter determines the thickness of the lines in the plot. It is an optional parameter, so if you don’t specify a value, the default line size will be used.\nNone\n\n\nline_type\nstr\nThe line_type parameter specifies the type of line to be used in the plot. It can take the following values: - “solid” (default): a solid line - “dashed”: a dashed line\n'solid'\n\n\nline_alpha\nfloat\nThe line_alpha parameter controls the transparency of the lines in the plot. It accepts a float value between 0 and 1, where 0 means completely transparent and 1 means completely opaque.\n1.0\n\n\ny_intercept\nOptional[float]\nThe y_intercept parameter is an optional float value that specifies the y-coordinate of a horizontal line to be plotted on the graph. This line can be used to indicate a specific threshold or reference value. If not specified, no horizontal line will be plotted.\nNone\n\n\ny_intercept_color\nstr\nThe y_intercept_color parameter is used to specify the color of the y-intercept line on the plot. By default, it is set to \"#2c3e50\", which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\nx_intercept\nOptional[str]\nThe x_intercept parameter is used to specify the value on the x-axis where you want to draw a vertical line. This can be useful for highlighting a specific point or event in the data.\nNone\n\n\nx_intercept_color\nstr\nThe x_intercept_color parameter is used to specify the color of the vertical line representing the x-intercept on the plot. By default, it is set to “#2c3e50”, which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\ntitle\nstr\nThe title of the plot. It is set to “Anomaly Decomposition Plot” by default.\n'Anomaly Decomposition Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis of the plot. It is a string that represents the label text.\n''\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis of the plot. 
It is a string that represents the label text.\n''\n\n\nx_axis_date_labels\nstr\nThe x_axis_date_labels parameter is used to specify the format of the date labels on the x-axis of the plot. It accepts a string representing the format of the date labels. For example, “%b %Y” would display the month abbreviation and year (e.g., Jan 2019).\n'%b %Y'\n\n\nbase_size\nfloat\nThe base_size parameter determines the base font size for the plot. It is used to control the size of the text elements in the plot, such as axis labels, titles, and tick labels. The default value is 11, but you can adjust it to make the text larger or smaller\n11\n\n\nwidth\nOptional[int]\nThe width parameter determines the width of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with the default width.\nNone\n\n\nheight\nOptional[int]\nThe height parameter determines the height of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with a default height.\nNone\n\n\nengine\nstr\nThe engine parameter specifies the plotting engine to use. It can be set to either “plotly”, “plotnine”, or “matplotlib”.\n'plotly'" + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nA pandas DataFrame containing the filtered data within the specified date range." }, { - "objectID": "reference/plot_anomalies_decomp.html#returns", - "href": "reference/plot_anomalies_decomp.html#returns", - "title": "plot_anomalies_decomp", - "section": "Returns", - "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\nA plotly, plotnine, or matplotlib plot." + "objectID": "reference/filter_by_time.html#raises", + "href": "reference/filter_by_time.html#raises", + "title": "filter_by_time", + "section": "Raises", + "text": "Raises\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf the provided date strings do not match any of the supported formats." }, { - "objectID": "reference/plot_anomalies_decomp.html#see-also", - "href": "reference/plot_anomalies_decomp.html#see-also", - "title": "plot_anomalies_decomp", - "section": "See Also", - "text": "See Also\n\nanomalize : Function that calculates the anomalies and formats the data for visualization.\nplot_anomalies : Function that plots the anomalies." + "objectID": "reference/filter_by_time.html#notes", + "href": "reference/filter_by_time.html#notes", + "title": "filter_by_time", + "section": "Notes", + "text": "Notes\n\nThe function uses pd.to_datetime to convert the start date (e.g. start_date = “2014” becomes “2014-01-01”).\nThe function internally uses the parse_end_date function to convert the end dates (e.g. end_date = “2014” becomes “2014-12-31”)." 
}, { - "objectID": "reference/plot_anomalies_decomp.html#examples", - "href": "reference/plot_anomalies_decomp.html#examples", - "title": "plot_anomalies_decomp", + "objectID": "reference/filter_by_time.html#examples", + "href": "reference/filter_by_time.html#examples", + "title": "filter_by_time", "section": "Examples", - "text": "Examples\n\n# EXAMPLE 1: SINGLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Create a date range\ndate_rng = pd.date_range(start='2021-01-01', end='2024-01-01', freq='MS')\n\n# Generate some random data with a few outliers\nnp.random.seed(42)\ndata = np.random.randn(len(date_rng)) * 10 + 25 \ndata[3] = 100 # outlier\n\n# Create a DataFrame\ndf = pd.DataFrame(date_rng, columns=['date'])\ndf['value'] = data\n\n# Anomalize the data\nanomalize_df = tk.anomalize(\n df, \"date\", \"value\",\n method = \"twitter\", \n iqr_alpha = 0.10, \n clean_alpha = 0.75,\n clean = \"min_max\",\n verbose = True,\n)\n\n# Visualize the results, plotly\nanomalize_df.plot_anomalies_decomp(\"date\", engine = 'plotly')\n\nUsing seasonal frequency of 12 observations\nUsing trend frequency of 37 observations\n\n\n\n \n\n\n\n# Visualize the results, plotnine\nanomalize_df.plot_anomalies_decomp(\"date\", engine = \"plotnine\")\n\n\n\n\n<Figure Size: (700 x 500)>\n\n\n\n# EXAMPLE 2: MULTIPLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset(\"walmart_sales_weekly\", parse_dates=[\"Date\"])[[\"id\", \"Date\", \"Weekly_Sales\"]]\n\nanomalize_df = (\n df\n .groupby('id') \n .anomalize(\n \"Date\", \"Weekly_Sales\", \n period = 52, \n trend = 52, \n threads = 1\n ) \n)\n\n# Visualize the decomposition results, plotly\n(\n anomalize_df\n .groupby(\"id\")\n .plot_anomalies_decomp(\n date_column = \"Date\",\n line_color = \"steelblue\",\n width = 1200,\n height = 800,\n x_axis_date_labels = \"%y\",\n engine = 'plotly', \n )\n)\n\n\n\n\n\n \n\n\n\n# Visualize the decomposition results, plotnine\n\n(\n anomalize_df\n .groupby(\"id\")\n .plot_anomalies_decomp(\n date_column = \"Date\",\n line_color = \"steelblue\",\n width = 1200,\n height = 800,\n x_axis_date_labels = \"%y\",\n engine = 'plotnine', \n )\n)\n\n\n\n\n<Figure Size: (1200 x 800)>" + "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\nimport datetime\n\nm4_daily_df = tk.datasets.load_dataset('m4_daily', parse_dates = ['date'])\n\n\n# Example 1 - Filter by date\n\ndf_filtered = tk.filter_by_time(\n data = m4_daily_df,\n date_column = 'date',\n start_date = '2014-07-03',\n end_date = '2014-07-10'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n\n\n1\nD10\n2014-07-04\n2073.4\n\n\n2\nD10\n2014-07-05\n2048.7\n\n\n3\nD10\n2014-07-06\n2048.9\n\n\n4\nD10\n2014-07-07\n2006.4\n\n\n5\nD10\n2014-07-08\n2017.6\n\n\n6\nD10\n2014-07-09\n2019.1\n\n\n7\nD10\n2014-07-10\n2007.4\n\n\n\n\n\n\n\n\n# Example 2 - Filter by month.\n# Note: This will filter by the first day of the month.\n\ndf_filtered = tk.filter_by_time(\n data = m4_daily_df,\n date_column = 'date',\n start_date = '2014-07',\n end_date = '2014-09'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n\n\n1\nD10\n2014-07-04\n2073.4\n\n\n2\nD10\n2014-07-05\n2048.7\n\n\n3\nD10\n2014-07-06\n2048.9\n\n\n4\nD10\n2014-07-07\n2006.4\n\n\n...\n...\n...\n...\n\n\n85\nD10\n2014-09-26\n1987.9\n\n\n86\nD10\n2014-09-27\n1999.8\n\n\n87\nD10\n2014-09-28\n2000.2\n\n\n88\nD10\n2014-09-29\n1996.4\n\n\n89\nD10\n2014-09-30\n2023.5\n\n\n\n\n90 rows × 3 
columns\n\n\n\n\n# Example 3 - Filter by year.\n# Note: This will filter by the first day of the year.\n\ndf_filtered = tk.filter_by_time(\n data = m4_daily_df,\n date_column = 'date',\n start_date = '2014',\n end_date = '2014'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n\n\n1\nD10\n2014-07-04\n2073.4\n\n\n2\nD10\n2014-07-05\n2048.7\n\n\n3\nD10\n2014-07-06\n2048.9\n\n\n4\nD10\n2014-07-07\n2006.4\n\n\n...\n...\n...\n...\n\n\n177\nD10\n2014-12-27\n2270.1\n\n\n178\nD10\n2014-12-28\n2322.0\n\n\n179\nD10\n2014-12-29\n2327.3\n\n\n180\nD10\n2014-12-30\n2344.9\n\n\n181\nD10\n2014-12-31\n2327.8\n\n\n\n\n182 rows × 3 columns\n\n\n\n\n# Example 4 - Filter by day/hour/minute/second\n# Here we'll use an hourly dataset, however this will also work for minute/second data\n\n# Load data and format date column appropriately\nm4_hourly_df = tk.datasets.load_dataset('m4_hourly', parse_dates = ['date'])\n\ndf_filtered = tk.filter_by_time(\n data = m4_hourly_df,\n date_column = \"date\",\n start_date = '2015-07-01 12:00:00',\n end_date = '2015-07-01 20:00:00'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nH10\n2015-07-01 12:00:00+00:00\n513\n\n\n1\nH10\n2015-07-01 13:00:00+00:00\n512\n\n\n2\nH10\n2015-07-01 14:00:00+00:00\n506\n\n\n3\nH10\n2015-07-01 15:00:00+00:00\n500\n\n\n4\nH10\n2015-07-01 16:00:00+00:00\n490\n\n\n5\nH10\n2015-07-01 17:00:00+00:00\n484\n\n\n6\nH10\n2015-07-01 18:00:00+00:00\n467\n\n\n7\nH10\n2015-07-01 19:00:00+00:00\n446\n\n\n8\nH10\n2015-07-01 20:00:00+00:00\n434\n\n\n700\nH50\n2015-07-01 12:00:00+00:00\n39325\n\n\n701\nH50\n2015-07-01 13:00:00+00:00\n38153\n\n\n702\nH50\n2015-07-01 14:00:00+00:00\n36829\n\n\n703\nH50\n2015-07-01 15:00:00+00:00\n35878\n\n\n704\nH50\n2015-07-01 16:00:00+00:00\n33626\n\n\n705\nH50\n2015-07-01 17:00:00+00:00\n31014\n\n\n706\nH50\n2015-07-01 18:00:00+00:00\n28891\n\n\n707\nH50\n2015-07-01 19:00:00+00:00\n27413\n\n\n708\nH50\n2015-07-01 20:00:00+00:00\n26291\n\n\n\n\n\n\n\n\n# Example 5 - Combine year/month/day/hour/minute/second filters\ndf_filtered = tk.filter_by_time(\n data = m4_hourly_df,\n date_column = \"date\",\n start_date = '2015-07-01',\n end_date = '2015-07-29'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nH10\n2015-07-01 12:00:00+00:00\n513\n\n\n1\nH10\n2015-07-01 13:00:00+00:00\n512\n\n\n2\nH10\n2015-07-01 14:00:00+00:00\n506\n\n\n3\nH10\n2015-07-01 15:00:00+00:00\n500\n\n\n4\nH10\n2015-07-01 16:00:00+00:00\n490\n\n\n...\n...\n...\n...\n\n\n1379\nH50\n2015-07-29 19:00:00+00:00\n30167\n\n\n1380\nH50\n2015-07-29 20:00:00+00:00\n28894\n\n\n1381\nH50\n2015-07-29 21:00:00+00:00\n27949\n\n\n1382\nH50\n2015-07-29 22:00:00+00:00\n27507\n\n\n1383\nH50\n2015-07-29 23:00:00+00:00\n28020\n\n\n\n\n1368 rows × 3 columns\n\n\n\n\n# Example 6 - Filter a GroupBy object\n\ndf_filtered = (\n m4_hourly_df\n .groupby('id')\n .filter_by_time(\n date_column = \"date\",\n start_date = '2015-07-01 12:00:00',\n end_date = '2015-07-01 20:00:00'\n )\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nH10\n2015-07-01 12:00:00+00:00\n513\n\n\n1\nH10\n2015-07-01 13:00:00+00:00\n512\n\n\n2\nH10\n2015-07-01 14:00:00+00:00\n506\n\n\n3\nH10\n2015-07-01 15:00:00+00:00\n500\n\n\n4\nH10\n2015-07-01 16:00:00+00:00\n490\n\n\n5\nH10\n2015-07-01 17:00:00+00:00\n484\n\n\n6\nH10\n2015-07-01 18:00:00+00:00\n467\n\n\n7\nH10\n2015-07-01 19:00:00+00:00\n446\n\n\n8\nH10\n2015-07-01 20:00:00+00:00\n434\n\n\n700\nH50\n2015-07-01 12:00:00+00:00\n39325\n\n\n701\nH50\n2015-07-01 
13:00:00+00:00\n38153\n\n\n702\nH50\n2015-07-01 14:00:00+00:00\n36829\n\n\n703\nH50\n2015-07-01 15:00:00+00:00\n35878\n\n\n704\nH50\n2015-07-01 16:00:00+00:00\n33626\n\n\n705\nH50\n2015-07-01 17:00:00+00:00\n31014\n\n\n706\nH50\n2015-07-01 18:00:00+00:00\n28891\n\n\n707\nH50\n2015-07-01 19:00:00+00:00\n27413\n\n\n708\nH50\n2015-07-01 20:00:00+00:00\n26291" }, { - "objectID": "reference/apply_by_time.html", - "href": "reference/apply_by_time.html", - "title": "apply_by_time", + "objectID": "reference/summarize_by_time.html", + "href": "reference/summarize_by_time.html", + "title": "summarize_by_time", "section": "", - "text": "apply_by_time(data, date_column, freq='D', wide_format=False, fillna=0, reduce_memory=False, **named_funcs)\nApply for time series." + "text": "summarize_by_time(data, date_column, value_column, freq='D', agg_func='sum', wide_format=False, fillna=0, engine='pandas')\nSummarize a DataFrame or GroupBy object by time.\nThe summarize_by_time function aggregates data by a specified time period and one or more numeric columns, allowing for grouping and customization of the time-based aggregation." }, { - "objectID": "reference/apply_by_time.html#parameters", - "href": "reference/apply_by_time.html#parameters", - "title": "apply_by_time", + "objectID": "reference/summarize_by_time.html#parameters", + "href": "reference/summarize_by_time.html#parameters", + "title": "summarize_by_time", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe data parameter can be either a pandas DataFrame or a pandas DataFrameGroupBy object. It represents the data on which the apply operation will be performed.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the DataFrame that contains the dates.\nrequired\n\n\nfreq\nstr\nThe freq parameter specifies the frequency at which the data should be resampled. It accepts a string representing a time frequency, such as “D” for daily, “W” for weekly, “M” for monthly, etc. The default value is “D”, which means the data will be resampled on a daily basis. Some common frequency aliases include: - S: secondly frequency - min: minute frequency - H: hourly frequency - D: daily frequency - W: weekly frequency - M: month end frequency - MS: month start frequency - Q: quarter end frequency - QS: quarter start frequency - Y: year end frequency - YS: year start frequency\n'D'\n\n\nwide_format\nbool\nThe wide_format parameter is a boolean flag that determines whether the output should be in wide format or not. If wide_format is set to True, the output will have a multi-index column structure, where the first level represents the original columns and the second level represents the group names.\nFalse\n\n\nfillna\nint\nThe fillna parameter is used to specify the value that will be used to fill missing values in the resulting DataFrame. By default, it is set to 0.\n0\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is True.\nFalse\n\n\n**named_funcs\n\nThe **named_funcs parameter is used to specify one or more custom aggregation functions to apply to the data. 
It accepts named functions in the format: python name = lambda df: df['column1'].corr(df['column2']]) Where name is the name of the function and df is the DataFrame that will be passed to the function. The function must return a single value.\n{}" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nA pandas DataFrame or a pandas GroupBy object. This is the data that you want to summarize by time.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data frame that contains the dates or timestamps to be aggregated by. This column must be of type datetime64.\nrequired\n\n\nvalue_column\nstr or list\nThe value_column parameter is the name of one or more columns in the DataFrame that you want to aggregate by. It can be either a string representing a single column name, or a list of strings representing multiple column names.\nrequired\n\n\nfreq\nstr\nThe freq parameter specifies the frequency at which the data should be aggregated. It accepts a string representing a pandas frequency offset, such as “D” for daily or “MS” for month start. The default value is “D”, which means the data will be aggregated on a daily basis. Some common frequency aliases include: - S: secondly frequency - min: minute frequency - H: hourly frequency - D: daily frequency - W: weekly frequency - M: month end frequency - MS: month start frequency - Q: quarter end frequency - QS: quarter start frequency - Y: year end frequency - YS: year start frequency\n'D'\n\n\nagg_func\nlist\nThe agg_func parameter is used to specify one or more aggregating functions to apply to the value column(s) during the summarization process. It can be a single function or a list of functions. The default value is \"sum\", which represents the sum function. Some common aggregating functions include: - “sum”: Sum of values - “mean”: Mean of values - “median”: Median of values - “min”: Minimum of values - “max”: Maximum of values - “std”: Standard deviation of values - “var”: Variance of values - “first”: First value in group - “last”: Last value in group - “count”: Count of values - “nunique”: Number of unique values - “corr”: Correlation between values Pandas Engine Only: Custom lambda aggregating functions can be used too. Here are several common examples: - (“q25”, lambda x: x.quantile(0.25)): 25th percentile of values - (“q75”, lambda x: x.quantile(0.75)): 75th percentile of values - (“iqr”, lambda x: x.quantile(0.75) - x.quantile(0.25)): Interquartile range of values - (“range”, lambda x: x.max() - x.min()): Range of values\n'sum'\n\n\nwide_format\nbool\nA boolean parameter that determines whether the output should be in “wide” or “long” format. If set to True, the output will be in wide format, where each group is represented by a separate column. If set to False, the output will be in long format, where each group is represented by a separate row. The default value is False.\nFalse\n\n\nfillna\nint\nThe fillna parameter is used to specify the value to fill missing data with. By default, it is set to 0. If you want to keep missing values as NaN, you can use np.nan as the value for fillna.\n0\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for summarizing the data. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for summarizing the data. 
This can be faster than using “pandas” for large datasets.\n'pandas'" }, { - "objectID": "reference/apply_by_time.html#returns", - "href": "reference/apply_by_time.html#returns", - "title": "apply_by_time", + "objectID": "reference/summarize_by_time.html#returns", + "href": "reference/summarize_by_time.html#returns", + "title": "summarize_by_time", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe function apply_by_time returns a pandas DataFrame object." + "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nA Pandas DataFrame that is summarized by time." }, { - "objectID": "reference/apply_by_time.html#examples", - "href": "reference/apply_by_time.html#examples", - "title": "apply_by_time", + "objectID": "reference/summarize_by_time.html#examples", + "href": "reference/summarize_by_time.html#examples", + "title": "summarize_by_time", "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n \ndf = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])\n \ndf.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 2466 rows of 13 columns\norder_id: int64 [1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, ...\norder_line: int64 [1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, ...\norder_date: datetime64[ns] [Timestamp('2011-01-07 00:00:00'), Ti ...\nquantity: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, ...\nprice: int64 [6070, 5970, 2770, 5970, 10660, 3200, ...\ntotal_price: int64 [6070, 5970, 2770, 5970, 10660, 3200, ...\nmodel: object ['Jekyll Carbon 2', 'Trigger Carbon 2 ...\ncategory_1: object ['Mountain', 'Mountain', 'Mountain', ...\ncategory_2: object ['Over Mountain', 'Over Mountain', 'T ...\nframe_material: object ['Carbon', 'Carbon', 'Aluminum', 'Car ...\nbikeshop_name: object ['Ithaca Mountain Climbers', 'Ithaca ...\ncity: object ['Ithaca', 'Ithaca', 'Kansas City', ' ...\nstate: object ['NY', 'NY', 'KS', 'KS', 'KY', 'KY', ...\n\n\n\n# Apply by time with a DataFrame object\n# Allows access to multiple columns at once\n( \n df[['order_date', 'price', 'quantity']] \n .apply_by_time(\n \n # Named apply functions\n price_quantity_sum = lambda df: (df['price'] * df['quantity']).sum(),\n price_quantity_mean = lambda df: (df['price'] * df['quantity']).mean(),\n \n # Parameters\n date_column = 'order_date', \n freq = \"MS\",\n \n )\n)\n\n\n\n\n\n\n\n\norder_date\nprice_quantity_sum\nprice_quantity_mean\n\n\n\n\n0\n2011-01-01\n483015.0\n4600.142857\n\n\n1\n2011-02-01\n1162075.0\n4611.408730\n\n\n2\n2011-03-01\n659975.0\n5196.653543\n\n\n3\n2011-04-01\n1827140.0\n4533.846154\n\n\n4\n2011-05-01\n844170.0\n4097.912621\n\n\n5\n2011-06-01\n1413445.0\n4544.839228\n\n\n6\n2011-07-01\n1194430.0\n4976.791667\n\n\n7\n2011-08-01\n679790.0\n4961.970803\n\n\n8\n2011-09-01\n814720.0\n4682.298851\n\n\n9\n2011-10-01\n734920.0\n3930.053476\n\n\n10\n2011-11-01\n1006085.0\n4768.175355\n\n\n11\n2011-12-01\n473120.0\n4186.902655\n\n\n\n\n\n\n\n\n# Apply by time with a GroupBy object\n( \n df[['category_1', 'order_date', 'price', 'quantity']] \n .groupby('category_1')\n .apply_by_time(\n \n # Named functions\n price_quantity_sum = lambda df: (df['price'] * df['quantity']).sum(),\n price_quantity_mean = lambda df: (df['price'] * df['quantity']).mean(),\n \n # Parameters\n date_column = 'order_date', \n freq = \"MS\",\n \n 
)\n)\n\n\n\n\n\n\n\n\ncategory_1\norder_date\nprice_quantity_sum\nprice_quantity_mean\n\n\n\n\n0\nMountain\n2011-01-01\n221490.0\n4922.000000\n\n\n1\nMountain\n2011-02-01\n660555.0\n4374.536424\n\n\n2\nMountain\n2011-03-01\n358855.0\n5882.868852\n\n\n3\nMountain\n2011-04-01\n1075975.0\n4890.795455\n\n\n4\nMountain\n2011-05-01\n450440.0\n4549.898990\n\n\n5\nMountain\n2011-06-01\n723040.0\n5021.111111\n\n\n6\nMountain\n2011-07-01\n767740.0\n5444.964539\n\n\n7\nMountain\n2011-08-01\n361255.0\n5734.206349\n\n\n8\nMountain\n2011-09-01\n401125.0\n5077.531646\n\n\n9\nMountain\n2011-10-01\n377335.0\n4439.235294\n\n\n10\nMountain\n2011-11-01\n549345.0\n5282.163462\n\n\n11\nMountain\n2011-12-01\n276055.0\n5208.584906\n\n\n12\nRoad\n2011-01-01\n261525.0\n4358.750000\n\n\n13\nRoad\n2011-02-01\n501520.0\n4965.544554\n\n\n14\nRoad\n2011-03-01\n301120.0\n4562.424242\n\n\n15\nRoad\n2011-04-01\n751165.0\n4104.726776\n\n\n16\nRoad\n2011-05-01\n393730.0\n3679.719626\n\n\n17\nRoad\n2011-06-01\n690405.0\n4134.161677\n\n\n18\nRoad\n2011-07-01\n426690.0\n4310.000000\n\n\n19\nRoad\n2011-08-01\n318535.0\n4304.527027\n\n\n20\nRoad\n2011-09-01\n413595.0\n4353.631579\n\n\n21\nRoad\n2011-10-01\n357585.0\n3505.735294\n\n\n22\nRoad\n2011-11-01\n456740.0\n4268.598131\n\n\n23\nRoad\n2011-12-01\n197065.0\n3284.416667\n\n\n\n\n\n\n\n\n# Return complex objects\n( \n df[['order_date', 'price', 'quantity']] \n .apply_by_time(\n \n # Named apply functions\n complex_object = lambda df: [df],\n \n # Parameters\n date_column = 'order_date', \n freq = \"MS\",\n \n )\n)\n\n\n\n\n\n\n\n\norder_date\nprice\nquantity\n\n\n\n\n0\n2011-01-01\n[[6070, 5970, 2770, 5970, 10660, 3200, 12790, ...\n[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,...\n\n\n1\n2011-02-01\n[[8200, 7990, 3200, 4800, 3200, 2130, 1030, 37...\n[[1, 4, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 2,...\n\n\n2\n2011-03-01\n[[2660, 3200, 3200, 815, 8200, 9060, 815, 2130...\n[[1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1,...\n\n\n3\n2011-04-01\n[[5330, 4500, 585, 2660, 3200, 2770, 1030, 234...\n[[1, 1, 1, 3, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 7,...\n\n\n4\n2011-05-01\n[[1840, 3200, 7000, 5860, 1030, 3200, 3500, 15...\n[[1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...\n\n\n5\n2011-06-01\n[[7990, 4500, 1250, 3730, 1950, 2660, 2340, 19...\n[[1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 9,...\n\n\n6\n2011-07-01\n[[3200, 2880, 5330, 3200, 585, 5330, 4800, 111...\n[[2, 3, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1,...\n\n\n7\n2011-08-01\n[[12250, 2130, 7000, 2660, 5860, 3500, 1950, 1...\n[[2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1,...\n\n\n8\n2011-09-01\n[[4800, 480, 12790, 6390, 7990, 3500, 3730, 63...\n[[1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1,...\n\n\n9\n2011-10-01\n[[9060, 12250, 2880, 9060, 4480, 3200, 2340, 2...\n[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...\n\n\n10\n2011-11-01\n[[2240, 2660, 3200, 980, 2880, 1750, 2130, 224...\n[[1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 2, 1, 6,...\n\n\n11\n2011-12-01\n[[1030, 3200, 870, 1350, 4260, 7460, 2880, 270...\n[[1, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,..." 
+ "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])\n\ndf\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows × 13 columns\n\n\n\n\n# Example 1 - Summarize by time with a DataFrame object, pandas engine\n( \n df \n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = ['mean', 'sum'],\n engine = 'pandas'\n )\n)\n\n\n\n\n\n\n\n\norder_date\ntotal_price_mean\ntotal_price_sum\n\n\n\n\n0\n2011-01-01\n4600.142857\n483015\n\n\n1\n2011-02-01\n4611.408730\n1162075\n\n\n2\n2011-03-01\n5196.653543\n659975\n\n\n3\n2011-04-01\n4533.846154\n1827140\n\n\n4\n2011-05-01\n4097.912621\n844170\n\n\n5\n2011-06-01\n4544.839228\n1413445\n\n\n6\n2011-07-01\n4976.791667\n1194430\n\n\n7\n2011-08-01\n4961.970803\n679790\n\n\n8\n2011-09-01\n4682.298851\n814720\n\n\n9\n2011-10-01\n3930.053476\n734920\n\n\n10\n2011-11-01\n4768.175355\n1006085\n\n\n11\n2011-12-01\n4186.902655\n473120\n\n\n\n\n\n\n\n\n# Example 2 - Summarize by time with a GroupBy object (Wide Format), polars engine\n(\n df \n .groupby(['category_1', 'frame_material']) \n .summarize_by_time(\n date_column = 'order_date', \n value_column = ['total_price', 'quantity'], \n freq = 'MS',\n agg_func = 'sum',\n wide_format = True, \n engine = 'polars'\n 
)\n)\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum_Mountain_Aluminum\ntotal_price_sum_Mountain_Carbon\ntotal_price_sum_Road_Aluminum\ntotal_price_sum_Road_Carbon\nquantity_sum_Mountain_Aluminum\nquantity_sum_Mountain_Carbon\nquantity_sum_Road_Aluminum\nquantity_sum_Road_Carbon\n\n\n\n\n0\n2011-01-01\n66290\n155200\n61005\n200520\n34\n23\n30\n41\n\n\n1\n2011-02-01\n245115\n415440\n100480\n401040\n118\n68\n52\n93\n\n\n2\n2011-03-01\n82025\n276830\n63390\n237730\n41\n46\n33\n54\n\n\n3\n2011-04-01\n340725\n735250\n197705\n553460\n164\n130\n104\n144\n\n\n4\n2011-05-01\n160130\n290310\n127600\n266130\n93\n53\n75\n81\n\n\n5\n2011-06-01\n183680\n539360\n174655\n515750\n96\n91\n82\n142\n\n\n6\n2011-07-01\n186030\n581710\n98090\n328600\n94\n91\n53\n82\n\n\n7\n2011-08-01\n119785\n241470\n65855\n252680\n53\n34\n36\n69\n\n\n8\n2011-09-01\n100455\n300670\n78485\n335110\n59\n47\n36\n77\n\n\n9\n2011-10-01\n105035\n272300\n83105\n274480\n61\n43\n42\n71\n\n\n10\n2011-11-01\n102045\n447300\n90050\n366690\n55\n79\n51\n95\n\n\n11\n2011-12-01\n111125\n164930\n45555\n151510\n55\n27\n27\n43\n\n\n\n\n\n\n\n\n# Example 3 - Summarize by time with a GroupBy object (Wide Format)\n(\n df \n .groupby('category_1') \n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price', \n freq = 'MS',\n agg_func = 'sum',\n wide_format = True,\n engine = 'pandas' \n )\n)\n\n\n\n\n\n\n\n\norder_date\ntotal_price_Mountain\ntotal_price_Road\n\n\n\n\n0\n2011-01-01\n221490\n261525\n\n\n1\n2011-02-01\n660555\n501520\n\n\n2\n2011-03-01\n358855\n301120\n\n\n3\n2011-04-01\n1075975\n751165\n\n\n4\n2011-05-01\n450440\n393730\n\n\n5\n2011-06-01\n723040\n690405\n\n\n6\n2011-07-01\n767740\n426690\n\n\n7\n2011-08-01\n361255\n318535\n\n\n8\n2011-09-01\n401125\n413595\n\n\n9\n2011-10-01\n377335\n357585\n\n\n10\n2011-11-01\n549345\n456740\n\n\n11\n2011-12-01\n276055\n197065\n\n\n\n\n\n\n\n\n# Example 4 - Summarize by time with a GroupBy object and multiple value columns and summaries (Wide Format)\n# Note - This example only works with the pandas engine\n(\n df \n .groupby('category_1') \n .summarize_by_time(\n date_column = 'order_date', \n value_column = ['total_price', 'quantity'], \n freq = 'MS',\n agg_func = [\n 'sum', \n 'mean', \n ('q25', lambda x: x.quantile(0.25)), \n ('q75', lambda x: x.quantile(0.75))\n ],\n wide_format = False,\n engine = 'pandas' \n 
)\n)\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price_sum\ntotal_price_mean\ntotal_price_q25\ntotal_price_q75\nquantity_sum\nquantity_mean\nquantity_q25\nquantity_q75\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n4922.000000\n2060.0\n6070.0\n57\n1.266667\n1.0\n1.0\n\n\n1\nMountain\n2011-02-01\n660555\n4374.536424\n2060.0\n5330.0\n186\n1.231788\n1.0\n1.0\n\n\n2\nMountain\n2011-03-01\n358855\n5882.868852\n2130.0\n6390.0\n87\n1.426230\n1.0\n1.0\n\n\n3\nMountain\n2011-04-01\n1075975\n4890.795455\n2060.0\n5970.0\n294\n1.336364\n1.0\n1.0\n\n\n4\nMountain\n2011-05-01\n450440\n4549.898990\n2010.0\n6020.0\n146\n1.474747\n1.0\n1.0\n\n\n5\nMountain\n2011-06-01\n723040\n5021.111111\n1950.0\n5647.5\n187\n1.298611\n1.0\n1.0\n\n\n6\nMountain\n2011-07-01\n767740\n5444.964539\n2130.0\n6400.0\n185\n1.312057\n1.0\n1.0\n\n\n7\nMountain\n2011-08-01\n361255\n5734.206349\n2235.0\n6400.0\n87\n1.380952\n1.0\n2.0\n\n\n8\nMountain\n2011-09-01\n401125\n5077.531646\n1620.0\n6390.0\n106\n1.341772\n1.0\n1.0\n\n\n9\nMountain\n2011-10-01\n377335\n4439.235294\n2160.0\n6070.0\n104\n1.223529\n1.0\n1.0\n\n\n10\nMountain\n2011-11-01\n549345\n5282.163462\n2340.0\n7460.0\n134\n1.288462\n1.0\n1.0\n\n\n11\nMountain\n2011-12-01\n276055\n5208.584906\n2060.0\n6400.0\n82\n1.547170\n1.0\n1.0\n\n\n12\nRoad\n2011-01-01\n261525\n4358.750000\n1950.0\n5605.0\n71\n1.183333\n1.0\n1.0\n\n\n13\nRoad\n2011-02-01\n501520\n4965.544554\n1950.0\n5860.0\n145\n1.435644\n1.0\n1.0\n\n\n14\nRoad\n2011-03-01\n301120\n4562.424242\n2240.0\n5875.0\n87\n1.318182\n1.0\n1.0\n\n\n15\nRoad\n2011-04-01\n751165\n4104.726776\n1950.0\n4800.0\n248\n1.355191\n1.0\n1.0\n\n\n16\nRoad\n2011-05-01\n393730\n3679.719626\n1570.0\n3500.0\n156\n1.457944\n1.0\n1.0\n\n\n17\nRoad\n2011-06-01\n690405\n4134.161677\n1840.0\n4500.0\n224\n1.341317\n1.0\n1.0\n\n\n18\nRoad\n2011-07-01\n426690\n4310.000000\n1895.0\n5330.0\n135\n1.363636\n1.0\n1.0\n\n\n19\nRoad\n2011-08-01\n318535\n4304.527027\n1950.0\n4987.5\n105\n1.418919\n1.0\n1.0\n\n\n20\nRoad\n2011-09-01\n413595\n4353.631579\n1950.0\n5330.0\n113\n1.189474\n1.0\n1.0\n\n\n21\nRoad\n2011-10-01\n357585\n3505.735294\n1750.0\n4260.0\n113\n1.107843\n1.0\n1.0\n\n\n22\nRoad\n2011-11-01\n456740\n4268.598131\n1950.0\n4370.0\n146\n1.364486\n1.0\n1.0\n\n\n23\nRoad\n2011-12-01\n197065\n3284.416667\n1652.5\n3200.0\n70\n1.166667\n1.0\n1.0" }, { - "objectID": "reference/augment_bbands.html", - "href": "reference/augment_bbands.html", - "title": "augment_bbands", + "objectID": "reference/drop_zero_variance.html", + "href": "reference/drop_zero_variance.html", + "title": "drop_zero_variance", "section": "", - "text": "augment_bbands(data, date_column, close_column, periods=20, std_dev=2, reduce_memory=False, engine='pandas')\nThe augment_bbands function is used to calculate Bollinger Bands for a given dataset and return the augmented dataset." + "text": "drop_zero_variance(data)\nThe function drop_zero_variance takes a pandas DataFrame as input and returns a new DataFrame with columns that have zero variance removed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame\nThe data parameter is a pandas DataFrame or a pandas DataFrameGroupBy object. It represents the\nrequired\n\n\ndata\npd.DataFrame\n\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\n\na filtered DataFrame with columns that have non-zero variance." 
}, { - "objectID": "reference/augment_bbands.html#parameters", - "href": "reference/augment_bbands.html#parameters", - "title": "augment_bbands", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe data parameter is the input data that can be either a pandas DataFrame or a pandas DataFrameGroupBy object. It contains the data on which the Bollinger Bands will be calculated.\nrequired\n\n\ndate_column\nstr\nThe date_column parameter is a string that specifies the name of the column in the data DataFrame that contains the dates.\nrequired\n\n\nclose_column\nstr\nThe close_column parameter is a string that specifies the name of the column in the data DataFrame that contains the closing prices of the asset.\nrequired\n\n\nperiods\nUnion[int, Tuple[int, int], List[int]]\nThe periods parameter in the augment_bbands function can be specified as an integer, a tuple, or a list. This parameter specifies the number of rolling periods to use when calculating the Bollinger Bands.\n20\n\n\nstd_dev\nfloat\nThe std_dev parameter is a float that represents the number of standard deviations to use when calculating the Bollinger Bands. Bollinger Bands are a technical analysis tool that consists of a middle band (usually a simple moving average) and an upper and lower band that are typically two standard deviations away from the middle band. The std_dev parameter specifies the number of standard deviations. std_dev can be a list of floats as well.\n2\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is a boolean flag that indicates whether or not to reduce the memory usage of the input data before performing the calculation. If set to True, the function will attempt to reduce the memory usage of the input data using techniques such as downcasting numeric columns and converting object columns\nFalse\n\n\nengine\nstr\nThe engine parameter specifies the computation engine to use for calculating the Bollinger Bands. It can take two values: ‘pandas’ or ‘polars’. If ‘pandas’ is selected, the function will use the pandas library for computation. If ‘polars’ is selected,\n'pandas'" + "objectID": "reference/drop_zero_variance.html#parameters", + "href": "reference/drop_zero_variance.html#parameters", + "title": "drop_zero_variance", + "section": "", + "text": "Name\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame\nThe data parameter is a pandas DataFrame or a pandas DataFrameGroupBy object. It represents the\nrequired\n\n\ndata\npd.DataFrame\n\nrequired" }, { - "objectID": "reference/augment_bbands.html#returns", - "href": "reference/augment_bbands.html#returns", - "title": "augment_bbands", - "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe function augment_bbands returns a pandas DataFrame." + "objectID": "reference/drop_zero_variance.html#returns", + "href": "reference/drop_zero_variance.html#returns", + "title": "drop_zero_variance", + "section": "", + "text": "Type\nDescription\n\n\n\n\n\na filtered DataFrame with columns that have non-zero variance." }, { - "objectID": "reference/augment_bbands.html#notes", - "href": "reference/augment_bbands.html#notes", - "title": "augment_bbands", - "section": "Notes", - "text": "Notes\nBollinger Bands are a technical analysis tool developed by John Bollinger in the 1980s. They are used to measure the ‘volatility’ of a stock price or other financial instrument. 
This indicator consists of three lines which are plotted in relation to an asset’s price:\n\nThe Middle Band: This is typically a simple moving average (SMA) of the closing prices over a certain number of days (commonly 20 days).\nThe Upper Band: This is set a specified number of standard deviations (usually two) above the middle band.\nThe Lower Band: This is set the same number of standard deviations (again, usually two) below the middle band.\n\nVolatility Indicator: The width of the bands is a measure of volatility. When the bands widen, it indicates increased volatility, and when they contract, it suggests decreased volatility.\nOverbought and Oversold Conditions: Prices are considered overbought near the upper band and oversold near the lower band. However, these conditions do not necessarily signal a reversal; prices can remain overbought or oversold for extended periods during strong trends." + "objectID": "reference/index.html", + "href": "reference/index.html", + "title": "Function reference", + "section": "", + "text": "Visualize time series data with one line of code.\n\n\n\nplot_timeseries\nCreates time series plots using different plotting engines such as Plotnine,\n\n\n\n\n\n\nBend time series data to your will.\n\n\n\nsummarize_by_time\nSummarize a DataFrame or GroupBy object by time.\n\n\napply_by_time\nApply for time series.\n\n\npad_by_time\nMake irregular time series regular by padding with missing dates.\n\n\nfilter_by_time\nFilters a DataFrame or GroupBy object based on a specified date range.\n\n\nfuture_frame\nExtend a DataFrame or GroupBy object with future dates.\n\n\n\n\n\n\nDetect anomalies in time series data.\n\n\n\nanomalize\nDetects anomalies in time series data, either for a single time\n\n\nplot_anomalies\nCreates plot of anomalies in time series data using Plotly, Matplotlib,\n\n\nplot_anomalies_decomp\nThe plot_anomalies_decomp function takes in data from the anomalize()\n\n\nplot_anomalies_cleaned\nThe plot_anomalies_cleaned function takes in data from the anomalize()\n\n\n\n\n\n\nVisualize correlation on any tabular dataset (not just for Time Series).\n\n\n\nbinarize\nThe binarize function prepares data for correlate, which is used for analyzing correlationfunnel plots.\n\n\ncorrelate\nThe correlate function calculates the correlation between a target variable and all other\n\n\nplot_correlation_funnel\nThe plot_correlation_funnel function generates a correlation funnel plot using either Plotly or\n\n\n\n\n\n\nAdding Features to Time Series DataFrames (Augmenting)\n\n\n\naugment_timeseries_signature\nThe function augment_timeseries_signature takes a DataFrame and a date\n\n\naugment_holiday_signature\nEngineers 4 different holiday features from a single datetime for 137 countries\n\n\naugment_lags\nAdds lags to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_leads\nAdds leads to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_diffs\nAdds differences and percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_pct_change\nAdds percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_rolling\nApply one or more Series-based rolling functions and window sizes to one or more columns of a DataFrame.\n\n\naugment_rolling_apply\nApply one or more DataFrame-based rolling functions and window sizes to one\n\n\naugment_expanding\nApply one or more Series-based expanding functions to one or more columns of a 
DataFrame.\n\n\naugment_expanding_apply\nApply one or more DataFrame-based expanding functions to one or more columns of a DataFrame.\n\n\naugment_ewm\nAdd Exponential Weighted Moving (EWM) window functions to a DataFrame or\n\n\naugment_fourier\nAdds Fourier transforms to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_hilbert\nApply the Hilbert transform to specified columns of a DataFrame or\n\n\naugment_wavelet\nApply the Wavely transform to specified columns of a DataFrame or\n\n\n\n\n\n\nPython implementation of the R package tsfeatures.\n\n\n\nts_features\nExtracts aggregated time series features from a DataFrame or DataFrameGroupBy object using the tsfeatures package.\n\n\nts_summary\nComputes summary statistics for a time series data, either for the entire\n\n\n\n\n\n\nTime series cross validation.\n\n\n\nTimeSeriesCV\nTimeSeriesCV is a subclass of TimeBasedSplit with default mode set to ‘backward’\n\n\nTimeSeriesCVSplitter\nThe TimeSeriesCVSplitter is a scikit-learn compatible cross-validator using TimeSeriesCV.\n\n\n\n\n\n\nMomentum indicators for financial time series data.\n\n\n\naugment_macd\nCalculate MACD for a given financial instrument using either pandas or polars engine.\n\n\naugment_ppo\nCalculate PPO for a given financial instrument using either pandas or polars engine.\n\n\naugment_rsi\nThe augment_rsi function calculates the Relative Strength Index (RSI) for a given financial\n\n\naugment_cmo\nThe augment_cmo function calculates the Chande Momentum Oscillator (CMO) for a given financial\n\n\naugment_roc\nAdds rate of change (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_qsmomentum\nThe function augment_qsmomentum calculates Quant Science Momentum for financial data.\n\n\n\n\n\n\nVolatility indicators for financial time series data.\n\n\n\naugment_bbands\nThe augment_bbands function is used to calculate Bollinger Bands for a given dataset and return\n\n\naugment_atr\nThe augment_atr function is used to calculate Average True Range (ATR) and\n\n\n\n\n\n\nTime series functions that generate / manipulate Pandas Series.\n\n\n\nmake_future_timeseries\nMake future dates for a time series.\n\n\nmake_weekday_sequence\nGenerate a sequence of weekday dates within a specified date range,\n\n\nmake_weekend_sequence\nGenerate a sequence of weekend dates within a specified date range,\n\n\nget_date_summary\nReturns a summary of the date-related information, including the number of\n\n\nget_frequency_summary\nMore robust version of pandas inferred frequency.\n\n\nget_diff_summary\nCalculates summary statistics of the time differences between consecutive values in a datetime index.\n\n\nget_frequency\nGet the frequency of a pandas Series or DatetimeIndex.\n\n\nget_seasonal_frequency\nThe get_seasonal_frequency function returns the seasonal period of a given\n\n\nget_trend_frequency\nThe get_trend_frequency function returns the trend period of a given time\n\n\nget_timeseries_signature\nConvert a timestamp to a set of 29 time series features.\n\n\nget_holiday_signature\nEngineers 4 different holiday features from a single datetime for 137 countries\n\n\n\n\n\n\nHelper functions to make your life easier.\n\n\n\nfloor_date\nRobust date flooring.\n\n\nceil_date\nRobust date ceiling.\n\n\nis_holiday\nCheck if a given list of dates are holidays for a specified country.\n\n\nweek_of_month\nThe “week_of_month” function calculates the week number of a given date\n\n\ntimeseries_unit_frequency_table\nThe function timeseries_unit_frequency_table 
returns a pandas DataFrame\n\n\ntime_scale_template\nThe function time_scale_template returns a table with time scale\n\n\n\n\n\n\nHelper functions to make your life easier.\n\n\n\ntheme_timetk\nReturns a plotnine theme with timetk styles applied, allowing for\n\n\npalette_timetk\nThe function palette_timetk returns a dictionary of color codes for\n\n\n\n\n\n\n\n\n\nglimpse\nTakes a pandas DataFrame and prints a summary of its dimensions, column\n\n\nparallel_apply\nThe parallel_apply function parallelizes the application of a function on\n\n\nprogress_apply\nAdds a progress bar to pandas apply().\n\n\ndrop_zero_variance\nThe function drop_zero_variance takes a pandas DataFrame as input and returns a new DataFrame with\n\n\ntransform_columns\nThe function transform_columns applies a user-provided function to specified columns in a pandas DataFrame.\n\n\nflatten_multiindex_column_names\nTakes a DataFrame as input and flattens the column\n\n\n\n\n\n\nPractice pytimetk with 13 complementary time series datasets.\n\n\n\nget_available_datasets\nGet a list of 12 datasets that can be loaded with pytimetk.load_dataset.\n\n\nload_dataset\nLoad one of 12 Time Series Datasets." }, { - "objectID": "reference/augment_bbands.html#examples", - "href": "reference/augment_bbands.html#examples", - "title": "augment_bbands", - "section": "Examples", - "text": "Examples\n\nimport pandas as pd\nimport pytimetk as tk\n\ndf = tk.load_dataset(\"stocks_daily\", parse_dates = ['date'])\n\ndf\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nMETA\n2013-01-02\n27.440001\n28.180000\n27.420000\n28.000000\n69846400\n28.000000\n\n\n1\nMETA\n2013-01-03\n27.879999\n28.469999\n27.590000\n27.770000\n63140600\n27.770000\n\n\n2\nMETA\n2013-01-04\n28.010000\n28.930000\n27.830000\n28.760000\n72715400\n28.760000\n\n\n3\nMETA\n2013-01-07\n28.690001\n29.790001\n28.650000\n29.420000\n83781800\n29.420000\n\n\n4\nMETA\n2013-01-08\n29.510000\n29.600000\n28.860001\n29.059999\n45871300\n29.059999\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16189\nGOOG\n2023-09-15\n138.800003\n139.360001\n137.179993\n138.300003\n48947600\n138.300003\n\n\n16190\nGOOG\n2023-09-18\n137.630005\n139.929993\n137.630005\n138.960007\n16233600\n138.960007\n\n\n16191\nGOOG\n2023-09-19\n138.250000\n139.175003\n137.500000\n138.830002\n15479100\n138.830002\n\n\n16192\nGOOG\n2023-09-20\n138.830002\n138.839996\n134.520004\n134.589996\n21473500\n134.589996\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n\n\n\n\n16194 rows × 8 columns\n\n\n\n\n# BBANDS pandas engine\ndf_bbands = (\n df\n .groupby('symbol')\n .augment_bbands(\n date_column = 'date', \n close_column='close', \n periods = [20, 40],\n std_dev = 2, \n engine = \"pandas\"\n )\n)\n\ndf_bbands.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 16194 rows of 14 columns\nsymbol: object ['META', 'META', 'META', ' ...\ndate: datetime64[ns] [Timestamp('2013-01-02 00: ...\nopen: float64 [27.440000534057617, 27.87 ...\nhigh: float64 [28.18000030517578, 28.469 ...\nlow: float64 [27.420000076293945, 27.59 ...\nclose: float64 [28.0, 27.770000457763672, ...\nvolume: int64 [69846400, 63140600, 72715 ...\nadjusted: float64 [28.0, 27.770000457763672, ...\nclose_bband_middle_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_upper_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_lower_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_middle_40_2.0: float64 [nan, nan, nan, nan, nan, 
...\nclose_bband_upper_40_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_lower_40_2.0: float64 [nan, nan, nan, nan, nan, ...\n\n\n\n# BBANDS polars engine\ndf_bbands = (\n df\n .groupby('symbol')\n .augment_bbands(\n date_column = 'date', \n close_column='close', \n periods = [20, 40],\n std_dev = 2, \n engine = \"polars\"\n )\n)\n\ndf_bbands.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 16194 rows of 14 columns\nsymbol: object ['META', 'META', 'META', ' ...\ndate: datetime64[ns] [Timestamp('2013-01-02 00: ...\nopen: float64 [27.440000534057617, 27.87 ...\nhigh: float64 [28.18000030517578, 28.469 ...\nlow: float64 [27.420000076293945, 27.59 ...\nclose: float64 [28.0, 27.770000457763672, ...\nvolume: int64 [69846400, 63140600, 72715 ...\nadjusted: float64 [28.0, 27.770000457763672, ...\nclose_bband_middle_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_upper_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_lower_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_middle_40_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_upper_40_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_lower_40_2.0: float64 [nan, nan, nan, nan, nan, ..." + "objectID": "reference/index.html#data-visualization", + "href": "reference/index.html#data-visualization", + "title": "Function reference", + "section": "", + "text": "Visualize time series data with one line of code.\n\n\n\nplot_timeseries\nCreates time series plots using different plotting engines such as Plotnine," }, { - "objectID": "reference/augment_ppo.html", - "href": "reference/augment_ppo.html", - "title": "augment_ppo", + "objectID": "reference/index.html#wrangling-pandas-time-series-dataframes", + "href": "reference/index.html#wrangling-pandas-time-series-dataframes", + "title": "Function reference", "section": "", - "text": "augment_ppo(data, date_column, close_column, fast_period=12, slow_period=26, reduce_memory=False, engine='pandas')\nCalculate PPO for a given financial instrument using either pandas or polars engine." + "text": "Bend time series data to your will.\n\n\n\nsummarize_by_time\nSummarize a DataFrame or GroupBy object by time.\n\n\napply_by_time\nApply for time series.\n\n\npad_by_time\nMake irregular time series regular by padding with missing dates.\n\n\nfilter_by_time\nFilters a DataFrame or GroupBy object based on a specified date range.\n\n\nfuture_frame\nExtend a DataFrame or GroupBy object with future dates." 
}, { - "objectID": "reference/augment_ppo.html#parameters", - "href": "reference/augment_ppo.html#parameters", - "title": "augment_ppo", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nPandas DataFrame or GroupBy object containing financial data.\nrequired\n\n\ndate_column\nstr\nName of the column containing date information.\nrequired\n\n\nclose_column\nstr\nName of the column containing closing price data.\nrequired\n\n\nfast_period\nint\nNumber of periods for the fast EMA in PPO calculation.\n12\n\n\nslow_period\nint\nNumber of periods for the slow EMA in PPO calculation.\n26\n\n\nreduce_memory\nbool\nWhether to reduce memory usage of the data before performing the calculation.\nFalse\n\n\nengine\nstr\nComputation engine to use (‘pandas’ or ‘polars’).\n'pandas'" + "objectID": "reference/index.html#anomaly-detection", + "href": "reference/index.html#anomaly-detection", + "title": "Function reference", + "section": "", + "text": "Detect anomalies in time series data.\n\n\n\nanomalize\nDetects anomalies in time series data, either for a single time\n\n\nplot_anomalies\nCreates plot of anomalies in time series data using Plotly, Matplotlib,\n\n\nplot_anomalies_decomp\nThe plot_anomalies_decomp function takes in data from the anomalize()\n\n\nplot_anomalies_cleaned\nThe plot_anomalies_cleaned function takes in data from the anomalize()" }, { - "objectID": "reference/augment_ppo.html#returns", - "href": "reference/augment_ppo.html#returns", - "title": "augment_ppo", - "section": "Returns", - "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nDataFrame with PPO values added." + "objectID": "reference/index.html#correlation-funnel", + "href": "reference/index.html#correlation-funnel", + "title": "Function reference", + "section": "", + "text": "Visualize correlation on any tabular dataset (not just for Time Series).\n\n\n\nbinarize\nThe binarize function prepares data for correlate, which is used for analyzing correlationfunnel plots.\n\n\ncorrelate\nThe correlate function calculates the correlation between a target variable and all other\n\n\nplot_correlation_funnel\nThe plot_correlation_funnel function generates a correlation funnel plot using either Plotly or" }, { - "objectID": "reference/augment_ppo.html#notes", - "href": "reference/augment_ppo.html#notes", - "title": "augment_ppo", - "section": "Notes", - "text": "Notes\nThe Percentage Price Oscillator (PPO) is a momentum oscillator that measures the difference between two moving averages as a percentage of the larger moving average. The PPO is best used to confirm the direction of the price trend and gauge its momentum.\nThe PPO is calculated by subtracting a long-term EMA from a short-term EMA, then dividing the result by the long-term EMA, and finally multiplying by 100.\nAdvantages Over MACD: The PPO’s percentage-based calculation allows for easier comparisons between different securities, regardless of their price levels. This is a distinct advantage over the MACD, which provides absolute values and can be less meaningful when comparing stocks with significantly different prices." 
+ "objectID": "reference/index.html#feature-engineereing", + "href": "reference/index.html#feature-engineereing", + "title": "Function reference", + "section": "", + "text": "Adding Features to Time Series DataFrames (Augmenting)\n\n\n\naugment_timeseries_signature\nThe function augment_timeseries_signature takes a DataFrame and a date\n\n\naugment_holiday_signature\nEngineers 4 different holiday features from a single datetime for 137 countries\n\n\naugment_lags\nAdds lags to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_leads\nAdds leads to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_diffs\nAdds differences and percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_pct_change\nAdds percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_rolling\nApply one or more Series-based rolling functions and window sizes to one or more columns of a DataFrame.\n\n\naugment_rolling_apply\nApply one or more DataFrame-based rolling functions and window sizes to one\n\n\naugment_expanding\nApply one or more Series-based expanding functions to one or more columns of a DataFrame.\n\n\naugment_expanding_apply\nApply one or more DataFrame-based expanding functions to one or more columns of a DataFrame.\n\n\naugment_ewm\nAdd Exponential Weighted Moving (EWM) window functions to a DataFrame or\n\n\naugment_fourier\nAdds Fourier transforms to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_hilbert\nApply the Hilbert transform to specified columns of a DataFrame or\n\n\naugment_wavelet\nApply the Wavely transform to specified columns of a DataFrame or" }, { - "objectID": "reference/augment_ppo.html#examples", - "href": "reference/augment_ppo.html#examples", - "title": "augment_ppo", - "section": "Examples", - "text": "Examples\n\nimport pandas as pd\nimport pytimetk as tk\n\ndf = tk.load_dataset(\"stocks_daily\", parse_dates = ['date'])\n\ndf\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nMETA\n2013-01-02\n27.440001\n28.180000\n27.420000\n28.000000\n69846400\n28.000000\n\n\n1\nMETA\n2013-01-03\n27.879999\n28.469999\n27.590000\n27.770000\n63140600\n27.770000\n\n\n2\nMETA\n2013-01-04\n28.010000\n28.930000\n27.830000\n28.760000\n72715400\n28.760000\n\n\n3\nMETA\n2013-01-07\n28.690001\n29.790001\n28.650000\n29.420000\n83781800\n29.420000\n\n\n4\nMETA\n2013-01-08\n29.510000\n29.600000\n28.860001\n29.059999\n45871300\n29.059999\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16189\nGOOG\n2023-09-15\n138.800003\n139.360001\n137.179993\n138.300003\n48947600\n138.300003\n\n\n16190\nGOOG\n2023-09-18\n137.630005\n139.929993\n137.630005\n138.960007\n16233600\n138.960007\n\n\n16191\nGOOG\n2023-09-19\n138.250000\n139.175003\n137.500000\n138.830002\n15479100\n138.830002\n\n\n16192\nGOOG\n2023-09-20\n138.830002\n138.839996\n134.520004\n134.589996\n21473500\n134.589996\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n\n\n\n\n16194 rows × 8 columns\n\n\n\n\n# PPO pandas engine\ndf_ppo = (\n df\n .groupby('symbol')\n .augment_ppo(\n date_column = 'date', \n close_column = 'close', \n fast_period = 12, \n slow_period = 26, \n engine = \"pandas\"\n )\n)\n\ndf_ppo.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 16194 rows of 9 columns\nsymbol: object ['AAPL', 'AAPL', 'AAPL', 'AAPL' ...\ndate: datetime64[ns] [Timestamp('2013-01-02 00:00:00 ...\nopen: float64 [19.779285430908203, 19.5671424 
...\nhigh: float64 [19.821428298950195, 19.6310710 ...\nlow: float64 [19.343929290771484, 19.3214282 ...\nclose: float64 [19.608213424682617, 19.3607139 ...\nvolume: int64 [560518000, 352965200, 59433360 ...\nadjusted: float64 [16.791179656982422, 16.5792407 ...\nclose_ppo_line_12_26: float64 [0.0, -0.10078442036791524, -0. ...\n\n\n\n# PPO polars engine\ndf_ppo = (\n df\n .groupby('symbol')\n .augment_ppo(\n date_column = 'date', \n close_column = 'close', \n fast_period = 12, \n slow_period = 26, \n engine = \"polars\"\n )\n)\n\ndf_ppo.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 16194 rows of 9 columns\nsymbol: object ['META', 'META', 'META', 'META' ...\ndate: datetime64[ns] [Timestamp('2013-01-02 00:00:00 ...\nopen: float64 [27.440000534057617, 27.8799991 ...\nhigh: float64 [28.18000030517578, 28.46999931 ...\nlow: float64 [27.420000076293945, 27.5900001 ...\nclose: float64 [28.0, 27.770000457763672, 28.7 ...\nvolume: int64 [69846400, 63140600, 72715400, ...\nadjusted: float64 [28.0, 27.770000457763672, 28.7 ...\nclose_ppo_line_12_26: float64 [0.0, -0.06556683019189882, 0.1 ..." + "objectID": "reference/index.html#ts-features", + "href": "reference/index.html#ts-features", + "title": "Function reference", + "section": "", + "text": "Python implementation of the R package tsfeatures.\n\n\n\nts_features\nExtracts aggregated time series features from a DataFrame or DataFrameGroupBy object using the tsfeatures package.\n\n\nts_summary\nComputes summary statistics for a time series data, either for the entire" }, { - "objectID": "reference/glimpse.html", - "href": "reference/glimpse.html", - "title": "glimpse", + "objectID": "reference/index.html#time-series-cross-validation-tscv", + "href": "reference/index.html#time-series-cross-validation-tscv", + "title": "Function reference", "section": "", - "text": "glimpse(data, max_width=76, engine='pandas')\nTakes a pandas DataFrame and prints a summary of its dimensions, column names, data types, and the first few values of each column." + "text": "Time series cross validation.\n\n\n\nTimeSeriesCV\nTimeSeriesCV is a subclass of TimeBasedSplit with default mode set to ‘backward’\n\n\nTimeSeriesCVSplitter\nThe TimeSeriesCVSplitter is a scikit-learn compatible cross-validator using TimeSeriesCV." }, { - "objectID": "reference/glimpse.html#parameters", - "href": "reference/glimpse.html#parameters", - "title": "glimpse", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame\nThe data parameter is a pandas DataFrame that contains the data you want to glimpse at. It is the main input to the glimpse function.\nrequired\n\n\nmax_width\nint\nThe max_width parameter is an optional parameter that specifies the maximum width of each line when printing the glimpse of the DataFrame. If not provided, the default value is set to 76.\n76\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for generating a glimpse. It can be either “pandas” or “polars”. - The default value is “pandas”. 
- When “polars”, the function will internally use the polars library for generating the glimpse.\n'pandas'" + "objectID": "reference/index.html#finance-module-momentum-indicators", + "href": "reference/index.html#finance-module-momentum-indicators", + "title": "Function reference", + "section": "", + "text": "Momentum indicators for financial time series data.\n\n\n\naugment_macd\nCalculate MACD for a given financial instrument using either pandas or polars engine.\n\n\naugment_ppo\nCalculate PPO for a given financial instrument using either pandas or polars engine.\n\n\naugment_rsi\nThe augment_rsi function calculates the Relative Strength Index (RSI) for a given financial\n\n\naugment_cmo\nThe augment_cmo function calculates the Chande Momentum Oscillator (CMO) for a given financial\n\n\naugment_roc\nAdds rate of change (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_qsmomentum\nThe function augment_qsmomentum calculates Quant Science Momentum for financial data." }, { - "objectID": "reference/glimpse.html#examples", - "href": "reference/glimpse.html#examples", - "title": "glimpse", - "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('walmart_sales_weekly', parse_dates=['Date'])\n\ndf.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 1001 rows of 17 columns\nid: object ['1_1', '1_1', '1_1', '1_1', '1_1', '1_ ...\nStore: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nDept: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nDate: datetime64[ns] [Timestamp('2010-02-05 00:00:00'), Time ...\nWeekly_Sales: float64 [24924.5, 46039.49, 41595.55, 19403.54, ...\nIsHoliday: bool [False, True, False, False, False, Fals ...\nType: object ['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A' ...\nSize: int64 [151315, 151315, 151315, 151315, 151315 ...\nTemperature: float64 [42.31, 38.51, 39.93, 46.63, 46.5, 57.7 ...\nFuel_Price: float64 [2.572, 2.548, 2.514, 2.561, 2.625, 2.6 ...\nMarkDown1: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nMarkDown2: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nMarkDown3: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nMarkDown4: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nMarkDown5: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nCPI: float64 [211.0963582, 211.2421698, 211.2891429, ...\nUnemployment: float64 [8.106, 8.106, 8.106, 8.106, 8.106, 8.1 ..." + "objectID": "reference/index.html#finance-module-volatility-indicators", + "href": "reference/index.html#finance-module-volatility-indicators", + "title": "Function reference", + "section": "", + "text": "Volatility indicators for financial time series data.\n\n\n\naugment_bbands\nThe augment_bbands function is used to calculate Bollinger Bands for a given dataset and return\n\n\naugment_atr\nThe augment_atr function is used to calculate Average True Range (ATR) and" }, { - "objectID": "reference/plot_correlation_funnel.html", - "href": "reference/plot_correlation_funnel.html", - "title": "plot_correlation_funnel", + "objectID": "reference/index.html#time-series-for-pandas-series", + "href": "reference/index.html#time-series-for-pandas-series", + "title": "Function reference", "section": "", - "text": "plot_correlation_funnel(data, limits=(-1, 1), alpha=1.0, title='Correlation Funnel Plot', x_lab='Correlation', y_lab='Feature', base_size=11, width=None, height=None, engine='plotly')\nThe plot_correlation_funnel function generates a correlation funnel plot using either Plotly or plotnine in Python." 
+ "text": "Time series functions that generate / manipulate Pandas Series.\n\n\n\nmake_future_timeseries\nMake future dates for a time series.\n\n\nmake_weekday_sequence\nGenerate a sequence of weekday dates within a specified date range,\n\n\nmake_weekend_sequence\nGenerate a sequence of weekend dates within a specified date range,\n\n\nget_date_summary\nReturns a summary of the date-related information, including the number of\n\n\nget_frequency_summary\nMore robust version of pandas inferred frequency.\n\n\nget_diff_summary\nCalculates summary statistics of the time differences between consecutive values in a datetime index.\n\n\nget_frequency\nGet the frequency of a pandas Series or DatetimeIndex.\n\n\nget_seasonal_frequency\nThe get_seasonal_frequency function returns the seasonal period of a given\n\n\nget_trend_frequency\nThe get_trend_frequency function returns the trend period of a given time\n\n\nget_timeseries_signature\nConvert a timestamp to a set of 29 time series features.\n\n\nget_holiday_signature\nEngineers 4 different holiday features from a single datetime for 137 countries" }, { - "objectID": "reference/plot_correlation_funnel.html#parameters", - "href": "reference/plot_correlation_funnel.html#parameters", - "title": "plot_correlation_funnel", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame\nThe data parameter is a pandas DataFrame that contains the correlation values and corresponding features. It should have two columns: ‘correlation’ and ‘feature’.\nrequired\n\n\nlimits\ntuple\nThe limits parameter is a tuple that specifies the lower and upper limits of the x-axis in the correlation funnel plot. By default, the limits are set to (-1, 1), which means the x-axis will range from -1 to 1.\n(-1, 1)\n\n\nalpha\nfloat\nThe alpha parameter determines the transparency of the data points in the plot. A value of 1.0 means the points are fully opaque, while a value less than 1.0 makes the points more transparent.\n1.0\n\n\ntitle\nstr\nThe title of the plot.\n'Correlation Funnel Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis of the plot. It represents the label for the correlation values.\n'Correlation'\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis in the correlation funnel plot. It represents the name or description of the feature being plotted.\n'Feature'\n\n\nbase_size\nfloat\nThe base_size parameter is used to set the base font size for the plot. It is multiplied by different factors to determine the font sizes for various elements of the plot, such as the title, axis labels, tick labels, legend, and annotations.\n11\n\n\nwidth\nOptional[int]\nThe width parameter is used to specify the width of the plot in pixels. It determines the horizontal size of the plot.\nNone\n\n\nheight\nOptional[int]\nThe height parameter is used to specify the height of the plot in pixels. It determines the vertical size of the plot when it is rendered.\nNone\n\n\nengine\nstr\nThe engine parameter determines the plotting engine to be used. It can be set to either “plotly” or “plotnine”. If set to “plotly”, the function will generate an interactive plot using the Plotly library. If set to “plotnine”, it will generate a static plot using the plotnine library. 
The default value is “plotly”.\n'plotly'" + "objectID": "reference/index.html#date-utilities", + "href": "reference/index.html#date-utilities", + "title": "Function reference", + "section": "", + "text": "Helper functions to make your life easier.\n\n\n\nfloor_date\nRobust date flooring.\n\n\nceil_date\nRobust date ceiling.\n\n\nis_holiday\nCheck if a given list of dates are holidays for a specified country.\n\n\nweek_of_month\nThe “week_of_month” function calculates the week number of a given date\n\n\ntimeseries_unit_frequency_table\nThe function timeseries_unit_frequency_table returns a pandas DataFrame\n\n\ntime_scale_template\nThe function time_scale_template returns a table with time scale" }, { - "objectID": "reference/plot_correlation_funnel.html#returns", - "href": "reference/plot_correlation_funnel.html#returns", - "title": "plot_correlation_funnel", - "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function plot_correlation_funnel returns a plotly figure object if the engine parameter is\nset to ‘plotly’, and a plotnine object if the engine parameter is set to ‘plotnine’." + "objectID": "reference/index.html#visualization-utilities", + "href": "reference/index.html#visualization-utilities", + "title": "Function reference", + "section": "", + "text": "Helper functions to make your life easier.\n\n\n\ntheme_timetk\nReturns a plotnine theme with timetk styles applied, allowing for\n\n\npalette_timetk\nThe function palette_timetk returns a dictionary of color codes for" }, { - "objectID": "reference/plot_correlation_funnel.html#see-also", - "href": "reference/plot_correlation_funnel.html#see-also", - "title": "plot_correlation_funnel", - "section": "See Also", - "text": "See Also\n\nbinarize(): Binarize the dataset into 1’s and 0’s.\ncorrelate(): Calculate the correlation between features in a pandas DataFrame." 
+ "objectID": "reference/index.html#extra-pandas-helpers-that-help-beyond-just-time-series", + "href": "reference/index.html#extra-pandas-helpers-that-help-beyond-just-time-series", + "title": "Function reference", + "section": "", + "text": "glimpse\nTakes a pandas DataFrame and prints a summary of its dimensions, column\n\n\nparallel_apply\nThe parallel_apply function parallelizes the application of a function on\n\n\nprogress_apply\nAdds a progress bar to pandas apply().\n\n\ndrop_zero_variance\nThe function drop_zero_variance takes a pandas DataFrame as input and returns a new DataFrame with\n\n\ntransform_columns\nThe function transform_columns applies a user-provided function to specified columns in a pandas DataFrame.\n\n\nflatten_multiindex_column_names\nTakes a DataFrame as input and flattens the column" }, { - "objectID": "reference/plot_correlation_funnel.html#examples", - "href": "reference/plot_correlation_funnel.html#examples", - "title": "plot_correlation_funnel", - "section": "Examples", - "text": "Examples\n\n# NON-TIMESERIES EXAMPLE ----\n\nimport pandas as pd\nimport numpy as np\nimport pytimetk as tk\n\n# Set a random seed for reproducibility\nnp.random.seed(0)\n\n# Define the number of rows for your DataFrame\nnum_rows = 200\n\n# Create fake data for the columns\ndata = {\n 'Age': np.random.randint(18, 65, size=num_rows),\n 'Gender': np.random.choice(['Male', 'Female'], size=num_rows),\n 'Marital_Status': np.random.choice(['Single', 'Married', 'Divorced'], size=num_rows),\n 'City': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'], size=num_rows),\n 'Years_Playing': np.random.randint(0, 30, size=num_rows),\n 'Average_Income': np.random.randint(20000, 100000, size=num_rows),\n 'Member_Status': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], size=num_rows),\n 'Number_Children': np.random.randint(0, 5, size=num_rows),\n 'Own_House_Flag': np.random.choice([True, False], size=num_rows),\n 'Own_Car_Count': np.random.randint(0, 3, size=num_rows),\n 'PersonId': range(1, num_rows + 1), # Add a PersonId column as a row count\n 'Client': np.random.choice(['A', 'B'], size=num_rows) # Add a Client column with random values 'A' or 'B'\n}\n\n# Create a DataFrame\ndf = pd.DataFrame(data)\n\n# Binarize the data\ndf_binarized = df.binarize(n_bins=4, thresh_infreq=0.01, name_infreq=\"-OTHER\", one_hot=True)\n\ndf_binarized.glimpse() \n\n[]\n<class 'pandas.core.frame.DataFrame'>: 200 rows of 42 columns\nAge__18.0_29.5: uint8 [0, 1, 1, 1, 0, 1, 0 ...\nAge__29.5_41.0: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nAge__41.0_52.5: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nAge__52.5_64.0: uint8 [1, 0, 0, 0, 1, 0, 0 ...\nYears_Playing__0.0_7.2: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nYears_Playing__7.2_14.5: uint8 [0, 0, 1, 0, 1, 0, 1 ...\nYears_Playing__14.5_21.8: uint8 [1, 0, 0, 0, 0, 1, 0 ...\nYears_Playing__21.8_29.0: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nAverage_Income__20131.0_39881.0: uint8 [0, 0, 1, 0, 0, 0, 0 ...\nAverage_Income__39881.0_59631.0: uint8 [0, 0, 0, 1, 1, 0, 1 ...\nAverage_Income__59631.0_79381.0: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nAverage_Income__79381.0_99210.0: uint8 [1, 0, 0, 0, 0, 1, 0 ...\nPersonId__1.0_50.8: uint8 [1, 1, 1, 1, 1, 1, 1 ...\nPersonId__50.8_100.5: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nPersonId__100.5_150.2: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nPersonId__150.2_200.2: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nGender__Female: uint8 [1, 0, 0, 0, 1, 0, 1 ...\nGender__Male: uint8 [0, 1, 1, 1, 0, 1, 0 ...\nMarital_Status__Divorced: uint8 [0, 0, 0, 0, 0, 0, 0 
...\nMarital_Status__Married: uint8 [1, 1, 0, 0, 1, 0, 0 ...\nMarital_Status__Single: uint8 [0, 0, 1, 1, 0, 1, 1 ...\nCity__Chicago: uint8 [0, 0, 1, 0, 0, 1, 0 ...\nCity__Houston: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nCity__Los Angeles: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nCity__Miami: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nCity__New York: uint8 [1, 0, 0, 1, 1, 0, 0 ...\nMember_Status__Bronze: uint8 [1, 0, 1, 0, 0, 0, 0 ...\nMember_Status__Gold: uint8 [0, 0, 0, 0, 0, 1, 1 ...\nMember_Status__Platinum: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nMember_Status__Silver: uint8 [0, 1, 0, 0, 1, 0, 0 ...\nNumber_Children__0: uint8 [0, 0, 1, 0, 0, 0, 0 ...\nNumber_Children__1: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nNumber_Children__2: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nNumber_Children__3: uint8 [0, 1, 0, 0, 0, 1, 0 ...\nNumber_Children__4: uint8 [1, 0, 0, 0, 1, 0, 0 ...\nOwn_House_Flag__0: uint8 [1, 1, 0, 0, 1, 0, 1 ...\nOwn_House_Flag__1: uint8 [0, 0, 1, 1, 0, 1, 0 ...\nOwn_Car_Count__0: uint8 [0, 1, 0, 0, 1, 0, 0 ...\nOwn_Car_Count__1: uint8 [0, 0, 0, 1, 0, 1, 1 ...\nOwn_Car_Count__2: uint8 [1, 0, 1, 0, 0, 0, 0 ...\nClient__A: uint8 [1, 1, 1, 1, 1, 1, 1 ...\nClient__B: uint8 [0, 0, 0, 0, 0, 0, 0 ...\n\n\n\ndf_correlated = df_binarized.correlate(target='Member_Status__Platinum')\ndf_correlated.head(10)\n\n\n\n\n\n\n\n\nfeature\nbin\ncorrelation\n\n\n\n\n28\nMember_Status\nPlatinum\n1.000000\n\n\n26\nMember_Status\nBronze\n-0.341351\n\n\n29\nMember_Status\nSilver\n-0.332799\n\n\n27\nMember_Status\nGold\n-0.298637\n\n\n30\nNumber_Children\n0\n0.205230\n\n\n8\nAverage_Income\n20131.0_39881.0\n-0.151215\n\n\n0\nAge\n18.0_29.5\n-0.135522\n\n\n11\nAverage_Income\n79381.0_99210.0\n0.128508\n\n\n33\nNumber_Children\n3\n-0.112216\n\n\n9\nAverage_Income\n39881.0_59631.0\n0.109999\n\n\n\n\n\n\n\n\n# Interactive\ndf_correlated.plot_correlation_funnel(\n engine='plotly', \n height=600\n)\n\n\n \n\n\n\n# Static\ndf_correlated.plot_correlation_funnel(\n engine ='plotnine', \n height = 900\n)\n\n\n\n\n<Figure Size: (700 x 900)>" + "objectID": "reference/index.html#datasets", + "href": "reference/index.html#datasets", + "title": "Function reference", + "section": "", + "text": "Practice pytimetk with 13 complementary time series datasets.\n\n\n\nget_available_datasets\nGet a list of 12 datasets that can be loaded with pytimetk.load_dataset.\n\n\nload_dataset\nLoad one of 12 Time Series Datasets." }, { - "objectID": "reference/get_seasonal_frequency.html", - "href": "reference/get_seasonal_frequency.html", - "title": "get_seasonal_frequency", + "objectID": "reference/get_pandas_frequency.html", + "href": "reference/get_pandas_frequency.html", + "title": "get_pandas_frequency", "section": "", - "text": "get_seasonal_frequency(idx, force_regular=False, numeric=False, engine='pandas')\nThe get_seasonal_frequency function returns the seasonal period of a given time series or datetime index." + "text": "get_pandas_frequency(idx, force_regular=False)\nGet the frequency of a pandas Series or DatetimeIndex.\nThe function get_pandas_frequency takes a Pandas Series or DatetimeIndex as input and returns the inferred frequency of the index, with an option to force regular frequency.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\npd.Series or pd.DatetimeIndex\nThe idx parameter can be either a pd.Series or a pd.DatetimeIndex. 
It represents the index or the time series data for which we want to determine the frequency.\nrequired\n\n\nforce_regular\nbool\nThe force_regular parameter is a boolean flag that determines whether to force the frequency to be regular. If set to True, the function will convert irregular frequencies to their regular counterparts. For example, if the inferred frequency is ‘B’ (business days), it will be converted to ‘D’ (calendar days). The default value is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe frequency of the given pandas series or datetime index." }, { - "objectID": "reference/get_seasonal_frequency.html#parameters", - "href": "reference/get_seasonal_frequency.html#parameters", - "title": "get_seasonal_frequency", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\nUnion[pd.Series, pd.DatetimeIndex]\nThe idx parameter can be either a pandas Series or a pandas DatetimeIndex. It represents the time index for which you want to calculate the seasonal frequency.\nrequired\n\n\nforce_regular\nbool\nforce_regular is a boolean parameter that determines whether to force the frequency to be regular. If set to True, the function will try to find a regular frequency even if the data is irregular. If set to False, the function will return the actual frequency of the data.\nFalse\n\n\nnumeric\nbool\nThe numeric parameter is a boolean flag that determines whether the output should be in numeric format or a string Pandas Frequency Alias. If numeric is set to True, the output will be a numeric representation of the seasonal period. If numeric is set to False (default), the output will\nFalse\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for generating a date summary. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for generating the time scale information.\n'pandas'" + "objectID": "reference/get_pandas_frequency.html#parameters", + "href": "reference/get_pandas_frequency.html#parameters", + "title": "get_pandas_frequency", + "section": "", + "text": "Name\nType\nDescription\nDefault\n\n\n\n\nidx\npd.Series or pd.DatetimeIndex\nThe idx parameter can be either a pd.Series or a pd.DatetimeIndex. It represents the index or the time series data for which we want to determine the frequency.\nrequired\n\n\nforce_regular\nbool\nThe force_regular parameter is a boolean flag that determines whether to force the frequency to be regular. If set to True, the function will convert irregular frequencies to their regular counterparts. For example, if the inferred frequency is ‘B’ (business days), it will be converted to ‘D’ (calendar days). The default value is False.\nFalse" }, { - "objectID": "reference/get_seasonal_frequency.html#returns", - "href": "reference/get_seasonal_frequency.html#returns", - "title": "get_seasonal_frequency", + "objectID": "reference/get_pandas_frequency.html#returns", + "href": "reference/get_pandas_frequency.html#returns", + "title": "get_pandas_frequency", + "section": "", + "text": "Type\nDescription\n\n\n\n\nstr\nThe frequency of the given pandas series or datetime index." + }, + { + "objectID": "reference/palette_timetk.html", + "href": "reference/palette_timetk.html", + "title": "palette_timetk", + "section": "", + "text": "palette_timetk()\nThe function palette_timetk returns a dictionary of color codes for various colors in the timetk theme." 
+ }, + { + "objectID": "reference/palette_timetk.html#returns", + "href": "reference/palette_timetk.html#returns", + "title": "palette_timetk", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function get_seasonal_frequency returns the seasonal period based\non the input index. If the index is a pd.DatetimeIndex, it is converted to a pd.Series with the name “idx”. The function then calculates the summary frequency of the index using the get_frequency_summary function. It determines the scale and unit of the frequency and adjusts the unit if the scale is" + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function palette_timetk returns a dictionary containing color\nnames as keys and their corresponding hexadecimal color codes as values:" }, { - "objectID": "reference/get_seasonal_frequency.html#examples", - "href": "reference/get_seasonal_frequency.html#examples", - "title": "get_seasonal_frequency", + "objectID": "reference/palette_timetk.html#examples", + "href": "reference/palette_timetk.html#examples", + "title": "palette_timetk", "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n\ndates = pd.date_range(start='2021-01-01', end='2024-01-01', freq='MS')\n\ntk.get_seasonal_frequency(dates)\n\n'1Y'" + "text": "Examples\n\nimport pytimetk as tk\n\ntk.palette_timetk()\n\n{'blue': '#2c3e50',\n 'red': '#e31a1c',\n 'green': '#18BC9C',\n 'yellow': '#CCBE93',\n 'steel_blue': '#a6cee3',\n 'navy_blue': '#1f78b4',\n 'light_green': '#b2df8a',\n 'pink': '#fb9a99',\n 'light_orange': '#fdbf6f',\n 'orange': '#ff7f00',\n 'light_purple': '#cab2d6',\n 'purple': '#6a3d9a'}" }, { - "objectID": "reference/plot_anomaly_decomp.html", - "href": "reference/plot_anomaly_decomp.html", - "title": "plot_anomalies_decomp", + "objectID": "reference/get_date_summary.html", + "href": "reference/get_date_summary.html", + "title": "get_date_summary", "section": "", - "text": "plot_anomalies_decomp(data, date_column, line_color='#2c3e50', line_size=None, line_type='solid', line_alpha=1.0, y_intercept=None, y_intercept_color='#2c3e50', x_intercept=None, x_intercept_color='#2c3e50', title='Anomaly Decomposition Plot', x_lab='', y_lab='', x_axis_date_labels='%b %Y', base_size=11, width=None, height=None, engine='plotly')\nThe plot_anomalies_decomp function takes in data from the anomalize() function, and returns a plot of the anomaly decomposition." + "text": "get_date_summary(idx, engine='pandas')\nReturns a summary of the date-related information, including the number of dates, the time zone, the start date, and the end date." }, { - "objectID": "reference/plot_anomaly_decomp.html#parameters", - "href": "reference/plot_anomaly_decomp.html#parameters", - "title": "plot_anomalies_decomp", + "objectID": "reference/get_date_summary.html#parameters", + "href": "reference/get_date_summary.html#parameters", + "title": "get_date_summary", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe input data for the plot from anomalize. It can be either a pandas DataFrame or a pandas DataFrameGroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data that contains the dates.\nrequired\n\n\nline_color\nstr\nThe color of the line in the plot. It is specified as a hexadecimal color code. 
The default value is “#2c3e50”.\n'#2c3e50'\n\n\nline_size\nOptional[float]\nThe line_size parameter determines the thickness of the lines in the plot. It is an optional parameter, so if you don’t specify a value, the default line size will be used.\nNone\n\n\nline_type\nstr\nThe line_type parameter specifies the type of line to be used in the plot. It can take the following values: - “solid” (default): a solid line - “dashed”: a dashed line\n'solid'\n\n\nline_alpha\nfloat\nThe line_alpha parameter controls the transparency of the lines in the plot. It accepts a float value between 0 and 1, where 0 means completely transparent and 1 means completely opaque.\n1.0\n\n\ny_intercept\nOptional[float]\nThe y_intercept parameter is an optional float value that specifies the y-coordinate of a horizontal line to be plotted on the graph. This line can be used to indicate a specific threshold or reference value. If not specified, no horizontal line will be plotted.\nNone\n\n\ny_intercept_color\nstr\nThe y_intercept_color parameter is used to specify the color of the y-intercept line on the plot. By default, it is set to \"#2c3e50\", which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\nx_intercept\nOptional[str]\nThe x_intercept parameter is used to specify the value on the x-axis where you want to draw a vertical line. This can be useful for highlighting a specific point or event in the data.\nNone\n\n\nx_intercept_color\nstr\nThe x_intercept_color parameter is used to specify the color of the vertical line representing the x-intercept on the plot. By default, it is set to “#2c3e50”, which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\ntitle\nstr\nThe title of the plot. It is set to “Anomaly Decomposition Plot” by default.\n'Anomaly Decomposition Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis of the plot. It is a string that represents the label text.\n''\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis of the plot. It is a string that represents the label text.\n''\n\n\nx_axis_date_labels\nstr\nThe x_axis_date_labels parameter is used to specify the format of the date labels on the x-axis of the plot. It accepts a string representing the format of the date labels. For example, “%b %Y” would display the month abbreviation and year (e.g., Jan 2019).\n'%b %Y'\n\n\nbase_size\nfloat\nThe base_size parameter determines the base font size for the plot. It is used to control the size of the text elements in the plot, such as axis labels, titles, and tick labels. The default value is 11, but you can adjust it to make the text larger or smaller\n11\n\n\nwidth\nOptional[int]\nThe width parameter determines the width of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with the default width.\nNone\n\n\nheight\nOptional[int]\nThe height parameter determines the height of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with a default height.\nNone\n\n\nengine\nstr\nThe engine parameter specifies the plotting engine to use. 
It can be set to either “plotly”, “plotnine”, or “matplotlib”.\n'plotly'" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\npd.Series or pd.DateTimeIndex\nThe parameter idx can be either a pandas Series or a pandas DateTimeIndex. It represents the dates or timestamps for which we want to generate a summary.\nrequired\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for generating a date summary. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for generating a date summary.\n'pandas'" }, { - "objectID": "reference/plot_anomaly_decomp.html#returns", - "href": "reference/plot_anomaly_decomp.html#returns", - "title": "plot_anomalies_decomp", + "objectID": "reference/get_date_summary.html#returns", + "href": "reference/get_date_summary.html#returns", + "title": "get_date_summary", "section": "Returns", - "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\nA plotly, plotnine, or matplotlib plot." - }, - { - "objectID": "reference/plot_anomaly_decomp.html#see-also", - "href": "reference/plot_anomaly_decomp.html#see-also", - "title": "plot_anomalies_decomp", - "section": "See Also", - "text": "See Also\n\nanomalize : Function that calculates the anomalies and formats the data for visualization.\nplot_anomalies : Function that plots the anomalies." + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nA pandas DataFrame with the following columns: - date_n: The number of dates in the index. - date_tz: The time zone of the dates in the index. - date_start: The first date in the index. - date_end: The last date in the index." }, { - "objectID": "reference/plot_anomaly_decomp.html#examples", - "href": "reference/plot_anomaly_decomp.html#examples", - "title": "plot_anomalies_decomp", + "objectID": "reference/get_date_summary.html#examples", + "href": "reference/get_date_summary.html#examples", + "title": "get_date_summary", "section": "Examples", - "text": "Examples\n\n# EXAMPLE 1: SINGLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Create a date range\ndate_rng = pd.date_range(start='2021-01-01', end='2024-01-01', freq='MS')\n\n# Generate some random data with a few outliers\nnp.random.seed(42)\ndata = np.random.randn(len(date_rng)) * 10 + 25 \ndata[3] = 100 # outlier\n\n# Create a DataFrame\ndf = pd.DataFrame(date_rng, columns=['date'])\ndf['value'] = data\n\n# Anomalize the data\nanomalize_df = tk.anomalize(\n df, \"date\", \"value\",\n method = \"twitter\", \n iqr_alpha = 0.10, \n clean_alpha = 0.75,\n clean = \"min_max\",\n verbose = True,\n)\n\n# Visualize the results\nanomalize_df.plot_anomalies_decomp(\"date\")\n\nUsing seasonal frequency of 12 observations\nUsing trend frequency of 37 observations\n\n\n\n \n\n\n\n# EXAMPLE 2: MULTIPLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset(\"walmart_sales_weekly\", parse_dates=[\"Date\"])[[\"id\", \"Date\", \"Weekly_Sales\"]]\n\nanomalize_df = (\n df\n .groupby('id') \n .anomalize(\n \"Date\", \"Weekly_Sales\", \n period = 52, \n trend = 52, \n threads = 1\n ) \n)\n\n# Visualize the decomposition results\n\n(\n anomalize_df\n .groupby(\"id\")\n .plot_anomalies_decomp(\n date_column = \"Date\",\n line_color = \"steelblue\",\n width = 1200,\n height = 800,\n x_axis_date_labels = \"%y\",\n engine = 'plotnine', \n )\n)\n\n\n\n\n\n\n\n<Figure Size: (1200 x 800)>" + "text": "Examples\n\nimport pytimetk as tk\nimport 
pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])\n\ntk.get_date_summary(df['order_date'], engine='pandas')\n\ntk.get_date_summary(df['order_date'], engine='polars')\n\n\n\n\n\n\n\n\ndate_n\ndate_tz\ndate_start\ndate_end\n\n\n\n\n0\n2466\nNone\n2011-01-07\n2011-12-28" }, { - "objectID": "reference/plot_anomalies_cleaned.html", - "href": "reference/plot_anomalies_cleaned.html", - "title": "plot_anomalies_cleaned", + "objectID": "reference/is_holiday.html", + "href": "reference/is_holiday.html", + "title": "is_holiday", "section": "", - "text": "plot_anomalies_cleaned(data, date_column, facet_ncol=1, line_color='#2c3e50', line_color_cleaned='#e31a1c', line_size=None, line_type='solid', line_alpha=1.0, y_intercept=None, y_intercept_color='#2c3e50', x_intercept=None, x_intercept_color='#2c3e50', title='Anomalies Cleaned Plot', x_lab='', y_lab='', x_axis_date_labels='%b %Y', base_size=11, width=None, height=None, engine='plotly')\nThe plot_anomalies_cleaned function takes in data from the anomalize() function, and returns a plot of the anomalies cleaned." + "text": "is_holiday(idx, country_name='UnitedStates', country=None, engine='pandas')\nCheck if a given list of dates are holidays for a specified country.\nNote: This function requires the holidays package to be installed." }, { - "objectID": "reference/plot_anomalies_cleaned.html#parameters", - "href": "reference/plot_anomalies_cleaned.html#parameters", - "title": "plot_anomalies_cleaned", + "objectID": "reference/is_holiday.html#parameters", + "href": "reference/is_holiday.html#parameters", + "title": "is_holiday", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe input data for the plot from anomalize. It can be either a pandas DataFrame or a pandas DataFrameGroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data that contains the dates.\nrequired\n\n\nfacet_ncol\nint\nThe number of columns in the facet grid. It is an optional parameter,\n1\n\n\nline_color\nstr\nThe color of the line in the plot. It is specified as a hexadecimal color code. The default value is “#2c3e50”.\n'#2c3e50'\n\n\nline_color_cleaned\nstr\nThe color of the line in the plot. It is specified as a hexadecimal or a matplotlib color name. The default value is “#e31a1c”.\n'#e31a1c'\n\n\nline_size\nOptional[float]\nThe line_size parameter determines the thickness of the lines in the plot. It is an optional parameter, so if you don’t specify a value, the default line size will be used.\nNone\n\n\nline_type\nstr\nThe line_type parameter specifies the type of line to be used in the plot. It can take the following values: - “solid” (default): a solid line - “dashed”: a dashed line\n'solid'\n\n\nline_alpha\nfloat\nThe line_alpha parameter controls the transparency of the lines in the plot. It accepts a float value between 0 and 1, where 0 means completely transparent and 1 means completely opaque.\n1.0\n\n\ny_intercept\nOptional[float]\nThe y_intercept parameter is an optional float value that specifies the y-coordinate of a horizontal line to be plotted on the graph. This line can be used to indicate a specific threshold or reference value. If not specified, no horizontal line will be plotted.\nNone\n\n\ny_intercept_color\nstr\nThe y_intercept_color parameter is used to specify the color of the y-intercept line on the plot. 
By default, it is set to \"#2c3e50\", which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\nx_intercept\nOptional[str]\nThe x_intercept parameter is used to specify the value on the x-axis where you want to draw a vertical line. This can be useful for highlighting a specific point or event in the data.\nNone\n\n\nx_intercept_color\nstr\nThe x_intercept_color parameter is used to specify the color of the vertical line representing the x-intercept on the plot. By default, it is set to “#2c3e50”, which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\ntitle\nstr\nThe title of the plot. It is set to “Anomalies Cleaned Plot” by default.\n'Anomalies Cleaned Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis of the plot. It is a string that represents the label text.\n''\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis of the plot. It is a string that represents the label text.\n''\n\n\nx_axis_date_labels\nstr\nThe x_axis_date_labels parameter is used to specify the format of the date labels on the x-axis of the plot. It accepts a string representing the format of the date labels. For example, “%b %Y” would display the month abbreviation and year (e.g., Jan 2019).\n'%b %Y'\n\n\nbase_size\nfloat\nThe base_size parameter determines the base font size for the plot. It is used to control the size of the text elements in the plot, such as axis labels, titles, and tick labels. The default value is 11, but you can adjust it to make the text larger or smaller\n11\n\n\nwidth\nOptional[int]\nThe width parameter determines the width of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with the default width.\nNone\n\n\nheight\nOptional[int]\nThe height parameter determines the height of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with a default height.\nNone\n\n\nengine\nstr\nThe engine parameter specifies the plotting engine to use. It can be set to either “plotly”, “plotnine”, or “matplotlib”.\n'plotly'" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\nUnion[str, datetime, List[Union[str, datetime]], pd.Series]\nThe dates to check for holiday status.\nrequired\n\n\ncountry_name\nstr\nThe name of the country for which to check the holiday status. Defaults to ‘UnitedStates’ if not specified.\n'UnitedStates'\n\n\ncountry\nstr\nAn alternative parameter to specify the country for holiday checking, overriding country_name.\nNone\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for generating the boolean series. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for generating a boolean of holidays or not holidays. This can be faster than using “pandas” for long series.\n'pandas'" }, { - "objectID": "reference/plot_anomalies_cleaned.html#returns", - "href": "reference/plot_anomalies_cleaned.html#returns", - "title": "plot_anomalies_cleaned", - "section": "Returns", - "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\nA plotly, plotnine, or matplotlib plot." 
+ "objectID": "reference/is_holiday.html#returns", + "href": "reference/is_holiday.html#returns", + "title": "is_holiday", + "section": "Returns:", + "text": "Returns:\npd.Series: Series containing True if the date is a holiday, False otherwise." }, { - "objectID": "reference/plot_anomalies_cleaned.html#see-also", - "href": "reference/plot_anomalies_cleaned.html#see-also", - "title": "plot_anomalies_cleaned", - "section": "See Also", - "text": "See Also\n\nanomalize : Function that calculates the anomalies and formats the data for visualization.\nplot_anomalies : Function that plots the anomalies." + "objectID": "reference/is_holiday.html#raises", + "href": "reference/is_holiday.html#raises", + "title": "is_holiday", + "section": "Raises:", + "text": "Raises:\nValueError: If the specified country is not found in the holidays package." }, { - "objectID": "reference/plot_anomalies_cleaned.html#examples", - "href": "reference/plot_anomalies_cleaned.html#examples", - "title": "plot_anomalies_cleaned", - "section": "Examples", - "text": "Examples\n\n# EXAMPLE 1: SINGLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Create a date range\ndate_rng = pd.date_range(start='2021-01-01', end='2024-01-01', freq='MS')\n\n# Generate some random data with a few outliers\nnp.random.seed(42)\ndata = np.random.randn(len(date_rng)) * 10 + 25 \ndata[3] = 100 # outlier\n\n# Create a DataFrame\ndf = pd.DataFrame(date_rng, columns=['date'])\ndf['value'] = data\n\n# Anomalize the data\nanomalize_df = tk.anomalize(\n df, \"date\", \"value\",\n method = \"twitter\", \n iqr_alpha = 0.10, \n clean_alpha = 0.75,\n clean = \"min_max\",\n verbose = True,\n)\n\n# Visualize the results\nanomalize_df.plot_anomalies_cleaned(\"date\")\n\nUsing seasonal frequency of 12 observations\nUsing trend frequency of 37 observations\n\n\n\n \n\n\n\n# EXAMPLE 2: MULTIPLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset(\"walmart_sales_weekly\", parse_dates=[\"Date\"])[[\"id\", \"Date\", \"Weekly_Sales\"]]\n\nanomalize_df = (\n df\n .groupby('id') \n .anomalize(\n \"Date\", \"Weekly_Sales\", \n period = 52, \n trend = 52, \n threads = 1\n ) \n)\n\n# Visualize the decomposition results\n\n(\n anomalize_df\n .groupby(\"id\")\n .plot_anomalies_cleaned(\n date_column = \"Date\",\n line_color = \"steelblue\",\n width = 600,\n height = 1000,\n x_axis_date_labels = \"%y\",\n engine = 'plotly', \n )\n)" + "objectID": "reference/is_holiday.html#examples", + "href": "reference/is_holiday.html#examples", + "title": "is_holiday", + "section": "Examples:", + "text": "Examples:\n\nimport polars as pl\nimport pytimetk as tk\n\ntk.is_holiday('2023-01-01', country_name='UnitedStates')\n\n0 True\nName: is_holiday, dtype: bool\n\n\n\n# List of dates\ntk.is_holiday(['2023-01-01', '2023-01-02', '2023-01-03'], country_name='UnitedStates')\n\n0 True\n1 True\n2 False\nName: is_holiday, dtype: bool\n\n\n\n# Polars Series\ntk.is_holiday(pl.Series(['2023-01-01', '2023-01-02', '2023-01-03']), country_name='UnitedStates')\n\n0 True\n1 True\n2 False\nName: is_holiday, dtype: bool" }, { - "objectID": "reference/pad_by_time.html", - "href": "reference/pad_by_time.html", - "title": "pad_by_time", + "objectID": "reference/get_available_datasets.html", + "href": "reference/get_available_datasets.html", + "title": "get_available_datasets", "section": "", - "text": "pad_by_time(data, date_column, freq='D', start_date=None, end_date=None)\nMake irregular time series regular by padding with missing dates.\nThe 
pad_by_time function inserts missing dates into a Pandas DataFrame or DataFrameGroupBy object, in the process making an irregularly spaced time series regularly spaced."
  },
  {
    "objectID": "reference/pad_by_time.html#parameters",
    "href": "reference/pad_by_time.html#parameters",
    "title": "pad_by_time",
    "section": "Parameters",
    "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nThe data parameter can be either a Pandas DataFrame or a Pandas DataFrameGroupBy object. It represents the data that you want to pad with missing dates.\nrequired\n\n\ndate_column\nstr\nThe date_column parameter is a string that specifies the name of the column in the DataFrame that contains the dates. This column will be used to determine the minimum and maximum dates in the DataFrame, and to generate the regular date range for padding.\nrequired\n\n\nfreq\nstr\nThe freq parameter specifies the frequency at which the missing timestamps should be generated. It accepts a string representing a pandas frequency alias. Some common frequency aliases include: - S: secondly frequency - min: minute frequency - H: hourly frequency - B: business day frequency - D: daily frequency - W: weekly frequency - M: month end frequency - MS: month start frequency - BMS: Business month start - Q: quarter end frequency - QS: quarter start frequency - Y: year end frequency - YS: year start frequency\n'D'\n\n\nstart_date\nstr\nSpecifies the start of the padded series. If None, it will use the lowest value of the input variable. In the case of groups, it will use the lowest value by group.\nNone\n\n\nend_date\nstr\nSpecifies the end of the padded series. If None, it will use the highest value of the input variable. In the case of groups, it will use the highest value by group.\nNone"
  },
  {
    "objectID": "reference/pad_by_time.html#returns",
    "href": "reference/pad_by_time.html#returns",
    "title": "pad_by_time",
    "section": "Returns",
    "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe function pad_by_time returns a Pandas DataFrame that has been padded with missing dates."
  },
  {
    "objectID": "reference/pad_by_time.html#notes",
    "href": "reference/pad_by_time.html#notes",
    "title": "pad_by_time",
    "section": "Notes",
    "text": "Notes"
  },
  {
    "objectID": "reference/pad_by_time.html#performance",
    "href": "reference/pad_by_time.html#performance",
    "title": "pad_by_time",
    "section": "Performance",
    "text": "Performance\nThis function uses a number of techniques to speed up computation for large datasets with many time series groups.\n\nWe use a vectorized approach to generate the Cartesian product of all unique group values and all dates in the date range.\nWe then merge this Cartesian product with the original data to introduce NaN values for missing rows. This approach is much faster than looping through each group and applying a function to each group.\n\nNote: There is no parallel processing since the vectorized approach is almost always faster."
+ "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist\nThe function get_available_datasets returns a sorted list of available dataset names from the pytimetk.datasets module." }, { - "objectID": "reference/pad_by_time.html#examples", - "href": "reference/pad_by_time.html#examples", - "title": "pad_by_time", + "objectID": "reference/get_available_datasets.html#examples", + "href": "reference/get_available_datasets.html#examples", + "title": "get_available_datasets", "section": "Examples", - "text": "Examples\n\nimport pandas as pd\nimport pytimetk as tk\n\ndf = tk.load_dataset('stocks_daily', parse_dates = ['date'])\ndf\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nMETA\n2013-01-02\n27.440001\n28.180000\n27.420000\n28.000000\n69846400\n28.000000\n\n\n1\nMETA\n2013-01-03\n27.879999\n28.469999\n27.590000\n27.770000\n63140600\n27.770000\n\n\n2\nMETA\n2013-01-04\n28.010000\n28.930000\n27.830000\n28.760000\n72715400\n28.760000\n\n\n3\nMETA\n2013-01-07\n28.690001\n29.790001\n28.650000\n29.420000\n83781800\n29.420000\n\n\n4\nMETA\n2013-01-08\n29.510000\n29.600000\n28.860001\n29.059999\n45871300\n29.059999\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16189\nGOOG\n2023-09-15\n138.800003\n139.360001\n137.179993\n138.300003\n48947600\n138.300003\n\n\n16190\nGOOG\n2023-09-18\n137.630005\n139.929993\n137.630005\n138.960007\n16233600\n138.960007\n\n\n16191\nGOOG\n2023-09-19\n138.250000\n139.175003\n137.500000\n138.830002\n15479100\n138.830002\n\n\n16192\nGOOG\n2023-09-20\n138.830002\n138.839996\n134.520004\n134.589996\n21473500\n134.589996\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n\n\n\n\n16194 rows × 8 columns\n\n\n\n\n# Pad Single Time Series: Fill missing dates\npadded_df = (\n df\n .query('symbol == \"AAPL\"')\n .pad_by_time(\n date_column = 'date',\n freq = 'D'\n )\n)\npadded_df \n\n\n\n\n\n\n\n\ndate\nsymbol\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\n2013-01-02\nAAPL\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\n\n\n1\n2013-01-03\nAAPL\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\n\n\n2\n2013-01-04\nAAPL\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\n\n\n3\n2013-01-05\nAAPL\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n4\n2013-01-06\nAAPL\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n3910\n2023-09-17\nAAPL\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n3911\n2023-09-18\nAAPL\n176.479996\n179.380005\n176.169998\n177.970001\n67257600.0\n177.970001\n\n\n3912\n2023-09-19\nAAPL\n177.520004\n179.630005\n177.130005\n179.070007\n51826900.0\n179.070007\n\n\n3913\n2023-09-20\nAAPL\n179.259995\n179.699997\n175.399994\n175.490005\n58436200.0\n175.490005\n\n\n3914\n2023-09-21\nAAPL\n174.550003\n176.300003\n173.860001\n173.929993\n63047900.0\n173.929993\n\n\n\n\n3915 rows × 8 columns\n\n\n\n\n# Pad by Group: Pad each group with missing dates\npadded_df = (\n df\n .groupby('symbol')\n .pad_by_time(\n date_column = 'date',\n freq = 'D'\n 
)\n)\npadded_df\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\n\n\n1\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\n\n\n2\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\n\n\n3\nAAPL\n2013-01-05\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n4\nAAPL\n2013-01-06\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n23485\nNVDA\n2023-09-17\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n23486\nNVDA\n2023-09-18\n427.480011\n442.420013\n420.000000\n439.660004\n50027100.0\n439.660004\n\n\n23487\nNVDA\n2023-09-19\n438.329987\n439.660004\n430.019989\n435.200012\n37306400.0\n435.200012\n\n\n23488\nNVDA\n2023-09-20\n436.000000\n439.029999\n422.230011\n422.390015\n36710800.0\n422.390015\n\n\n23489\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000.0\n410.170013\n\n\n\n\n23490 rows × 8 columns\n\n\n\n\n# Pad with end dates specified\npadded_df = (\n df\n .groupby('symbol')\n .pad_by_time(\n date_column = 'date',\n freq = 'D',\n start_date = '2013-01-01',\n end_date = '2023-09-22'\n )\n)\npadded_df.query('symbol == \"AAPL\"')\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nAAPL\n2013-01-01\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\n\n\n2\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\n\n\n3\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\n\n\n4\nAAPL\n2013-01-05\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n3912\nAAPL\n2023-09-18\n176.479996\n179.380005\n176.169998\n177.970001\n67257600.0\n177.970001\n\n\n3913\nAAPL\n2023-09-19\n177.520004\n179.630005\n177.130005\n179.070007\n51826900.0\n179.070007\n\n\n3914\nAAPL\n2023-09-20\n179.259995\n179.699997\n175.399994\n175.490005\n58436200.0\n175.490005\n\n\n3915\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900.0\n173.929993\n\n\n3916\nAAPL\n2023-09-22\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n\n\n3917 rows × 8 columns" + "text": "Examples\n\nimport pytimetk as tk\n\ntk.get_available_datasets()\n\n['bike_sales_sample',\n 'bike_sharing_daily',\n 'expedia',\n 'm4_daily',\n 'm4_hourly',\n 'm4_monthly',\n 'm4_quarterly',\n 'm4_weekly',\n 'm4_yearly',\n 'stocks_daily',\n 'taylor_30_min',\n 'walmart_sales_weekly',\n 'wikipedia_traffic_daily']" }, { - "objectID": "reference/time_scale_template.html", - "href": "reference/time_scale_template.html", - "title": "time_scale_template", + "objectID": "reference/augment_wavelet.html", + "href": "reference/augment_wavelet.html", + "title": "augment_wavelet", "section": "", - "text": "time_scale_template(wide_format=False, engine='pandas')\nThe function time_scale_template returns a table with time scale information in either wide or long format." + "text": "augment_wavelet(data, date_column, value_column, method, sample_rate, scales, reduce_memory=False)\nApply the Wavely transform to specified columns of a DataFrame or DataFrameGroupBy object.\nA wavelet transform is a mathematical tool used to decompose a signal or function into different frequency components and then study each component with a resolution matched to its scale. 
The wavelet transform uses wavelets, which are functions that are localized in both time and frequency.\nUses:"
  },
  {
    "objectID": "reference/augment_wavelet.html#parameters",
    "href": "reference/augment_wavelet.html#parameters",
    "title": "augment_wavelet",
    "section": "Parameters",
    "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nInput DataFrame or DataFrameGroupBy object with one or more columns of real-valued signals.\nrequired\n\n\nvalue_column\nstr or list\nList of column names in ‘data’ to which the wavelet transform will be applied.\nrequired\n\n\nsample_rate\nstr\nSampling rate of the input data. For time-series data, the sample rate (sample_rate) typically refers to the frequency at which data points are collected. For example, if your data has a 30-minute interval and you think of the data in terms of “samples per hour”, the sample rate would be: sample_rate = samples / hour = 1 / 0.5 = 2\nrequired\n\n\nscales\nstr or list\nArray of scales to use in the transform. The choice of scales in wavelet analysis determines which frequencies (or periodicities) in the data you want to analyze. In other words, the scales determine the “window size” or the “look-back period” the wavelet uses to analyze the data. Smaller scales: Correspond to analyzing high-frequency changes (short-term fluctuations) in the data. Larger scales: Correspond to analyzing low-frequency changes (long-term fluctuations) in the data. The specific values for scales depend on what frequencies or periodicities you expect in your data and wish to study. For instance, if you believe there are daily, weekly, and monthly patterns in your data, you’d choose scales that correspond to these periodicities given your sampling rate. For a daily pattern with data at 30-minute intervals: scales = 2 * 24 = 48 because there are 48 half hour intervals in a day For a weekly pattern with data at 30-minute intervals: scales = 48 * 7 = 336 because there are 336 half hour intervals in a week Recommendation: use a range of values to cover both short term and long term patterns, then adjust accordingly.\nrequired\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. 
Default is False.\nFalse" }, { - "objectID": "reference/time_scale_template.html#examples", - "href": "reference/time_scale_template.html#examples", - "title": "time_scale_template", - "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\n\ntk.time_scale_template()\n\n\n\n\n\n\n\n\nmedian_unit\nseasonal_period\ntrend_period\n\n\n\n\n0\nS\n1H\n12H\n\n\n1\nT\n1D\n14D\n\n\n2\nH\n1D\n1M\n\n\n3\nD\n1W\n3M\n\n\n4\nW\n1Q\n1Y\n\n\n5\nM\n1Y\n5Y\n\n\n6\nQ\n1Y\n10Y\n\n\n7\nY\n5Y\n30Y" - }, - { - "objectID": "reference/augment_diffs.html", - "href": "reference/augment_diffs.html", - "title": "augment_diffs", - "section": "", - "text": "augment_diffs(data, date_column, value_column, periods=1, normalize=False, reduce_memory=False, engine='pandas')\nAdds differences and percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\nThe augment_diffs function takes a Pandas DataFrame or GroupBy object, a date column, a value column or list of value columns, and a period or list of periods, and adds differenced versions of the value columns to the DataFrame." - }, - { - "objectID": "reference/augment_diffs.html#parameters", - "href": "reference/augment_diffs.html#parameters", - "title": "augment_diffs", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nThe data parameter is the input DataFrame or DataFrameGroupBy object that you want to add differenced columns to.\nrequired\n\n\ndate_column\nstr\nThe date_column parameter is a string that specifies the name of the column in the DataFrame that contains the dates. This column will be used to sort the data before adding the differenced values.\nrequired\n\n\nvalue_column\nstr or list\nThe value_column parameter is the column(s) in the DataFrame that you want to add differences values for. It can be either a single column name (string) or a list of column names.\nrequired\n\n\nperiods\nint or tuple or list\nThe periods parameter is an integer, tuple, or list that specifies the periods to shift values when differencing. - If it is an integer, the function will add that number of differences values for each column specified in the value_column parameter. - If it is a tuple, it will generate differences from the first to the second value (inclusive). - If it is a list, it will generate differences based on the values in the list.\n1\n\n\nnormalize\nbool\nThe normalize parameter is used to specify whether to normalize the differenced values as a percentage difference. Default is False.\nFalse\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is True.\nFalse\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for augmenting differences. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for augmenting diffs. 
This can be faster than using “pandas” for large datasets.\n'pandas'" + "objectID": "reference/augment_wavelet.html#returns", + "href": "reference/augment_wavelet.html#returns", + "title": "augment_wavelet", + "section": "Returns", + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nDataFrame with added columns for CWT coefficients for each scale, with a real and imaginary column added." }, { - "objectID": "reference/augment_diffs.html#returns", - "href": "reference/augment_diffs.html#returns", - "title": "augment_diffs", - "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nA Pandas DataFrame with differenced columns added to it." + "objectID": "reference/augment_wavelet.html#notes", + "href": "reference/augment_wavelet.html#notes", + "title": "augment_wavelet", + "section": "Notes", + "text": "Notes\nFor a detailed introduction to wavelet transforms, you can visit this website. https://ataspinar.com/2018/12/21/a-guide-for-using-the-wavelet-transform-in-machine-learning/\nThe Bump wavelet is a real-valued wavelet function, so its imaginary part is inherently zero.\nIn the continuous wavelet transform (CWT), the Morlet and Analytic Morlet wavelets are complex-valued, so their convolutions with the signal yield complex results (with both real and imaginary parts).\nWavelets, in general, are mathematical functions that can decompose a signal into its constituent parts at different scales. Different wavelet functions are suitable for different types of signals and analytical goals. Let’s look at the three wavelet methods:\n\nMorlet Wavelet:\nCharacteristics: Essentially a complex sinusoid modulated by a Gaussian window. It provides a good balance between time localization and frequency localization.\nWhen to use: When you want a good compromise between time and frequency localization. Particularly useful when you’re interested in sinusoidal components or oscillatory patterns of your data. Commonly used in time-frequency analysis because of its simplicity and effectiveness.\nBump Wavelet:\nCharacteristics: Has an oscillating behavior similar to the Morlet but has sharper time localization. Its frequency localization isn’t as sharp as its time localization.\nWhen to use: When you are more interested in precisely identifying when certain events or anomalies occur in your data. It can be especially useful for detecting sharp spikes or short-lived events in your signal.\nAnalytic Morlet Wavelet:\nCharacteristics: A variation of the Morlet wavelet that is designed to have no negative frequencies when transformed. This means it’s “analytic.” Offers slightly better frequency localization than the standard Morlet wavelet.\nWhen to use: When you’re interested in phase properties of your signal. Can be used when you need to avoid negative frequencies in your analysis, making it useful for certain types of signals, like analytic signals. Offers a cleaner spectrum in the frequency domain than the standard Morlet." 
},
  {
    "objectID": "reference/augment_wavelet.html#examples",
    "href": "reference/augment_wavelet.html#examples",
    "title": "augment_wavelet",
    "section": "Examples",
    "text": "Examples\n\n# Example 1: Using the pandas engine on a 
pandas groupby object\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.datasets.load_dataset('walmart_sales_weekly', parse_dates = ['Date'])\n\nwavelet_df = (\n df\n .groupby('id')\n .augment_wavelet(\n date_column = 'Date',\n value_column ='Weekly_Sales', \n scales = [15],\n sample_rate =1,\n method = 'bump'\n )\n )\nwavelet_df.head()\n\n\n\n\n\n\n\n\nid\nStore\nDept\nDate\nWeekly_Sales\nIsHoliday\nType\nSize\nTemperature\nFuel_Price\nMarkDown1\nMarkDown2\nMarkDown3\nMarkDown4\nMarkDown5\nCPI\nUnemployment\nbump_scale_15_real\nbump_scale_15_imag\n\n\n\n\n0\n1_1\n1\n1\n2010-02-05\n24924.50\nFalse\nA\n151315\n42.31\n2.572\nNaN\nNaN\nNaN\nNaN\nNaN\n211.096358\n8.106\n28340.714927\n0.0\n\n\n1\n1_1\n1\n1\n2010-02-12\n46039.49\nTrue\nA\n151315\n38.51\n2.548\nNaN\nNaN\nNaN\nNaN\nNaN\n211.242170\n8.106\n32377.869306\n0.0\n\n\n2\n1_1\n1\n1\n2010-02-19\n41595.55\nFalse\nA\n151315\n39.93\n2.514\nNaN\nNaN\nNaN\nNaN\nNaN\n211.289143\n8.106\n36178.125507\n0.0\n\n\n3\n1_1\n1\n1\n2010-02-26\n19403.54\nFalse\nA\n151315\n46.63\n2.561\nNaN\nNaN\nNaN\nNaN\nNaN\n211.319643\n8.106\n39635.989442\n0.0\n\n\n4\n1_1\n1\n1\n2010-03-05\n21827.90\nFalse\nA\n151315\n46.50\n2.625\nNaN\nNaN\nNaN\nNaN\nNaN\n211.350143\n8.106\n42668.587553\n0.0\n\n\n\n\n\n\n\n\n# Example 2: Using Pandas Engine on a pandas dataframe\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('taylor_30_min', parse_dates = ['date'])\n\nresult_df = (\n tk.augment_wavelet(\n df, \n date_column = 'date',\n value_column ='value', \n scales = [15],\n sample_rate =1000,\n method = 'morlet'\n )\n)\n\nresult_df\n\n\n\n\n\n\n\n\ndate\nvalue\nmorlet_scale_15_real\nmorlet_scale_15_imag\n\n\n\n\n0\n2000-06-05 00:00:00+00:00\n22262\n5.858392e+07\n1.247285e+07\n\n\n1\n2000-06-05 00:30:00+00:00\n21756\n5.860706e+07\n1.246976e+07\n\n\n2\n2000-06-05 01:00:00+00:00\n22247\n5.862956e+07\n1.246639e+07\n\n\n3\n2000-06-05 01:30:00+00:00\n22759\n5.865217e+07\n1.246305e+07\n\n\n4\n2000-06-05 02:00:00+00:00\n22549\n5.867501e+07\n1.245981e+07\n\n\n...\n...\n...\n...\n...\n\n\n4027\n2000-08-27 21:30:00+00:00\n27946\n5.712707e+07\n-1.215821e+07\n\n\n4028\n2000-08-27 22:00:00+00:00\n27133\n5.709846e+07\n-1.215851e+07\n\n\n4029\n2000-08-27 22:30:00+00:00\n25996\n5.706991e+07\n-1.215882e+07\n\n\n4030\n2000-08-27 23:00:00+00:00\n24610\n5.704229e+07\n-1.215955e+07\n\n\n4031\n2000-08-27 23:30:00+00:00\n23132\n5.701639e+07\n-1.216105e+07\n\n\n\n\n4032 rows × 4 columns" }, { - "objectID": "reference/plot_timeseries.html", - "href": "reference/plot_timeseries.html", - "title": "plot_timeseries", + "objectID": "reference/augment_cmo.html", + "href": "reference/augment_cmo.html", + "title": "augment_cmo", "section": "", - "text": "plot_timeseries(data, date_column, value_column, color_column=None, color_palette=None, facet_ncol=1, facet_nrow=None, facet_scales='free_y', facet_dir='h', line_color='#2c3e50', line_size=None, line_type='solid', line_alpha=1.0, y_intercept=None, y_intercept_color='#2c3e50', x_intercept=None, x_intercept_color='#2c3e50', smooth=True, smooth_color='#3366FF', smooth_frac=0.2, smooth_size=1.0, smooth_alpha=1.0, legend_show=True, title='Time Series Plot', x_lab='', y_lab='', color_lab='Legend', x_axis_date_labels='%b %Y', base_size=11, width=None, height=None, engine='plotly', plotly_dropdown=False, plotly_dropdown_x=0, plotly_dropdown_y=1)\nCreates time series plots using different plotting engines such as Plotnine, Matplotlib, and Plotly." 
+ "text": "augment_cmo(data, date_column, close_column, periods=14, reduce_memory=False, engine='pandas')\nThe augment_cmo function calculates the Chande Momentum Oscillator (CMO) for a given financial instrument using either pandas or polars engine, and returns the augmented DataFrame." }, { - "objectID": "reference/plot_timeseries.html#parameters", - "href": "reference/plot_timeseries.html#parameters", - "title": "plot_timeseries", + "objectID": "reference/augment_cmo.html#parameters", + "href": "reference/augment_cmo.html#parameters", + "title": "augment_cmo", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nThe input data for the plot. It can be either a Pandas DataFrame or a Pandas DataFrameGroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the DataFrame that contains the dates for the time series data.\nrequired\n\n\nvalue_column\nstr or list\nThe value_column parameter is used to specify the name of the column in the DataFrame that contains the values for the time series data. This column will be plotted on the y-axis of the time series plot. LONG-FORMAT PLOTTING: If the value_column parameter is a string, it will be treated as a single column name. To plot multiple time series, group the DataFrame first using pd.DataFrame.groupby(). WIDE-FORMAT PLOTTING: If the value_column parameter is a list, it will plotted as multiple time series (wide-format).\nrequired\n\n\ncolor_column\nstr\nThe color_column parameter is an optional parameter that specifies the column in the DataFrame that will be used to assign colors to the different time series. If this parameter is not provided, all time series will have the same color. LONG-FORMAT PLOTTING: The color_column parameter is a single column name. WIDE-FORMAT PLOTTING: The color_column parameter must be the same list as the value_column parameter to color the different time series when performing wide-format plotting.\nNone\n\n\ncolor_palette\nlist\nThe color_palette parameter is used to specify the colors to be used for the different time series. It accepts a list of color codes or names. If the color_column parameter is not provided, the tk.palette_timetk() color palette will be used.\nNone\n\n\nfacet_ncol\nint\nThe facet_ncol parameter determines the number of columns in the facet grid. It specifies how many subplots will be arranged horizontally in the plot.\n1\n\n\nfacet_nrow\nint\nThe facet_nrow parameter determines the number of rows in the facet grid. It specifies how many subplots will be arranged vertically in the grid.\nNone\n\n\nfacet_scales\nstr\nThe facet_scales parameter determines the scaling of the y-axis in the facetted plots. It can take the following values: - “free_y”: The y-axis scale will be free for each facet, but the x-axis scale will be fixed for all facets. This is the default value. - “free_x”: The y-axis scale will be free for each facet, but the x-axis scale will be fixed for all facets. - “free”: The y-axis scale will be free for each facet (subplot). This is the default value.\n'free_y'\n\n\nfacet_dir\nstr\nThe facet_dir parameter determines the direction in which the facets (subplots) are arranged. It can take two possible values: - “h”: The facets will be arranged horizontally (in rows). This is the default value. 
- “v”: The facets will be arranged vertically (in columns).\n'h'\n\n\nline_color\nstr\nThe line_color parameter is used to specify the color of the lines in the time series plot. It accepts a string value representing a color code or name. The default value is “#2c3e50”, which corresponds to a dark blue color.\n'#2c3e50'\n\n\nline_size\nfloat\nThe line_size parameter is used to specify the size of the lines in the time series plot. It determines the thickness of the lines.\nNone\n\n\nline_type\nstr\nThe line_type parameter is used to specify the type of line to be used in the time series plot.\n'solid'\n\n\nline_alpha\nfloat\nThe line_alpha parameter controls the transparency of the lines in the time series plot. It accepts a value between 0 and 1, where 0 means completely transparent (invisible) and 1 means completely opaque (solid).\n1.0\n\n\ny_intercept\nfloat\nThe y_intercept parameter is used to add a horizontal line to the plot at a specific y-value. It can be set to a numeric value to specify the y-value of the intercept. If set to None (default), no y-intercept line will be added to the plot\nNone\n\n\ny_intercept_color\nstr\nThe y_intercept_color parameter is used to specify the color of the y-intercept line in the plot. It accepts a string value representing a color code or name. The default value is “#2c3e50”, which corresponds to a dark blue color. You can change this value.\n'#2c3e50'\n\n\nx_intercept\nstr\nThe x_intercept parameter is used to add a vertical line at a specific x-axis value on the plot. It is used to highlight a specific point or event in the time series data. - By default, it is set to None, which means no vertical line will be added. - You can use a date string to specify the x-axis value of the intercept. For example, “2020-01-01” would add a vertical line at the beginning of the year 2020.\nNone\n\n\nx_intercept_color\nstr\nThe x_intercept_color parameter is used to specify the color of the vertical line that represents the x-intercept in the plot. By default, it is set to “#2c3e50”, which is a dark blue color. You can change this value to any valid color code.\n'#2c3e50'\n\n\nsmooth\nbool\nThe smooth parameter is a boolean indicating whether or not to apply smoothing to the time eries data. If set to True, the time series will be smoothed using the lowess algorithm. The default value is True.\nTrue\n\n\nsmooth_color\nstr\nThe smooth_color parameter is used to specify the color of the smoothed line in the time series plot. It accepts a string value representing a color code or name. The default value is #3366FF, which corresponds to a shade of blue. You can change this value to any valid color code.\n'#3366FF'\n\n\nsmooth_frac\nfloat\nThe smooth_frac parameter is used to control the fraction of data points used for smoothing the time series. It determines the degree of smoothing applied to the data. A smaller value of smooth_frac will result in more smoothing, while a larger value will result in less smoothing. The default value is 0.2.\n0.2\n\n\nsmooth_size\nfloat\nThe smooth_size parameter is used to specify the size of the line used to plot the smoothed values in the time series plot. It is a numeric value that controls the thickness of the line. A larger value will result in a thicker line, while a smaller value will result in a thinner line\n1.0\n\n\nsmooth_alpha\nfloat\nThe smooth_alpha parameter controls the transparency of the smoothed line in the plot. 
It accepts a value between 0 and 1, where 0 means completely transparent and 1 means completely opaque.\n1.0\n\n\nlegend_show\nbool\nThe legend_show parameter is a boolean indicating whether or not to show the legend in the plot. If set to True, the legend will be displayed. The default value is True.\nTrue\n\n\ntitle\nstr\nThe title of the plot.\n'Time Series Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis in the plot. It is a string that represents the label text.\n''\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis in the plot. It is a string that represents the label for the y-axis.\n''\n\n\ncolor_lab\nstr\nThe color_lab parameter is used to specify the label for the legend or color scale in the plot. It is used to provide a description of the colors used in the plot, typically when a color column is specified.\n'Legend'\n\n\nx_axis_date_labels\nstr\nThe x_axis_date_labels parameter is used to specify the format of the date labels on the x-axis of the plot. It accepts a string representing the format of the date labels. For example, “%b %Y” would display the month abbreviation and year (e.g., Jan 2020).\n'%b %Y'\n\n\nbase_size\nfloat\nThe base_size parameter is used to set the base font size for the plot. It determines the size of the text elements such as axis labels, titles, and legends.\n11\n\n\nwidth\nint\nThe width parameter is used to specify the width of the plot. It determines the horizontal size of the plot in pixels.\nNone\n\n\nheight\nint\nThe height parameter is used to specify the height of the plot in pixels. It determines the vertical size of the plot when it is rendered.\nNone\n\n\nengine\nstr\nThe engine parameter specifies the plotting library to use for creating the time series plot. It can take one of the following values: - “plotly” (interactive): Use the plotly library to create the plot. This is the default value. - “plotnine” (static): Use the plotnine library to create the plot. This is the default value. - “matplotlib” (static): Use the matplotlib library to create the plot.\n'plotly'\n\n\nplotly_dropdown\nbool\nFor analyzing many plots. When set to True and groups are provided, the function switches from faceting to create a dropdown menu to switch between different groups. Default: False.\nFalse\n\n\nplotly_dropdown_x\nfloat\nThe x-axis location of the dropdown. Default: 0.\n0\n\n\nplotly_dropdown_y\nfloat\nThe y-axis location of the dropdown. Default: 1.\n1" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe data parameter is the input data that can be either a pandas DataFrame or a pandas DataFrameGroupBy object. It contains the data on which the Chande Momentum Oscillator (CMO) will be calculated.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data that contains the dates or timestamps.\nrequired\n\n\nclose_column\nstr\nThe close_column parameter is used to specify the column in the input data that contain the values on which the CMO will be calculated.\nrequired\n\n\nperiods\nUnion[int, Tuple[int, int], List[int]]\nThe periods parameter in the augment_cmo function specifies the number of rolling periods over which the Chande Momentum Oscillator (CMO) is calculated. 
It can be provided as an integer, a tuple of two integers (start and end periods), or a list of integers.\n14\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is a boolean flag that indicates whether or not to reduce the memory usage of the data before performing the CMO calculation. If set to True, the function will attempt to reduce the memory usage of the input data. If set to False, the function will not attempt to reduce the memory usage of the input data.\nFalse\n\n\nengine\nstr\nThe engine parameter specifies the computation engine to use for calculating the Chande Momentum Oscillator (CMO). It can take two values: ‘pandas’ or ‘polars’.\n'pandas'" }, { - "objectID": "reference/plot_timeseries.html#returns", - "href": "reference/plot_timeseries.html#returns", - "title": "plot_timeseries", + "objectID": "reference/augment_cmo.html#returns", + "href": "reference/augment_cmo.html#returns", + "title": "augment_cmo", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function plot_timeseries returns a plot object, depending on the\nspecified engine parameter. - If engine is set to ‘plotnine’ or ‘matplotlib’, the function returns a plot object that can be further customized or displayed. - If engine is set to ‘plotly’, the function returns a plotly figure object." + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe function augment_cmo returns a pandas DataFrame that contains the augmented data with the Chande Momentum Oscillator (CMO) values added." }, { - "objectID": "reference/plot_timeseries.html#examples", - "href": "reference/plot_timeseries.html#examples", - "title": "plot_timeseries", + "objectID": "reference/augment_cmo.html#notes", + "href": "reference/augment_cmo.html#notes", + "title": "augment_cmo", + "section": "Notes", + "text": "Notes\nThe Chande Momentum Oscillator (CMO), developed by Tushar Chande, is a technical analysis tool used to gauge the momentum of a financial instrument. It is similar to other momentum indicators like the Relative Strength Index (RSI), but with some distinct characteristics. Here’s what the CMO tells us:\nMomentum of Price Movements:\nThe CMO measures the strength of trends in price movements. It calculates the difference between the sum of gains and losses over a specified period, normalized to oscillate between -100 and +100. Overbought and Oversold Conditions:\nValues close to +100 suggest overbought conditions, indicating that the price might be too high and could reverse. Conversely, values near -100 suggest oversold conditions, implying that the price might be too low and could rebound. Trend Strength:\nHigh absolute values (either positive or negative) indicate strong trends, while values near zero suggest a lack of trend or a weak trend. Divergences:\nDivergences between the CMO and price movements can be significant. For example, if the price is making new highs but the CMO is declining, it may indicate weakening momentum and a potential trend reversal. Crossing the Zero Line:\nWhen the CMO crosses above zero, it can be seen as a bullish signal, whereas a cross below zero can be interpreted as bearish. Customization:\nThe period over which the CMO is calculated can be adjusted. A shorter period makes the oscillator more sensitive to price changes, suitable for short-term trading. A longer period smooths out the oscillator for a longer-term perspective. 
It’s important to note that while the CMO can provide valuable insights into market momentum and potential price reversals, it is most effective when used in conjunction with other indicators and analysis methods. Like all technical indicators, the CMO should not be used in isolation but rather as part of a comprehensive trading strategy.\nReferences: 1. https://www.fmlabs.com/reference/default.htm?url=CMO.htm" + }, + { + "objectID": "reference/augment_cmo.html#examples", + "href": "reference/augment_cmo.html#examples", + "title": "augment_cmo", "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\n\ndf = tk.load_dataset('m4_monthly', parse_dates = ['date'])\n\n# Plotly Object: Single Time Series\nfig = (\n df\n .query('id == \"M750\"')\n .plot_timeseries(\n 'date', 'value', \n facet_ncol = 1,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly',\n )\n)\nfig\n\n\n \n\n\n\n# Plotly Object: Grouped Time Series (Facets)\nfig = (\n df\n .groupby('id')\n .plot_timeseries(\n 'date', 'value', \n facet_ncol = 2,\n facet_scales = \"free_y\",\n smooth_frac = 0.2,\n smooth_size = 2.0,\n y_intercept = None,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly',\n width = 600,\n height = 500,\n )\n)\nfig\n\n\n \n\n\n\n# Plotly Object: Grouped Time Series (Plotly Dropdown)\nfig = (\n df\n .groupby('id')\n .plot_timeseries(\n 'date', 'value', \n facet_scales = \"free_y\",\n smooth_frac = 0.2,\n smooth_size = 2.0,\n y_intercept = None,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly',\n width = 600,\n height = 500,\n plotly_dropdown = True, # Plotly Dropdown\n )\n)\nfig\n\n\n \n\n\n\n# Plotly Object: Color Column\nfig = (\n df\n .plot_timeseries(\n 'date', 'value', \n color_column = 'id',\n smooth = False,\n y_intercept = 0,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly',\n )\n)\nfig\n\n\n \n\n\n\n# Plotnine Object: Single Time Series\nfig = (\n df\n .query('id == \"M1\"')\n .plot_timeseries(\n 'date', 'value', \n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n)\nfig\n\n\n\n\n<Figure Size: (700 x 500)>\n\n\n\n# Plotnine Object: Grouped Time Series\nfig = (\n df\n .groupby('id')\n .plot_timeseries(\n 'date', 'value',\n facet_ncol = 2,\n facet_scales = \"free\",\n line_size = 0.35,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n)\nfig\n\n\n\n\n<Figure Size: (700 x 500)>\n\n\n\n# Plotnine Object: Color Column\nfig = (\n df\n .plot_timeseries(\n 'date', 'value', \n color_column = 'id',\n smooth = False,\n y_intercept = 0,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine',\n )\n)\nfig\n\n\n\n\n<Figure Size: (700 x 500)>\n\n\n\n# Matplotlib object (same as plotnine, but converted to matplotlib object)\nfig = (\n df\n .groupby('id')\n .plot_timeseries(\n 'date', 'value', \n color_column = 'id',\n facet_ncol = 2,\n x_axis_date_labels = \"%Y\",\n engine = 'matplotlib',\n )\n)\nfig" + "text": "Examples\n\nimport pandas as pd\nimport pytimetk as tk\n\ndf = tk.load_dataset('stocks_daily', parse_dates=['date'])\ndf\n\n# Example 1 - Calculate CMO for a single column\ncmo_df = (\n df\n .query(\"symbol == 'AAPL'\")\n .augment_cmo(\n date_column='date',\n close_column='adjusted',\n periods=[14, 28]\n 
)\n)\ncmo_df\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nadjusted_cmo_14\nadjusted_cmo_28\n\n\n\n\n5398\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000\n16.791180\nNaN\nNaN\n\n\n5399\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200\n16.579241\nNaN\nNaN\n\n\n5400\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600\n16.117437\nNaN\nNaN\n\n\n5401\nAAPL\n2013-01-07\n18.642857\n18.903570\n18.400000\n18.710714\n484156400\n16.022623\nNaN\nNaN\n\n\n5402\nAAPL\n2013-01-08\n18.900356\n18.996071\n18.616072\n18.761070\n458707200\n16.065746\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n8092\nAAPL\n2023-09-15\n176.479996\n176.500000\n173.820007\n175.009995\n109205100\n175.009995\n-11.097429\n-6.370009\n\n\n8093\nAAPL\n2023-09-18\n176.479996\n179.380005\n176.169998\n177.970001\n67257600\n177.970001\n-6.564165\n-2.713367\n\n\n8094\nAAPL\n2023-09-19\n177.520004\n179.630005\n177.130005\n179.070007\n51826900\n179.070007\n-16.295529\n1.931561\n\n\n8095\nAAPL\n2023-09-20\n179.259995\n179.699997\n175.399994\n175.490005\n58436200\n175.490005\n-39.175190\n-3.650570\n\n\n8096\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900\n173.929993\n-43.051227\n-6.140628\n\n\n\n\n2699 rows × 10 columns\n\n\n\n\n# Example 2 - Calculate CMO for multiple groups\ncmo_df = (\n df\n .groupby('symbol')\n .augment_cmo(\n date_column='date',\n close_column='adjusted',\n periods=[14, 28]\n )\n)\ncmo_df.groupby('symbol').tail(1)\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nadjusted_cmo_14\nadjusted_cmo_28\n\n\n\n\n2698\nMETA\n2023-09-21\n295.700012\n300.260010\n293.269989\n295.730011\n21300500\n295.730011\n-0.277495\n-4.703549\n\n\n5397\nAMZN\n2023-09-21\n131.940002\n132.240005\n129.309998\n129.330002\n70234800\n129.330002\n-27.450935\n-16.697312\n\n\n8096\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900\n173.929993\n-43.051227\n-6.140628\n\n\n10795\nNFLX\n2023-09-21\n386.500000\n395.899994\n383.420013\n384.149994\n5547900\n384.149994\n-56.124625\n-19.430200\n\n\n13494\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000\n410.170013\n-83.624257\n0.671283\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n-33.482367\n2.917383\n\n\n\n\n\n\n\n\n# Example 3 - Calculate CMO for polars engine\ncmo_df = (\n df\n .query(\"symbol == 'AAPL'\")\n .augment_cmo(\n date_column='date',\n close_column='adjusted',\n periods=[14, 28],\n engine='polars'\n 
)\n)\ncmo_df\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nadjusted_cmo_14\nadjusted_cmo_28\n\n\n\n\n0\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000\n16.791180\nNaN\nNaN\n\n\n1\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200\n16.579241\nNaN\nNaN\n\n\n2\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600\n16.117437\nNaN\nNaN\n\n\n3\nAAPL\n2013-01-07\n18.642857\n18.903570\n18.400000\n18.710714\n484156400\n16.022623\nNaN\nNaN\n\n\n4\nAAPL\n2013-01-08\n18.900356\n18.996071\n18.616072\n18.761070\n458707200\n16.065746\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2694\nAAPL\n2023-09-15\n176.479996\n176.500000\n173.820007\n175.009995\n109205100\n175.009995\n-11.097429\n-6.370009\n\n\n2695\nAAPL\n2023-09-18\n176.479996\n179.380005\n176.169998\n177.970001\n67257600\n177.970001\n-6.564165\n-2.713367\n\n\n2696\nAAPL\n2023-09-19\n177.520004\n179.630005\n177.130005\n179.070007\n51826900\n179.070007\n-16.295529\n1.931561\n\n\n2697\nAAPL\n2023-09-20\n179.259995\n179.699997\n175.399994\n175.490005\n58436200\n175.490005\n-39.175190\n-3.650570\n\n\n2698\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900\n173.929993\n-43.051227\n-6.140628\n\n\n\n\n2699 rows × 10 columns\n\n\n\n\n# Example 4 - Calculate CMO for polars engine and groups\ncmo_df = (\n df\n .groupby('symbol')\n .augment_cmo(\n date_column='date',\n close_column='adjusted',\n periods=[14, 28],\n engine='polars'\n )\n)\ncmo_df.groupby('symbol').tail(1)\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nadjusted_cmo_14\nadjusted_cmo_28\n\n\n\n\n2698\nMETA\n2023-09-21\n295.700012\n300.260010\n293.269989\n295.730011\n21300500\n295.730011\n-0.277495\n-4.703549\n\n\n5397\nAMZN\n2023-09-21\n131.940002\n132.240005\n129.309998\n129.330002\n70234800\n129.330002\n-27.450935\n-16.697312\n\n\n8096\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900\n173.929993\n-43.051227\n-6.140628\n\n\n10795\nNFLX\n2023-09-21\n386.500000\n395.899994\n383.420013\n384.149994\n5547900\n384.149994\n-56.124625\n-19.430200\n\n\n13494\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000\n410.170013\n-83.624257\n0.671283\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n-33.482367\n2.917383" }, { - "objectID": "changelog-news.html", - "href": "changelog-news.html", - "title": "Changelog for Pytimetk", + "objectID": "reference/augment_expanding.html", + "href": "reference/augment_expanding.html", + "title": "augment_expanding", "section": "", - "text": "Integration with timebasedcv #291. New Classes:\n\nTimeSeriesCV(): An enhanced version of TimeBasedSplit() that defaults to mode = \"backwards\", allows for maximum splits using split_limit, and adds enhanced diagnostics like glimpse() and plot()\n\n\n\n\nA plotly dropdown automates the group-wise analysis. Instead of facets, which are only powerful for <=9 plots at a time, a dropdown can easily visualize more plots.\n\nplot_timeseries(): Gets new parameters plotly_dropdown, plotly_dropdown_x, plotly_dropdown_y #301\nplot_anomalies(): Gets new parameters plotly_dropdown, plotly_dropdown_x, plotly_dropdown_y #301\n\n\n\n\n\nplot_timeseries(value_column = list(), color_column=list()): Now supports multiple columns in wide format for grouped time series data visualization. 
#136" + "text": "augment_expanding(data, date_column, value_column, window_func='mean', min_periods=None, engine='pandas', threads=1, show_progress=True, reduce_memory=False, **kwargs)\nApply one or more Series-based expanding functions to one or more columns of a DataFrame." }, { - "objectID": "changelog-news.html#new-features", - "href": "changelog-news.html#new-features", - "title": "Changelog for Pytimetk", - "section": "", - "text": "Integration with timebasedcv #291. New Classes:\n\nTimeSeriesCV(): An enhanced version of TimeBasedSplit() that defaults to mode = \"backwards\", allows for maximum splits using split_limit, and adds enhanced diagnostics like glimpse() and plot()\n\n\n\n\nA plotly dropdown automates the group-wise analysis. Instead of facets, which are only powerful for <=9 plots at a time, a dropdown can easily visualize more plots.\n\nplot_timeseries(): Gets new parameters plotly_dropdown, plotly_dropdown_x, plotly_dropdown_y #301\nplot_anomalies(): Gets new parameters plotly_dropdown, plotly_dropdown_x, plotly_dropdown_y #301\n\n\n\n\n\nplot_timeseries(value_column = list(), color_column=list()): Now supports multiple columns in wide format for grouped time series data visualization. #136" + "objectID": "reference/augment_expanding.html#parameters", + "href": "reference/augment_expanding.html#parameters", + "title": "augment_expanding", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nInput data to be processed. Can be a Pandas DataFrame or a GroupBy object.\nrequired\n\n\ndate_column\nstr\nName of the datetime column. Data is sorted by this column within each group.\nrequired\n\n\nvalue_column\nUnion[str, list]\nColumn(s) to which the expanding window functions should be applied. Can be a single column name or a list.\nrequired\n\n\nwindow_func\nUnion[str, list, Tuple[str, Callable]]\nThe window_func parameter in the augment_expanding function specifies the function(s) to be applied to the expanding windows of the value column(s). 1. It can be either: - A string representing the name of a standard function (e.g., ‘mean’, ‘sum’). 2. For custom functions: - Provide a list of tuples. Each tuple should contain a custom name for the function and the function itself. - Each custom function should accept a Pandas Series as its input and operate on that series. Example: (“range”, lambda x: x.max() - x.min()) (See more Examples below.) Note: If your function needs to operate on multiple columns (i.e., it requires access to a DataFrame rather than just a Series), consider using the augment_expanding_apply function in this library.\n'mean'\n\n\nmin_periods\nint\nMinimum observations in the window to have a value. Defaults to the window size. If set, a value will be produced even if fewer observations are present than the window size.\nNone\n\n\nengine\nstr\nSpecifies the backend computation library for augmenting expanding window functions. The options are: - “pandas” (default): Uses the pandas library. - “polars”: Uses the polars library, which may offer performance benefits for larger datasets.\n'pandas'\n\n\nthreads\nint\nNumber of threads to use for parallel processing. If threads is set to 1, parallel processing will be disabled. 
Set to -1 to use all available CPU cores.\n1\n\n\nshow_progress\nbool\nIf True, a progress bar will be displayed during parallel processing.\nTrue\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is True.\nFalse\n\n\n**kwargs\nadditional keyword arguments\nAdditional arguments passed to the pandas.Series.expanding method when using the Pandas engine.\n{}" }, { - "objectID": "changelog-news.html#fixes", - "href": "changelog-news.html#fixes", - "title": "Changelog for Pytimetk", - "section": "Fixes:", - "text": "Fixes:\n\ntk.summarize_by_time(): AttributeError: ‘DataFrame’ object has no attribute ‘groupby’ #298" + "objectID": "reference/augment_expanding.html#returns", + "href": "reference/augment_expanding.html#returns", + "title": "augment_expanding", + "section": "Returns", + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe augment_expanding function returns a DataFrame with new columns for each applied function, window size, and value column." }, { - "objectID": "changelog-news.html#pandas-and-polars-compatibility", - "href": "changelog-news.html#pandas-and-polars-compatibility", - "title": "Changelog for Pytimetk", - "section": "Pandas and Polars Compatibility:", - "text": "Pandas and Polars Compatibility:\nUpgrading to:\n\npandas >= 2.0.0\npolars >= 1.2.0\n\nUse pytimetk <=0.4.0 to support:\n\npandas <2.0.0\npolars <1.0.0" + "objectID": "reference/augment_expanding.html#notes", + "href": "reference/augment_expanding.html#notes", + "title": "augment_expanding", + "section": "Notes", + "text": "Notes" }, { - "objectID": "changelog-news.html#improvements", - "href": "changelog-news.html#improvements", - "title": "Changelog for Pytimetk", - "section": "Improvements:", - "text": "Improvements:\n\nImplement sort_dataframe(): This function is used internally to make sure Polars and Pandas engines perform grouped operations consistently and correctly. #286 #290\n.augment_lags() and .augment_leads(): value_column now accepts any dtype. #295" + "objectID": "reference/augment_expanding.html#performance", + "href": "reference/augment_expanding.html#performance", + "title": "augment_expanding", + "section": "Performance", + "text": "Performance\n\nPolars Engine (3X faster than Pandas)\nIn most cases, the polars engine will be faster than the pandas engine. Speed tests indicate 3X or more.\n\n\nParallel Processing (Pandas Engine Only)\nThis function uses parallel processing to speed up computation for large datasets with many time series groups:\nParallel processing has overhead and may not be faster on small datasets.\nTo use parallel processing, set threads = -1 to use all available processors." 
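For intuition about what an expanding window computes, the built-in 'mean' function can be reproduced in plain pandas. This is a minimal sketch; the four values are the first D10 observations from the m4_daily example used on this page:

```python
import pandas as pd

# First few D10 values from the m4_daily example on this page
s = pd.Series([2076.2, 2073.4, 2048.7, 2048.9])

# An expanding window grows from the start of the series, so each
# result is the mean of every observation seen so far.
print(s.expanding(min_periods=1).mean())
# 0    2076.2
# 1    2074.8
# 2    2066.1
# 3    2061.8
```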
}, { - "objectID": "changelog-news.html#feature-engineering-module", - "href": "changelog-news.html#feature-engineering-module", - "title": "Changelog for Pytimetk", - "section": "Feature Engineering Module:", - "text": "Feature Engineering Module:\n\naugment_pct_change(): pandas and polars engines" + "objectID": "reference/augment_expanding.html#examples", + "href": "reference/augment_expanding.html#examples", + "title": "augment_expanding", + "section": "Examples", + "text": "Examples\n\n# Example 1 - Pandas Backend for Expanding Window Functions\n# This example demonstrates the use of string-named functions \n# on an expanding window using the Pandas backend for computations.\n \nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\ndf = tk.load_dataset(\"m4_daily\", parse_dates = ['date'])\n\nexpanded_df = (\n df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # Custom quantile function\n \n ],\n min_periods = 1,\n engine = 'pandas', # Utilize pandas for the underlying computations\n threads = 1, # Disable parallel processing\n show_progress = True, # Display a progress bar\n )\n)\ndisplay(expanded_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_expanding_mean\nvalue_expanding_std\nvalue_expanding_quantile_75\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2076.200000\nNaN\n2076.200\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.800000\n1.979899\n2075.500\n\n\n2\nD10\n2014-07-05\n2048.7\n2066.100000\n15.133737\n2074.800\n\n\n3\nD10\n2014-07-06\n2048.9\n2061.800000\n15.054789\n2074.100\n\n\n4\nD10\n2014-07-07\n2006.4\n2050.720000\n27.996732\n2073.400\n\n\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n8286.606679\n2456.960489\n9903.550\n\n\n9739\nD500\n2012-09-20\n9365.7\n8286.864035\n2456.723940\n9902.600\n\n\n9740\nD500\n2012-09-21\n9445.9\n8287.140391\n2456.496163\n9902.075\n\n\n9741\nD500\n2012-09-22\n9497.9\n8287.429011\n2456.274422\n9901.550\n\n\n9742\nD500\n2012-09-23\n9545.3\n8287.728789\n2456.058410\n9901.025\n\n\n\n\n9743 rows × 6 columns\n\n\n\n\n# Example 2 - Polars Backend for Expanding Window Functions using Built-Ins \n# (538X Faster than Pandas)\n# This example demonstrates the use of string-named functions and configurable \n# functions using the Polars backend for computations. 
Configurable functions, \n# like pl_quantile, allow the use of specific parameters associated with their \n# corresponding polars.Expr.rolling_<function_name> method.\n# For instance, pl_quantile corresponds to polars.Expr.rolling_quantile.\n \nimport pytimetk as tk\nimport pandas as pd\nimport polars as pl\nimport numpy as np\nfrom pytimetk.utils.polars_helpers import pl_quantile\nfrom pytimetk.utils.pandas_helpers import pd_quantile\n\ndf = tk.load_dataset(\"m4_daily\", parse_dates = ['date'])\n\nexpanded_df = (\n df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in std function\n ('quantile_75', pl_quantile(quantile=0.75)), # Configurable with all parameters found in polars.Expr.rolling_quantile\n ],\n min_periods = 1,\n engine = 'polars', # Utilize Polars for the underlying computations\n )\n)\ndisplay(expanded_df)\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_expanding_mean\nvalue_expanding_std\nvalue_expanding_quantile_75\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2076.200000\nNaN\n2076.2\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.800000\n1.979899\n2076.2\n\n\n2\nD10\n2014-07-05\n2048.7\n2066.100000\n15.133737\n2076.2\n\n\n3\nD10\n2014-07-06\n2048.9\n2061.800000\n15.054789\n2076.2\n\n\n4\nD10\n2014-07-07\n2006.4\n2050.720000\n27.996732\n2073.4\n\n\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n8286.606679\n2456.960489\n9906.4\n\n\n9739\nD500\n2012-09-20\n9365.7\n8286.864035\n2456.723940\n9902.6\n\n\n9740\nD500\n2012-09-21\n9445.9\n8287.140391\n2456.496163\n9902.6\n\n\n9741\nD500\n2012-09-22\n9497.9\n8287.429011\n2456.274422\n9902.6\n\n\n9742\nD500\n2012-09-23\n9545.3\n8287.728789\n2456.058410\n9902.6\n\n\n\n\n9743 rows × 6 columns\n\n\n\n\n# Example 3 - Lambda Functions for Expanding Window Functions are faster in Pandas than Polars\n# This example demonstrates the use of lambda functions of the form lambda x: x\n# Identity lambda functions, while convenient, have signficantly slower performance.\n# When using lambda functions the Pandas backend will likely be faster than Polars.\n\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\ndf = tk.load_dataset(\"m4_daily\", parse_dates = ['date'])\n\nexpanded_df = (\n df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n \n ('range', lambda x: x.max() - x.min()), # Identity lambda function: can be slower, especially in Polars\n ],\n min_periods = 1,\n engine = 'pandas', # Utilize pandas for the underlying computations\n )\n)\ndisplay(expanded_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_expanding_range\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n0.0\n\n\n1\nD10\n2014-07-04\n2073.4\n2.8\n\n\n2\nD10\n2014-07-05\n2048.7\n27.5\n\n\n3\nD10\n2014-07-06\n2048.9\n27.5\n\n\n4\nD10\n2014-07-07\n2006.4\n69.8\n\n\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n10782.0\n\n\n9739\nD500\n2012-09-20\n9365.7\n10782.0\n\n\n9740\nD500\n2012-09-21\n9445.9\n10782.0\n\n\n9741\nD500\n2012-09-22\n9497.9\n10782.0\n\n\n9742\nD500\n2012-09-23\n9545.3\n10782.0\n\n\n\n\n9743 rows × 4 columns" }, { - "objectID": "changelog-news.html#finance-module-updates", - "href": "changelog-news.html#finance-module-updates", - "title": "Changelog for Pytimetk", - "section": "Finance Module Updates:", - "text": "Finance Module Updates:\n\naugment_macd(): MACD, pandas and polars engines\naugment_bbands(): Bollinger Bands, pandas and polars engines\naugment_atr(): Average True 
Range, pandas and polars engines\naugment_ppo(): Percentage Price Oscillator, pandas and polars engines\naugment_rsi(): Relative Strength Index, pandas and polars engines\naugment_qsmomentum(): Quant Science Momentum Indicator, pandas and polars engines\naugment_roc(): Rate of Change (ROC), pandas and polars engines" + "objectID": "reference/ceil_date.html", + "href": "reference/ceil_date.html", + "title": "ceil_date", + "section": "", + "text": "ceil_date(idx, unit='D')\nRobust date ceiling.\nThe ceil_date function takes a pandas Series of dates and returns a new Series with the dates rounded up to the next specified unit. It’s more robust than the pandas ceil function, which does weird things with irregular frequencies like Month which are actually regular." }, { - "objectID": "changelog-news.html#polars-upgrades", - "href": "changelog-news.html#polars-upgrades", - "title": "Changelog for Pytimetk", - "section": "Polars Upgrades", - "text": "Polars Upgrades\n\nMigrate to polars 0.20.7" + "objectID": "reference/ceil_date.html#parameters", + "href": "reference/ceil_date.html#parameters", + "title": "ceil_date", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\npd.Series or pd.DatetimeIndex\nThe idx parameter is a pandas Series or pandas DatetimeIndex object that contains datetime values. It represents the dates that you want to round down.\nrequired\n\n\nunit\nstr\nThe unit parameter in the ceil_date function is a string that specifies the time unit to which the dates in the idx series should be rounded down. It has a default value of “D”, which stands for day. Other possible values for the unit parameter could be\n'D'" }, { - "objectID": "tutorials/02_finance.html", - "href": "tutorials/02_finance.html", - "title": "Finance Analysis", - "section": "", - "text": "Timetk is designed to work with any time series domain. Arguably the most important is Finance. This tutorial showcases how you can perform Financial Investment and Stock Analysis at scale with pytimetk. This applied tutorial covers financial analysis with:\nLoad the following packages before proceeding with this tutorial.\nCode\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np" + "objectID": "reference/ceil_date.html#returns", + "href": "reference/ceil_date.html#returns", + "title": "ceil_date", + "section": "Returns", + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.Series\nThe ceil_date function returns a pandas Series object containing datetime64[ns] values." }, { - "objectID": "tutorials/02_finance.html#application-moving-averages-10-day-and-50-day", - "href": "tutorials/02_finance.html#application-moving-averages-10-day-and-50-day", - "title": "Finance Analysis", - "section": "3.1 Application: Moving Averages, 10-Day and 50-Day", - "text": "3.1 Application: Moving Averages, 10-Day and 50-Day\nThis code template can be used to make and visualize the 10-day and 50-Day moving average of a group of stock symbols. 
Click to expand the code.\n\nPlotlyPlotnine\n\n\n\n\nCode\n# Add 2 moving averages (10-day and 50-Day)\nsma_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'adjusted',\n window = [10, 50],\n window_func = ['mean'],\n center = False,\n threads = 1, # Change to -1 to use all available cores\n )\n\n# Visualize \n(sma_df \n\n # zoom in on dates\n .query('date >= \"2023-01-01\"') \n\n # Convert to long format\n .melt(\n id_vars = ['symbol', 'date'],\n value_vars = [\"adjusted\", \"adjusted_rolling_mean_win_10\", \"adjusted_rolling_mean_win_50\"]\n ) \n\n # Group on symbol and visualize\n .groupby(\"symbol\") \n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n color_column = 'variable',\n smooth = False, \n facet_ncol = 2,\n width = 900,\n height = 700,\n engine = \"plotly\"\n )\n)\n\n\n\n\n\n\n \n\n\n\n\n\n\nCode\n# Add 2 moving averages (10-day and 50-Day)\nsma_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'adjusted',\n window = [10, 50],\n window_func = ['mean'],\n center = False,\n threads = 1, # Change to -1 to use all available cores\n )\n\n# Visualize \n(sma_df \n\n # zoom in on dates\n .query('date >= \"2023-01-01\"') \n\n # Convert to long format\n .melt(\n id_vars = ['symbol', 'date'],\n value_vars = [\"adjusted\", \"adjusted_rolling_mean_win_10\", \"adjusted_rolling_mean_win_50\"]\n ) \n\n # Group on symbol and visualize\n .groupby(\"symbol\") \n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n color_column = 'variable',\n smooth = False, \n facet_ncol = 2,\n width = 900,\n height = 700,\n engine = \"plotnine\"\n )\n)\n\n\n\n\n\n\n\n\n<Figure Size: (900 x 700)>" + "objectID": "reference/ceil_date.html#examples", + "href": "reference/ceil_date.html#examples", + "title": "ceil_date", + "section": "Examples", + "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n\ndates = pd.date_range(\"2020-01-01\", \"2020-01-10\", freq=\"1H\")\ndates\n\nDatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 01:00:00',\n '2020-01-01 02:00:00', '2020-01-01 03:00:00',\n '2020-01-01 04:00:00', '2020-01-01 05:00:00',\n '2020-01-01 06:00:00', '2020-01-01 07:00:00',\n '2020-01-01 08:00:00', '2020-01-01 09:00:00',\n ...\n '2020-01-09 15:00:00', '2020-01-09 16:00:00',\n '2020-01-09 17:00:00', '2020-01-09 18:00:00',\n '2020-01-09 19:00:00', '2020-01-09 20:00:00',\n '2020-01-09 21:00:00', '2020-01-09 22:00:00',\n '2020-01-09 23:00:00', '2020-01-10 00:00:00'],\n dtype='datetime64[ns]', length=217, freq='H')\n\n\n\n# Pandas ceil fails on month\n# dates.ceil(\"M\") # ValueError: <MonthEnd> is a non-fixed frequency\n\n# Works on Month\ntk.ceil_date(dates, unit=\"M\")\n\n0 2020-02-01\n1 2020-02-01\n2 2020-02-01\n3 2020-02-01\n4 2020-02-01\n ... \n212 2020-02-01\n213 2020-02-01\n214 2020-02-01\n215 2020-02-01\n216 2020-02-01\nName: idx, Length: 217, dtype: datetime64[ns]" }, { - "objectID": "tutorials/02_finance.html#application-bollinger-bands", - "href": "tutorials/02_finance.html#application-bollinger-bands", - "title": "Finance Analysis", - "section": "3.2 Application: Bollinger Bands", - "text": "3.2 Application: Bollinger Bands\nBollinger Bands are a volatility indicator commonly used in financial trading. 
They consist of three lines:\n\nThe middle band, which is a simple moving average (usually over 20 periods).\nThe upper band, calculated as the middle band plus k times the standard deviation of the price (typically, k=2).\nThe lower band, calculated as the middle band minus k times the standard deviation of the price.\n\nHere’s how you can calculate and plot Bollinger Bands with pytimetk using this code template (click to expand):\n\nPlotlyPlotnine\n\n\n\n\nCode\n# Bollinger Bands\nbollinger_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'adjusted',\n window = 20,\n window_func = ['mean', 'std'],\n center = False\n ) \\\n .assign(\n upper_band = lambda x: x['adjusted_rolling_mean_win_20'] + 2*x['adjusted_rolling_std_win_20'],\n lower_band = lambda x: x['adjusted_rolling_mean_win_20'] - 2*x['adjusted_rolling_std_win_20']\n )\n\n\n# Visualize\n(bollinger_df\n\n # zoom in on dates\n .query('date >= \"2023-01-01\"') \n\n # Convert to long format\n .melt(\n id_vars = ['symbol', 'date'],\n value_vars = [\"adjusted\", \"adjusted_rolling_mean_win_20\", \"upper_band\", \"lower_band\"]\n ) \n\n # Group on symbol and visualize\n .groupby(\"symbol\") \n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n color_column = 'variable',\n # Adjust colors for Bollinger Bands\n color_palette =[\"#2C3E50\", \"#E31A1C\", '#18BC9C', '#18BC9C'],\n smooth = False, \n facet_ncol = 2,\n width = 900,\n height = 700,\n engine = \"plotly\" \n )\n)\n\n\n\n\n\n\n \n\n\n\n\n\n\nCode\n# Bollinger Bands\nbollinger_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'adjusted',\n window = 20,\n window_func = ['mean', 'std'],\n center = False\n ) \\\n .assign(\n upper_band = lambda x: x['adjusted_rolling_mean_win_20'] + 2*x['adjusted_rolling_std_win_20'],\n lower_band = lambda x: x['adjusted_rolling_mean_win_20'] - 2*x['adjusted_rolling_std_win_20']\n )\n\n\n# Visualize\n(bollinger_df\n\n # zoom in on dates\n .query('date >= \"2023-01-01\"') \n\n # Convert to long format\n .melt(\n id_vars = ['symbol', 'date'],\n value_vars = [\"adjusted\", \"adjusted_rolling_mean_win_20\", \"upper_band\", \"lower_band\"]\n ) \n\n # Group on symbol and visualize\n .groupby(\"symbol\") \n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n color_column = 'variable',\n # Adjust colors for Bollinger Bands\n color_palette =[\"#2C3E50\", \"#E31A1C\", '#18BC9C', '#18BC9C'],\n smooth = False, \n facet_ncol = 2,\n width = 900,\n height = 700,\n engine = \"plotnine\"\n )\n)\n\n\n\n\n\n\n\n\n<Figure Size: (900 x 700)>" + "objectID": "reference/augment_rolling_apply.html", + "href": "reference/augment_rolling_apply.html", + "title": "augment_rolling_apply", + "section": "", + "text": "augment_rolling_apply(data, date_column, window_func, window=2, min_periods=None, center=False, threads=1, show_progress=True)\nApply one or more DataFrame-based rolling functions and window sizes to one or more columns of a DataFrame." 
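To see the band arithmetic in isolation, here is a plain-pandas sketch for a single price series. It is illustrative only: `bollinger_sketch` is not a pytimetk function, and `window` / `k` correspond to the 20-period window and k=2 defaults described above:

```python
import pandas as pd

def bollinger_sketch(price: pd.Series, window: int = 20, k: float = 2.0) -> pd.DataFrame:
    # Middle band: simple moving average of the price
    mid = price.rolling(window).mean()
    # Band half-width: k rolling standard deviations of the price
    sd = price.rolling(window).std()
    return pd.DataFrame({
        "middle": mid,
        "upper": mid + k * sd,
        "lower": mid - k * sd,
    })
```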
}, { - "objectID": "tutorials/02_finance.html#returns-analysis-by-time", - "href": "tutorials/02_finance.html#returns-analysis-by-time", - "title": "Finance Analysis", - "section": "4.1 Returns Analysis By Time", - "text": "4.1 Returns Analysis By Time\n\n\n\n\n\n\nReturns are NOT static (so analyze them by time)\n\n\n\n\n\n\nWe can use rolling window calculations with tk.augment_rolling() to compute many rolling features at scale such as rolling mean, std, range (spread).\nWe can expand our tk.augment_rolling_apply() rolling calculations to Rolling Correlation and Rolling Regression (to make comparisons over time)\n\n\n\n\n\nApplication: Descriptive Statistic Analysis\nMany traders compute descriptive statistics like mean, median, mode, skewness, kurtosis, and standard deviation to understand the central tendency, spread, and shape of the return distribution.\n\n\nStep 1: Returns\nUse this code to get the pct_change() in wide format. Click expand to get the code.\n\n\nCode\nreturns_wide_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .pivot(index = 'date', columns = 'symbol', values = 'adjusted') \\\n .pct_change() \\\n .reset_index() \\\n [1:]\n\nreturns_wide_df\n\n\n\n\n\n\n\n\nsymbol\ndate\nAAPL\nAMZN\nGOOG\nMETA\nNFLX\nNVDA\n\n\n\n\n1\n2013-01-03\n-0.012622\n0.004547\n0.000581\n-0.008214\n0.049777\n0.000786\n\n\n2\n2013-01-04\n-0.027854\n0.002592\n0.019760\n0.035650\n-0.006315\n0.032993\n\n\n3\n2013-01-07\n-0.005883\n0.035925\n-0.004363\n0.022949\n0.033549\n-0.028897\n\n\n4\n2013-01-08\n0.002691\n-0.007748\n-0.001974\n-0.012237\n-0.020565\n-0.021926\n\n\n5\n2013-01-09\n-0.015629\n-0.000113\n0.006573\n0.052650\n-0.012865\n-0.022418\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2694\n2023-09-15\n-0.004154\n-0.029920\n-0.004964\n-0.036603\n-0.008864\n-0.036879\n\n\n2695\n2023-09-18\n0.016913\n-0.002920\n0.004772\n0.007459\n-0.006399\n0.001503\n\n\n2696\n2023-09-19\n0.006181\n-0.016788\n-0.000936\n0.008329\n0.004564\n-0.010144\n\n\n2697\n2023-09-20\n-0.019992\n-0.017002\n-0.030541\n-0.017701\n-0.024987\n-0.029435\n\n\n2698\n2023-09-21\n-0.008889\n-0.044053\n-0.023999\n-0.013148\n-0.005566\n-0.028931\n\n\n\n\n2698 rows × 7 columns\n\n\n\n\n\nStep 2: Descriptive Stats\nUse this code to get standard statistics with the describe() method. Click expand to get the code.\n\n\nCode\nreturns_wide_df.describe()\n\n\n\n\n\n\n\n\nsymbol\nAAPL\nAMZN\nGOOG\nMETA\nNFLX\nNVDA\n\n\n\n\ncount\n2698.000000\n2698.000000\n2698.000000\n2698.000000\n2698.000000\n2698.000000\n\n\nmean\n0.001030\n0.001068\n0.000885\n0.001170\n0.001689\n0.002229\n\n\nstd\n0.018036\n0.020621\n0.017267\n0.024291\n0.029683\n0.028320\n\n\nmin\n-0.128647\n-0.140494\n-0.111008\n-0.263901\n-0.351166\n-0.187559\n\n\n25%\n-0.007410\n-0.008635\n-0.006900\n-0.009610\n-0.012071\n-0.010938\n\n\n50%\n0.000892\n0.001050\n0.000700\n0.001051\n0.000544\n0.001918\n\n\n75%\n0.010324\n0.011363\n0.009053\n0.012580\n0.014678\n0.015202\n\n\nmax\n0.119808\n0.141311\n0.160524\n0.296115\n0.422235\n0.298067\n\n\n\n\n\n\n\n\n\nStep 3: Correlation\nAnd run a correlation with corr(). 
Click expand to get the code.\n\n\nCode\ncorr_table_df = returns_wide_df.drop('date', axis=1).corr()\ncorr_table_df\n\n\n\n\n\n\n\n\nsymbol\nAAPL\nAMZN\nGOOG\nMETA\nNFLX\nNVDA\n\n\nsymbol\n\n\n\n\n\n\n\n\n\n\nAAPL\n1.000000\n0.497906\n0.566452\n0.479787\n0.321694\n0.526508\n\n\nAMZN\n0.497906\n1.000000\n0.628103\n0.544481\n0.475078\n0.490234\n\n\nGOOG\n0.566452\n0.628103\n1.000000\n0.595728\n0.428470\n0.531382\n\n\nMETA\n0.479787\n0.544481\n0.595728\n1.000000\n0.407417\n0.450586\n\n\nNFLX\n0.321694\n0.475078\n0.428470\n0.407417\n1.000000\n0.380153\n\n\nNVDA\n0.526508\n0.490234\n0.531382\n0.450586\n0.380153\n1.000000\n\n\n\n\n\n\n\n\nThe problem is that the stock market is constantly changing. And these descriptive statistics aren’t representative of the most recent fluctuations. This is where pytimetk comes into play with rolling descriptive statistics.\n\n\n\nApplication: 90-Day Rolling Descriptive Statistics Analysis with tk.augment_rolling()\nLet’s compute and visualize the 90-day rolling statistics.\n\n\n\n\n\n\nGetting More Info: tk.augment_rolling()\n\n\n\n\n\n\nClick here to see our Augmenting Guide\nUse help(tk.augment_rolling) to review additional helpful documentation.\n\n\n\n\n\nStep 1: Long Format Pt.1\nUse this code to get the date melt() into long format. Click expand to get the code.\n\n\nCode\nreturns_long_df = returns_wide_df \\\n .melt(id_vars='date', value_name='returns') \n\nreturns_long_df\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\n\n\n\n\n0\n2013-01-03\nAAPL\n-0.012622\n\n\n1\n2013-01-04\nAAPL\n-0.027854\n\n\n2\n2013-01-07\nAAPL\n-0.005883\n\n\n3\n2013-01-08\nAAPL\n0.002691\n\n\n4\n2013-01-09\nAAPL\n-0.015629\n\n\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n\n\n\n\n16188 rows × 3 columns\n\n\n\n\n\nStep 2: Augment Rolling Statistic\nLet’s add multiple columns of rolling statistics. 
Click to expand the code.\n\n\nCode\nrolling_stats_df = returns_long_df \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'returns',\n window = [90],\n window_func = [\n 'mean', \n 'std', \n 'min',\n ('q25', lambda x: np.quantile(x, 0.25)),\n 'median',\n ('q75', lambda x: np.quantile(x, 0.75)),\n 'max'\n ],\n threads = 1 # Change to -1 to use all threads\n ) \\\n .dropna()\n\nrolling_stats_df\n\n\n\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\nreturns_rolling_mean_win_90\nreturns_rolling_std_win_90\nreturns_rolling_min_win_90\nreturns_rolling_q25_win_90\nreturns_rolling_median_win_90\nreturns_rolling_q75_win_90\nreturns_rolling_max_win_90\n\n\n\n\n89\n2013-05-13\nAAPL\n0.003908\n-0.001702\n0.022233\n-0.123558\n-0.010533\n-0.001776\n0.012187\n0.041509\n\n\n90\n2013-05-14\nAAPL\n-0.023926\n-0.001827\n0.022327\n-0.123558\n-0.010533\n-0.001776\n0.012187\n0.041509\n\n\n91\n2013-05-15\nAAPL\n-0.033817\n-0.001894\n0.022414\n-0.123558\n-0.010533\n-0.001776\n0.012187\n0.041509\n\n\n92\n2013-05-16\nAAPL\n0.013361\n-0.001680\n0.022467\n-0.123558\n-0.010533\n-0.001360\n0.013120\n0.041509\n\n\n93\n2013-05-17\nAAPL\n-0.003037\n-0.001743\n0.022462\n-0.123558\n-0.010533\n-0.001776\n0.013120\n0.041509\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n0.005159\n0.036070\n-0.056767\n-0.012587\n-0.000457\n0.018480\n0.243696\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n0.005396\n0.035974\n-0.056767\n-0.011117\n0.000177\n0.018480\n0.243696\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n0.005162\n0.036006\n-0.056767\n-0.011117\n-0.000457\n0.018480\n0.243696\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n0.004953\n0.036153\n-0.056767\n-0.012587\n-0.000457\n0.018480\n0.243696\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n0.004724\n0.036303\n-0.056767\n-0.013166\n-0.000457\n0.018480\n0.243696\n\n\n\n\n15654 rows × 10 columns\n\n\n\n\n\nStep 3: Long Format Pt.2\nFinally, we can .melt() each of the rolling statistics for a Long Format Analysis. 
Click to expand the code.\n\n\nCode\nrolling_stats_long_df = rolling_stats_df \\\n .melt(\n id_vars = [\"symbol\", \"date\"],\n var_name = \"statistic_type\"\n )\n\nrolling_stats_long_df\n\n\n\n\n\n\n\n\n\nsymbol\ndate\nstatistic_type\nvalue\n\n\n\n\n0\nAAPL\n2013-05-13\nreturns\n0.003908\n\n\n1\nAAPL\n2013-05-14\nreturns\n-0.023926\n\n\n2\nAAPL\n2013-05-15\nreturns\n-0.033817\n\n\n3\nAAPL\n2013-05-16\nreturns\n0.013361\n\n\n4\nAAPL\n2013-05-17\nreturns\n-0.003037\n\n\n...\n...\n...\n...\n...\n\n\n125227\nNVDA\n2023-09-15\nreturns_rolling_max_win_90\n0.243696\n\n\n125228\nNVDA\n2023-09-18\nreturns_rolling_max_win_90\n0.243696\n\n\n125229\nNVDA\n2023-09-19\nreturns_rolling_max_win_90\n0.243696\n\n\n125230\nNVDA\n2023-09-20\nreturns_rolling_max_win_90\n0.243696\n\n\n125231\nNVDA\n2023-09-21\nreturns_rolling_max_win_90\n0.243696\n\n\n\n\n125232 rows × 4 columns\n\n\n\nWith the data formatted properly we can evaluate the 90-Day Rolling Statistics using .plot_timeseries().\n\nPlotlyPlotnine\n\n\n\n\nCode\nrolling_stats_long_df \\\n .groupby(['symbol', 'statistic_type']) \\\n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n facet_ncol = 6,\n width = 1500,\n height = 1000,\n title = \"90-Day Rolling Statistics\"\n )\n\n\n\n \n\n\n\n\n\n\nCode\nrolling_stats_long_df \\\n .groupby(['symbol', 'statistic_type']) \\\n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n facet_ncol = 6,\n facet_dir = 'v',\n width = 1500,\n height = 1000,\n title = \"90-Day Rolling Statistics\",\n engine = \"plotnine\"\n )\n\n\n\n\n\n<Figure Size: (1500 x 1000)>" + "objectID": "reference/augment_rolling_apply.html#parameters", + "href": "reference/augment_rolling_apply.html#parameters", + "title": "augment_rolling_apply", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nInput data to be processed. Can be a Pandas DataFrame or a GroupBy object.\nrequired\n\n\ndate_column\nstr\nName of the datetime column. Data is sorted by this column within each group.\nrequired\n\n\nwindow_func\nUnion[Tuple[str, Callable], List[Tuple[str, Callable]]]\nThe window_func parameter in the augment_rolling_apply function specifies the function(s) that operate on a rolling window with the consideration of multiple columns. The specification can be: - A tuple where the first element is a string representing the function’s name and the second element is the callable function itself. - A list of such tuples for multiple functions. (See more Examples below.) Note: For functions targeting only a single value column without the need for contextual data from other columns, consider using the augment_rolling function in this library.\nrequired\n\n\nwindow\nUnion[int, tuple, list]\nSpecifies the size of the rolling windows. - An integer applies the same window size to all columns in value_column. - A tuple generates windows from the first to the second value (inclusive). - A list of integers designates multiple window sizes for each respective column.\n2\n\n\nmin_periods\nint\nMinimum observations in the window to have a value. Defaults to the window size. If set, a value will be produced even if fewer observations are present than the window size.\nNone\n\n\ncenter\nbool\nIf True, the rolling window will be centered on the current value. For even-sized windows, the window will be left-biased. 
Otherwise, it uses a trailing window.\nFalse\n\n\nthreads\nint\nNumber of threads to use for parallel processing. If threads is set to 1, parallel processing will be disabled. Set to -1 to use all available CPU cores.\n1\n\n\nshow_progress\nbool\nIf True, a progress bar will be displayed during parallel processing.\nTrue" }, { - "objectID": "tutorials/02_finance.html#about-rolling-correlation", - "href": "tutorials/02_finance.html#about-rolling-correlation", - "title": "Finance Analysis", - "section": "5.1 About: Rolling Correlation", - "text": "5.1 About: Rolling Correlation\nRolling correlation calculates the correlation between two time series over a rolling window of a specified size, moving one period at a time. In stock analysis, this is often used to assess:\n\nDiversification: Helps in identifying how different stocks move in relation to each other, aiding in the creation of a diversified portfolio.\nMarket Dependency: Measures how a particular stock or sector is correlated with a broader market index.\nRisk Management: Helps in identifying changes in correlation structures over time which is crucial for risk assessment and management.\n\nFor example, if the rolling correlation between two stocks starts increasing, it might suggest that they are being influenced by similar factors or market conditions." + "objectID": "reference/augment_rolling_apply.html#returns", + "href": "reference/augment_rolling_apply.html#returns", + "title": "augment_rolling_apply", + "section": "Returns", + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe augment_rolling function returns a DataFrame with new columns for each applied function, window size, and value column." }, { - "objectID": "tutorials/02_finance.html#application-rolling-correlation", - "href": "tutorials/02_finance.html#application-rolling-correlation", - "title": "Finance Analysis", - "section": "5.2 Application: Rolling Correlation", - "text": "5.2 Application: Rolling Correlation\nLet’s revisit the returns wide and long format. 
We can combine these two using the merge() method.\n\nStep 1: Create the return_combinations_long_df\nPerform data wrangling to get the pairwise combinations in long format:\n\nWe first .merge() to join the long returns with the wide returns by date.\nWe then .melt() to get the wide data into long format.\n\n\n\nCode\nreturn_combinations_long_df = returns_long_df \\\n .merge(returns_wide_df, how='left', on = 'date') \\\n .melt(\n id_vars = ['date', 'symbol', 'returns'],\n var_name = \"comp\",\n value_name = \"returns_comp\"\n )\nreturn_combinations_long_df\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\ncomp\nreturns_comp\n\n\n\n\n0\n2013-01-03\nAAPL\n-0.012622\nAAPL\n-0.012622\n\n\n1\n2013-01-04\nAAPL\n-0.027854\nAAPL\n-0.027854\n\n\n2\n2013-01-07\nAAPL\n-0.005883\nAAPL\n-0.005883\n\n\n3\n2013-01-08\nAAPL\n0.002691\nAAPL\n0.002691\n\n\n4\n2013-01-09\nAAPL\n-0.015629\nAAPL\n-0.015629\n\n\n...\n...\n...\n...\n...\n...\n\n\n97123\n2023-09-15\nNVDA\n-0.036879\nNVDA\n-0.036879\n\n\n97124\n2023-09-18\nNVDA\n0.001503\nNVDA\n0.001503\n\n\n97125\n2023-09-19\nNVDA\n-0.010144\nNVDA\n-0.010144\n\n\n97126\n2023-09-20\nNVDA\n-0.029435\nNVDA\n-0.029435\n\n\n97127\n2023-09-21\nNVDA\n-0.028931\nNVDA\n-0.028931\n\n\n\n\n97128 rows × 5 columns\n\n\n\n\n\nStep 2: Add Rolling Correlations with tk.augment_rolling_apply()\nNext, let’s add rolling correlations.\n\nWe first .groupby() on the combination of our target assets “symbol” and our comparison asset “comp”.\nThen we use a different function, tk.augment_rolling_apply().\n\n\n\n\n\n\n\ntk.augment_rolling() vs tk.augment_rolling_apply()\n\n\n\n\n\n\nFor the vast majority of operations, tk.augment_rolling() will suffice. It’s used on a single column where there is a simple rolling transformation applied to only the value_column.\nFor more complex cases where other columns beyond a value_column are needed (e.g. rolling correlations, rolling regressions), the tk.augment_rolling_apply() comes to the rescue.\ntk.augment_rolling_apply() exposes the group’s columns as a DataFrame to window function, thus allowing for multi-column analysis.\n\n\n\n\n\n\n\n\n\n\ntk.augment_rolling_apply() has no value_column\n\n\n\n\n\nThis is because the rolling apply passes a DataFrame containing all columns to the custom function. The custom function is then responsible for handling the columns internally. 
This is how you can select multiple columns to work with.\n\n\n\n\n\nCode\nreturn_corr_df = return_combinations_long_df \\\n .groupby([\"symbol\", \"comp\"]) \\\n .augment_rolling_apply(\n date_column = \"date\",\n window = 90,\n window_func=[('corr', lambda x: x['returns'].corr(x['returns_comp']))],\n threads = 1, # Change to -1 to use all available cores\n )\n\nreturn_corr_df\n\n\n\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\ncomp\nreturns_comp\nrolling_corr_win_90\n\n\n\n\n0\n2013-01-03\nAAPL\n-0.012622\nAAPL\n-0.012622\nNaN\n\n\n1\n2013-01-04\nAAPL\n-0.027854\nAAPL\n-0.027854\nNaN\n\n\n2\n2013-01-07\nAAPL\n-0.005883\nAAPL\n-0.005883\nNaN\n\n\n3\n2013-01-08\nAAPL\n0.002691\nAAPL\n0.002691\nNaN\n\n\n4\n2013-01-09\nAAPL\n-0.015629\nAAPL\n-0.015629\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n\n\n97123\n2023-09-15\nNVDA\n-0.036879\nNVDA\n-0.036879\n1.0\n\n\n97124\n2023-09-18\nNVDA\n0.001503\nNVDA\n0.001503\n1.0\n\n\n97125\n2023-09-19\nNVDA\n-0.010144\nNVDA\n-0.010144\n1.0\n\n\n97126\n2023-09-20\nNVDA\n-0.029435\nNVDA\n-0.029435\n1.0\n\n\n97127\n2023-09-21\nNVDA\n-0.028931\nNVDA\n-0.028931\n1.0\n\n\n\n\n97128 rows × 6 columns\n\n\n\n\n\nStep 3: Visualize the Rolling Correlation\nWe can use tk.plot_timeseries() to visualize the 90-day rolling correlation. It’s interesting to see that stock combinations such as AAPL | AMZN returns have a high positive correlation of 0.80, but this relationship was much lower 0.25 before 2015.\n\nThe blue smoother can help us detect trends\nThe y_intercept is useful in this case to draw lines at -1, 0, and 1\n\n\nPlotlyPlotnine\n\n\n\n\nCode\nreturn_corr_df \\\n .dropna() \\\n .groupby(['symbol', 'comp']) \\\n .plot_timeseries(\n date_column = \"date\",\n value_column = \"rolling_corr_win_90\",\n facet_ncol = 6,\n y_intercept = [-1,0,1],\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1500,\n height = 1000,\n title = \"90-Day Rolling Correlation\",\n engine = \"plotly\"\n )\n\n\n\n \n\n\n\n\n\n\nCode\nreturn_corr_df \\\n .dropna() \\\n .groupby(['symbol', 'comp']) \\\n .plot_timeseries(\n date_column = \"date\",\n value_column = \"rolling_corr_win_90\",\n facet_ncol = 6,\n y_intercept = [-1,0,1],\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1500,\n height = 1000,\n title = \"90-Day Rolling Correlation\",\n engine = \"plotnine\"\n )\n\n\n\n\n\n<Figure Size: (1500 x 1000)>\n\n\n\n\n\nFor comparison, we can examine the corr_table_df from the Descriptive Statistics Analysis:\n\nNotice that the values tend not to match the most recent trends\nFor example APPL | AMZN is correlated at 0.49 over the entire time period. 
But more recently this correlation has dropped to 0.17 in the 90-Day Rolling Correlation chart.\n\n\n\nCode\ncorr_table_df\n\n\n\n\n\n\n\n\nsymbol\nAAPL\nAMZN\nGOOG\nMETA\nNFLX\nNVDA\n\n\nsymbol\n\n\n\n\n\n\n\n\n\n\nAAPL\n1.000000\n0.497906\n0.566452\n0.479787\n0.321694\n0.526508\n\n\nAMZN\n0.497906\n1.000000\n0.628103\n0.544481\n0.475078\n0.490234\n\n\nGOOG\n0.566452\n0.628103\n1.000000\n0.595728\n0.428470\n0.531382\n\n\nMETA\n0.479787\n0.544481\n0.595728\n1.000000\n0.407417\n0.450586\n\n\nNFLX\n0.321694\n0.475078\n0.428470\n0.407417\n1.000000\n0.380153\n\n\nNVDA\n0.526508\n0.490234\n0.531382\n0.450586\n0.380153\n1.000000" + "objectID": "reference/augment_rolling_apply.html#notes", + "href": "reference/augment_rolling_apply.html#notes", + "title": "augment_rolling_apply", + "section": "Notes", + "text": "Notes" }, { - "objectID": "tutorials/02_finance.html#about-rolling-regression", - "href": "tutorials/02_finance.html#about-rolling-regression", - "title": "Finance Analysis", - "section": "5.3 About: Rolling Regression", - "text": "5.3 About: Rolling Regression\nRolling regression involves running regression analyses over rolling windows of data points to assess the relationship between a dependent and one or more independent variables. In the context of stock analysis, it can be used to:\n\nBeta Estimation: It can be used to estimate the beta of a stock (a measure of market risk) against a market index over different time periods. A higher beta indicates higher market-related risk.\nMarket Timing: It can be useful in identifying changing relationships between stocks and market indicators, helping traders to adjust their positions accordingly.\nHedge Ratio Determination: It helps in determining the appropriate hedge ratios for pairs trading or other hedging strategies." + "objectID": "reference/augment_rolling_apply.html#performance", + "href": "reference/augment_rolling_apply.html#performance", + "title": "augment_rolling_apply", + "section": "Performance", + "text": "Performance\nThis function uses parallel processing to speed up computation for large datasets with many time series groups:\nParallel processing has overhead and may not be faster on small datasets.\nTo use parallel processing, set threads = -1 to use all available processors." }, { - "objectID": "tutorials/02_finance.html#application-90-day-rolling-regression", - "href": "tutorials/02_finance.html#application-90-day-rolling-regression", - "title": "Finance Analysis", - "section": "5.4 Application: 90-Day Rolling Regression", - "text": "5.4 Application: 90-Day Rolling Regression\n\n\n\n\n\n\nThis Application Requires Scikit Learn\n\n\n\n\n\nWe need to make a regression function that returns the Slope and Intercept. Scikit Learn has an easy-to-use modeling interface. 
You may need to pip install scikit-learn to use this applied tutorial.\n\n\n\n\nStep 1: Get Market Returns\nFor our purposes, we assume the market is the average returns of the 6 technology stocks.\n\nWe calculate an equal-weight portfolio as the “market returns”.\nThen we merge the market returns into the returns long data.\n\n\n\nCode\n# Assume Market Returns = Equal Weight Portfolio\nmarket_returns_df = returns_wide_df \\\n .set_index(\"date\") \\\n .assign(returns_market = lambda df: df.sum(axis = 1) * (1 / df.shape[1])) \\\n .reset_index() \\\n [['date', 'returns_market']]\n\n# Merge with returns long\nreturns_long_market_df = returns_long_df \\\n .merge(market_returns_df, how='left', on='date')\n\nreturns_long_market_df\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\nreturns_market\n\n\n\n\n0\n2013-01-03\nAAPL\n-0.012622\n0.005809\n\n\n1\n2013-01-04\nAAPL\n-0.027854\n0.009471\n\n\n2\n2013-01-07\nAAPL\n-0.005883\n0.008880\n\n\n3\n2013-01-08\nAAPL\n0.002691\n-0.010293\n\n\n4\n2013-01-09\nAAPL\n-0.015629\n0.001366\n\n\n...\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n-0.020231\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n0.003555\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n-0.001466\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n-0.023276\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n-0.020764\n\n\n\n\n16188 rows × 4 columns\n\n\n\n\n\nStep 2: Run a Rolling Regression\nNext, run the following code to perform a rolling regression:\n\nUse a custom regression function that will return the slope and intercept as a pandas series.\nRun the rolling regression with tk.augment_rolling_apply().\n\n\n\nCode\ndef regression(df):\n \n # External functions must \n from sklearn.linear_model import LinearRegression\n\n model = LinearRegression()\n X = df[['returns_market']] # Extract X values (independent variables)\n y = df['returns'] # Extract y values (dependent variable)\n model.fit(X, y)\n ret = pd.Series([model.intercept_, model.coef_[0]], index=['Intercept', 'Slope'])\n \n return ret # Return intercept and slope as a Series\n\nreturn_regression_df = returns_long_market_df \\\n .groupby('symbol') \\\n .augment_rolling_apply(\n date_column = \"date\",\n window = 90,\n window_func = [('regression', regression)],\n threads = 1, # Change to -1 to use all available cores \n ) \\\n .dropna()\n\nreturn_regression_df\n\n\n\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\nreturns_market\nrolling_regression_win_90\n\n\n\n\n89\n2013-05-13\nAAPL\n0.003908\n0.007082\nIntercept -0.001844 Slope 0.061629 dt...\n\n\n90\n2013-05-14\nAAPL\n-0.023926\n0.007583\nIntercept -0.001959 Slope 0.056540 dt...\n\n\n91\n2013-05-15\nAAPL\n-0.033817\n0.005381\nIntercept -0.002036 Slope 0.062330 dt...\n\n\n92\n2013-05-16\nAAPL\n0.013361\n-0.009586\nIntercept -0.001789 Slope 0.052348 dt...\n\n\n93\n2013-05-17\nAAPL\n-0.003037\n0.009005\nIntercept -0.001871 Slope 0.055661 dt...\n\n\n...\n...\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n-0.020231\nIntercept 0.000100 Slope 1.805479 dt...\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n0.003555\nIntercept 0.000207 Slope 1.800813 dt...\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n-0.001466\nIntercept 0.000301 Slope 1.817878 dt...\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n-0.023276\nIntercept 0.000845 Slope 1.825818 dt...\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n-0.020764\nIntercept 0.000901 Slope 1.818710 dt...\n\n\n\n\n15654 rows × 5 columns\n\n\n\n\n\nStep 3: Extract the Slope Coefficient (Beta)\nThis is more of a hack than anything to extract the beta (slope) of the rolling 
regression.\n\n\nCode\nintercept_slope_df = pd.concat(return_regression_df['rolling_regression_win_90'].to_list(), axis=1).T \n\nintercept_slope_df.index = return_regression_df.index\n\nreturn_beta_df = pd.concat([return_regression_df, intercept_slope_df], axis=1)\n\nreturn_beta_df\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\nreturns_market\nrolling_regression_win_90\nIntercept\nSlope\n\n\n\n\n89\n2013-05-13\nAAPL\n0.003908\n0.007082\nIntercept -0.001844 Slope 0.061629 dt...\n-0.001844\n0.061629\n\n\n90\n2013-05-14\nAAPL\n-0.023926\n0.007583\nIntercept -0.001959 Slope 0.056540 dt...\n-0.001959\n0.056540\n\n\n91\n2013-05-15\nAAPL\n-0.033817\n0.005381\nIntercept -0.002036 Slope 0.062330 dt...\n-0.002036\n0.062330\n\n\n92\n2013-05-16\nAAPL\n0.013361\n-0.009586\nIntercept -0.001789 Slope 0.052348 dt...\n-0.001789\n0.052348\n\n\n93\n2013-05-17\nAAPL\n-0.003037\n0.009005\nIntercept -0.001871 Slope 0.055661 dt...\n-0.001871\n0.055661\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n-0.020231\nIntercept 0.000100 Slope 1.805479 dt...\n0.000100\n1.805479\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n0.003555\nIntercept 0.000207 Slope 1.800813 dt...\n0.000207\n1.800813\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n-0.001466\nIntercept 0.000301 Slope 1.817878 dt...\n0.000301\n1.817878\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n-0.023276\nIntercept 0.000845 Slope 1.825818 dt...\n0.000845\n1.825818\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n-0.020764\nIntercept 0.000901 Slope 1.818710 dt...\n0.000901\n1.818710\n\n\n\n\n15654 rows × 7 columns\n\n\n\n\n\nStep 4: Visualize the Rolling Beta\n\nPlotlyPlotnine\n\n\n\n\nCode\nreturn_beta_df \\\n .groupby('symbol') \\\n .plot_timeseries(\n date_column = \"date\",\n value_column = \"Slope\",\n facet_ncol = 2,\n facet_scales = \"free_x\",\n y_intercept = [0, 3],\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n title = \"90-Day Rolling Regression\",\n engine = \"plotly\",\n )\n\n\n\n \n\n\n\n\n\n\nCode\nreturn_beta_df \\\n .groupby('symbol') \\\n .plot_timeseries(\n date_column = \"date\",\n value_column = \"Slope\",\n facet_ncol = 2,\n facet_scales = \"free_x\",\n y_intercept = [0, 3],\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n title = \"90-Day Rolling Regression\",\n engine = \"plotnine\",\n )\n\n\n\n\n\n<Figure Size: (800 x 600)>" + "objectID": "reference/augment_rolling_apply.html#examples", + "href": "reference/augment_rolling_apply.html#examples", + "title": "augment_rolling_apply", + "section": "Examples", + "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Example 1 - showcasing the rolling correlation between two columns \n# (`value1` and `value2`).\n# The correlation requires both columns as input.\n\n# Sample DataFrame with id, date, value1, and value2 columns.\ndf = pd.DataFrame({\n 'id': [1, 1, 1, 2, 2, 2],\n 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),\n 'value1': [10, 20, 29, 42, 53, 59],\n 'value2': [2, 16, 20, 40, 41, 50],\n})\n\n# Compute the rolling correlation for each group of 'id'\n# Using a rolling window of size 3 and a lambda function to calculate the \n# correlation.\n\nrolled_df = (\n df.groupby('id')\n .augment_rolling_apply(\n date_column='date',\n window=3,\n window_func=[('corr', lambda x: x['value1'].corr(x['value2']))], # Lambda function for correlation\n center = False, # Not centering the rolling window\n threads = 1 # 
Increase threads for parallel processing (use -1 for all cores)\n )\n)\ndisplay(rolled_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue1\nvalue2\nrolling_corr_win_3\n\n\n\n\n0\n1\n2023-01-01\n10\n2\nNaN\n\n\n1\n1\n2023-01-02\n20\n16\nNaN\n\n\n2\n1\n2023-01-03\n29\n20\n0.961054\n\n\n3\n2\n2023-01-04\n42\n40\nNaN\n\n\n4\n2\n2023-01-05\n53\n41\nNaN\n\n\n5\n2\n2023-01-06\n59\n50\n0.824831\n\n\n\n\n\n\n\n\n# Example 2 - Rolling Regression Example: Using `value1` as the dependent \n# variable and `value2` and `value3` as the independent variables. This \n# example demonstrates how to perform a rolling regression using two \n# independent variables.\n\n# Sample DataFrame with `id`, `date`, `value1`, `value2`, and `value3` columns.\ndf = pd.DataFrame({\n 'id': [1, 1, 1, 2, 2, 2],\n 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),\n 'value1': [10, 20, 29, 42, 53, 59],\n 'value2': [5, 16, 24, 35, 45, 58],\n 'value3': [2, 3, 6, 9, 10, 13]\n})\n\n# Define Regression Function to be applied on the rolling window.\ndef regression(df):\n\n # Required module (scikit-learn) for regression.\n # This import statement is required inside the function to avoid errors.\n from sklearn.linear_model import LinearRegression\n\n model = LinearRegression()\n X = df[['value2', 'value3']] # Independent variables\n y = df['value1'] # Dependent variable\n model.fit(X, y)\n ret = pd.Series([model.intercept_, model.coef_[0]], index=['Intercept', 'Slope'])\n \n return ret # Return intercept and slope as a Series\n \n# Compute the rolling regression for each group of `id`\n# Using a rolling window of size 3 and the regression function.\nrolled_df = (\n df.groupby('id')\n .augment_rolling_apply(\n date_column='date',\n window=3,\n window_func=[('regression', regression)]\n )\n .dropna()\n)\n\n# Format the results to have each regression output (slope and intercept) in \n# separate columns.\n\nregression_wide_df = pd.concat(rolled_df['rolling_regression_win_3'].to_list(), axis=1).T\n\nregression_wide_df = pd.concat([rolled_df.reset_index(drop = True), regression_wide_df], axis=1)\n\ndisplay(regression_wide_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue1\nvalue2\nvalue3\nrolling_regression_win_3\nIntercept\nSlope\n\n\n\n\n0\n1\n2023-01-03\n29\n24\n6\nIntercept 4.28 Slope 0.84 dtype: flo...\n4.280000\n0.840000\n\n\n1\n2\n2023-01-06\n59\n58\n13\nIntercept 30.352941 Slope 1.588235 ...\n30.352941\n1.588235" }, { - "objectID": "tutorials/03_demand_forecasting.html", - "href": "tutorials/03_demand_forecasting.html", - "title": "Demand Forecasting", + "objectID": "reference/augment_rolling.html", + "href": "reference/augment_rolling.html", + "title": "augment_rolling", "section": "", - "text": "Timetk enables you to generate features from the time column of your data very easily. This tutorial showcases how easy it is to perform time series forecasting with pytimetk. The specific methods we will be using are:" + "text": "augment_rolling(data, date_column, value_column, window_func='mean', window=2, min_periods=None, engine='pandas', center=False, threads=1, show_progress=True, reduce_memory=False, **kwargs)\nApply one or more Series-based rolling functions and window sizes to one or more columns of a DataFrame." 
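For a quick orientation before the full parameter reference below, here is a minimal sketch of a grouped `augment_rolling()` call. It is only a sketch: it assumes the `m4_daily` demo dataset that the Examples section loads with `tk.load_dataset()`, and the column names (`id`, `date`, `value`) come from that dataset; the call mirrors the built-in-function usage shown in those examples.

```python
import pytimetk as tk

# Load the m4_daily demo dataset (shipped with pytimetk)
df = tk.load_dataset("m4_daily", parse_dates=["date"])

# Add a 7-day rolling mean and standard deviation of 'value',
# computed independently within each 'id' group.
rolled_df = (
    df
    .groupby("id")
    .augment_rolling(
        date_column="date",            # data is sorted by this column within each group
        value_column="value",          # column(s) the rolling functions are applied to
        window=7,                      # a single integer window size
        window_func=["mean", "std"],   # built-in function names (preferred over lambdas)
        engine="pandas",               # default backend
    )
)

print(rolled_df.head())
```

Built-in function names avoid per-window Python callbacks and are generally the faster choice; lambdas, and `augment_rolling_apply()` above, are reserved for calculations that need access to more than one column at a time.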
}, { - "objectID": "tutorials/03_demand_forecasting.html#load-packages", - "href": "tutorials/03_demand_forecasting.html#load-packages", - "title": "Demand Forecasting", - "section": "1.1 Load Packages", - "text": "1.1 Load Packages\nLoad the following packages before proceeding with this tutorial.\n\n\nCode\nimport pandas as pd\nimport numpy as np\nimport pytimetk as tk\n\nfrom sklearn.ensemble import RandomForestRegressor\n\n\nThe tutorial is divided into three parts: We will first have a look at the Walmart dataset and perform some preprocessing. Secondly, we will create models based on different features, and see how the time features can be useful. Finally, we will solve the task of time series forecasting, using the features from augment_timeseries_signature, augment_lags, and augment_rolling, to predict future sales." + "objectID": "reference/augment_rolling.html#parameters", + "href": "reference/augment_rolling.html#parameters", + "title": "augment_rolling", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nInput data to be processed. Can be a Pandas DataFrame or a GroupBy object.\nrequired\n\n\ndate_column\nstr\nName of the datetime column. Data is sorted by this column within each group.\nrequired\n\n\nvalue_column\nUnion[str, list]\nColumn(s) to which the rolling window functions should be applied. Can be a single column name or a list.\nrequired\n\n\nwindow_func\nUnion[str, list, Tuple[str, Callable]]\nThe window_func parameter in the augment_rolling function specifies the function(s) to be applied to the rolling windows of the value column(s). 1. It can be either: - A string representing the name of a standard function (e.g., ‘mean’, ‘sum’). 2. For custom functions: - Provide a list of tuples. Each tuple should contain a custom name for the function and the function itself. - Each custom function should accept a Pandas Series as its input and operate on that series. Example: (“range”, lambda x: x.max() - x.min()) (See more Examples below.) Note: If your function needs to operate on multiple columns (i.e., it requires access to a DataFrame rather than just a Series), consider using the augment_rolling_apply function in this library.\n'mean'\n\n\nwindow\nUnion[int, tuple, list]\nSpecifies the size of the rolling windows. - An integer applies the same window size to all columns in value_column. - A tuple generates windows from the first to the second value (inclusive). - A list of integers designates multiple window sizes for each respective column.\n2\n\n\nmin_periods\nint\nMinimum observations in the window to have a value. Defaults to the window size. If set, a value will be produced even if fewer observations are present than the window size.\nNone\n\n\ncenter\nbool\nIf True, the rolling window will be centered on the current value. For even-sized windows, the window will be left-biased. Otherwise, it uses a trailing window.\nFalse\n\n\nthreads\nint\nNumber of threads to use for parallel processing. If threads is set to 1, parallel processing will be disabled. Set to -1 to use all available CPU cores.\n1\n\n\nshow_progress\nbool\nIf True, a progress bar will be displayed during parallel processing.\nTrue\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. 
This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is False.\nFalse\n\n\nengine\nstr\nSpecifies the backend computation library for augmenting expanding window functions. The options are: - “pandas” (default): Uses the pandas library. - “polars”: Uses the polars library, which may offer performance benefits for larger datasets.\n'pandas'" }, { - "objectID": "tutorials/03_demand_forecasting.html#load-inspect-dataset", - "href": "tutorials/03_demand_forecasting.html#load-inspect-dataset", - "title": "Demand Forecasting", - "section": "1.2 Load & Inspect dataset", - "text": "1.2 Load & Inspect dataset\nThe first thing we want to do is to load the dataset. It is a subset of the Walmart sales prediction Kaggle competition. You can get more insights about the dataset by following this link: walmart_sales_weekly. The most important thing to know about the dataset is that you are provided with some features like the fuel price or whether the week contains holidays and you are expected to predict the weekly sales column for 7 different departments of a given store. Of course, you also have the date for each week, and that is what we can leverage to create additional features.\nLet us start by loading the dataset and cleaning it. Note that we also removed some columns due to * duplication of data * 0 variance * No future data available in current dataset.\n\n\nCode\n# We start by loading the dataset\n# /walmart_sales_weekly.html\ndset = tk.load_dataset('walmart_sales_weekly', parse_dates = ['Date'])\n\ndset = dset.drop(columns=[\n 'id', # This column can be removed as it is equivalent to 'Dept'\n 'Store', # This column has only one possible value\n 'Type', # This column has only one possible value\n 'Size', # This column has only one possible value\n 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',\n 'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI',\n 'Unemployment'])\n\ndset.head()\n\n\n\n\n\n\n\n\n\nDept\nDate\nWeekly_Sales\n\n\n\n\n0\n1\n2010-02-05\n24924.50\n\n\n1\n1\n2010-02-12\n46039.49\n\n\n2\n1\n2010-02-19\n41595.55\n\n\n3\n1\n2010-02-26\n19403.54\n\n\n4\n1\n2010-03-05\n21827.90\n\n\n\n\n\n\n\nWe can plot the values of each department to get an idea of how the data looks like. Using the plot_timeseries method with a groupby allows us to create multiple plots by group.\n\n\n\n\n\n\nGetting More Info: tk.plot_timeseries()\n\n\n\n\n\n\nClick here to see our Data Visualization Guide\nUse help(tk.plot_timeseries) to review additional helpful documentation.\n\n\n\n\n\nPlotlyPlotnine\n\n\n\n\nCode\nsales_df = dset\nfig = sales_df.groupby('Dept').plot_timeseries(\n date_column='Date',\n value_column='Weekly_Sales',\n facet_ncol = 2,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly')\nfig\n\n\n\n \n\n\n\n\n\n\nCode\nfig = sales_df.groupby('Dept').plot_timeseries(\n date_column='Date',\n value_column='Weekly_Sales',\n facet_ncol = 2,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine')\nfig\n\n\n\n\n\n<Figure Size: (700 x 500)>" + "objectID": "reference/augment_rolling.html#returns", + "href": "reference/augment_rolling.html#returns", + "title": "augment_rolling", + "section": "Returns", + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe augment_rolling function returns a DataFrame with new columns for each applied function, window size, and value column." 
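To make the `engine` option concrete, the sketch below recomputes grouped rolling features with the polars backend. This is an illustrative sketch assuming the same `m4_daily` demo dataset used in the Examples section; only the `engine` argument changes relative to the pandas version, and the result still comes back as a pandas DataFrame (see Returns below).

```python
import pytimetk as tk

df = tk.load_dataset("m4_daily", parse_dates=["date"])

# Same grouped rolling features, computed via the polars backend.
rolled_polars_df = (
    df
    .groupby("id")
    .augment_rolling(
        date_column="date",
        value_column="value",
        window=(1, 3),                 # tuple expands to window sizes 1, 2, and 3
        window_func=["mean", "std"],   # built-in functions, as in Example 3 below
        engine="polars",               # "pandas" is the default
    )
)

print(rolled_polars_df.head())
```

As the speed comparison guide notes, built-in and configurable window functions (for example `pl_quantile()`) are the fast path, while lambda functions are the slow path regardless of engine.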
}, { - "objectID": "tutorials/03_demand_forecasting.html#making-future-dates-easier-with-tk.future_frame", - "href": "tutorials/03_demand_forecasting.html#making-future-dates-easier-with-tk.future_frame", - "title": "Demand Forecasting", - "section": "2.1 Making Future Dates Easier with tk.future_frame", - "text": "2.1 Making Future Dates Easier with tk.future_frame\nWhen building machine learning models, we need to setup our dataframe to hold information about the future. This is the dataframe that will get passed to our model.predict() call. This is made easy with tk.future_frame().\n\n\n\n\n\n\nGetting to know tk.future_frame()\n\n\n\n\n\nCurious about the various options it provides?\n\nClick here to see our Data Wrangling Guide\nUse help(tk.future_frame) to review additional helpful documentation. And explore the plethora of possibilities!\n\n\n\n\nNotice this function adds 5 weeks to our dateset for each department and fills in weekly sales with nulls. Previously our max date was 2012-10-26.\n\n\nCode\nprint(sales_df.groupby('Dept').Date.max())\n\n\nDept\n1 2012-10-26\n3 2012-10-26\n8 2012-10-26\n13 2012-10-26\n38 2012-10-26\n93 2012-10-26\n95 2012-10-26\nName: Date, dtype: datetime64[ns]\n\n\nAfter applying our future frame, we can now see values 5 weeks in the future, and our dataframe has been extended to 2012-11-30 for all groups.\n\n\nCode\nsales_df_with_futureframe = sales_df \\\n .groupby('Dept') \\\n .future_frame(\n date_column = 'Date',\n length_out = 5\n )\n\n\n\n\n\n\n\nCode\nsales_df_with_futureframe.groupby('Dept').Date.max()\n\n\nDept\n1 2012-11-30\n3 2012-11-30\n8 2012-11-30\n13 2012-11-30\n38 2012-11-30\n93 2012-11-30\n95 2012-11-30\nName: Date, dtype: datetime64[ns]" + "objectID": "reference/augment_rolling.html#notes", + "href": "reference/augment_rolling.html#notes", + "title": "augment_rolling", + "section": "Notes", + "text": "Notes" }, { - "objectID": "tutorials/03_demand_forecasting.html#date-features-with-tk.augment_timeseries_signature", - "href": "tutorials/03_demand_forecasting.html#date-features-with-tk.augment_timeseries_signature", - "title": "Demand Forecasting", - "section": "2.2 Date Features with tk.augment_timeseries_signature", - "text": "2.2 Date Features with tk.augment_timeseries_signature\nMachine Learning models generally cannot process raw date objects directly. Moreover, they lack an inherent understanding of the passage of time. This means that, without specific features, a model can’t differentiate between a January observation and a June one. To bridge this gap, the tk.augment_timeseries_signature function is invaluable. It generates 29 distinct date-oriented features suitable for model inputs.\n\n\n\n\n\n\nGetting More Info: tk.augment_timeseries_signature(),tk.augment_lags(), tk.augment_rolling()\n\n\n\n\n\n\nClick here to see our Adding Features (Augmenting)\nUse help(tk.augment_timeseries_signature) help(tk.augment_lags) help(tk.augment_rolling) to review additional helpful documentation.\n\n\n\n\n\nIt’s crucial, however, to align these features with the granularity of your dataset. 
Given the weekly granularity of the Walmart dataset, any date attributes finer than ‘week’ should be excluded for relevance and efficiency.\n\n\nCode\nsales_df_dates = sales_df_with_futureframe.augment_timeseries_signature(date_column = 'Date')\nsales_df_dates.head(10)\n\n\n\n\n\n\n\n\n\nDept\nDate\nWeekly_Sales\nDate_index_num\nDate_year\nDate_year_iso\nDate_yearstart\nDate_yearend\nDate_leapyear\nDate_half\n...\nDate_mday\nDate_qday\nDate_yday\nDate_weekend\nDate_hour\nDate_minute\nDate_second\nDate_msecond\nDate_nsecond\nDate_am_pm\n\n\n\n\n0\n1\n2010-02-05\n24924.50\n1265328000\n2010\n2010\n0\n0\n0\n1\n...\n5\n36\n36\n0\n0\n0\n0\n0\n0\nam\n\n\n1\n1\n2010-02-12\n46039.49\n1265932800\n2010\n2010\n0\n0\n0\n1\n...\n12\n43\n43\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n1\n2010-02-19\n41595.55\n1266537600\n2010\n2010\n0\n0\n0\n1\n...\n19\n50\n50\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n1\n2010-02-26\n19403.54\n1267142400\n2010\n2010\n0\n0\n0\n1\n...\n26\n57\n57\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n1\n2010-03-05\n21827.90\n1267747200\n2010\n2010\n0\n0\n0\n1\n...\n5\n64\n64\n0\n0\n0\n0\n0\n0\nam\n\n\n5\n1\n2010-03-12\n21043.39\n1268352000\n2010\n2010\n0\n0\n0\n1\n...\n12\n71\n71\n0\n0\n0\n0\n0\n0\nam\n\n\n6\n1\n2010-03-19\n22136.64\n1268956800\n2010\n2010\n0\n0\n0\n1\n...\n19\n78\n78\n0\n0\n0\n0\n0\n0\nam\n\n\n7\n1\n2010-03-26\n26229.21\n1269561600\n2010\n2010\n0\n0\n0\n1\n...\n26\n85\n85\n0\n0\n0\n0\n0\n0\nam\n\n\n8\n1\n2010-04-02\n57258.43\n1270166400\n2010\n2010\n0\n0\n0\n1\n...\n2\n2\n92\n0\n0\n0\n0\n0\n0\nam\n\n\n9\n1\n2010-04-09\n42960.91\n1270771200\n2010\n2010\n0\n0\n0\n1\n...\n9\n9\n99\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n10 rows × 32 columns\n\n\n\nUpon reviewing the generated features, it’s evident that certain attributes don’t align with the granularity of our dataset. For optimal results, features exhibiting no variance—like “Date_hour” due to the weekly nature of our data—should be omitted. We also spot redundant features, such as “Date_Month” and “Date_month_lbl”; both convey month information, albeit in different formats. To enhance clarity and computational efficiency, we’ll refine our dataset to include only the most relevant columns.\nAdditionally, we’ve eliminated certain categorical columns, which, although compatible with models like LightGBM and Catboost, demand extra processing for many tree-based ML models. While 1-hot encoding is a popular method for managing categorical data, it’s not typically recommended for date attributes. 
Instead, leveraging numeric date features directly, combined with the integration of Fourier features, can effectively capture cyclical patterns.\n\n\nCode\nsales_df_dates.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 1036 rows of 32 columns\nDept: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nDate: datetime64[ns] [Timestamp('2010-02-05 00:00:00'), ...\nWeekly_Sales: float64 [24924.5, 46039.49, 41595.55, 1940 ...\nDate_index_num: int64 [1265328000, 1265932800, 126653760 ...\nDate_year: int64 [2010, 2010, 2010, 2010, 2010, 201 ...\nDate_year_iso: UInt32 [2010, 2010, 2010, 2010, 2010, 201 ...\nDate_yearstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_yearend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_leapyear: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_half: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nDate_quarter: int64 [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...\nDate_quarteryear: object ['2010Q1', '2010Q1', '2010Q1', '20 ...\nDate_quarterstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_quarterend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_month: int64 [2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, ...\nDate_month_lbl: object ['February', 'February', 'February ...\nDate_monthstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_monthend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_yweek: UInt32 [5, 6, 7, 8, 9, 10, 11, 12, 13, 14 ...\nDate_mweek: int64 [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, ...\nDate_wday: int64 [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...\nDate_wday_lbl: object ['Friday', 'Friday', 'Friday', 'Fr ...\nDate_mday: int64 [5, 12, 19, 26, 5, 12, 19, 26, 2, ...\nDate_qday: int64 [36, 43, 50, 57, 64, 71, 78, 85, 2 ...\nDate_yday: int64 [36, 43, 50, 57, 64, 71, 78, 85, 9 ...\nDate_weekend: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_hour: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_minute: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_second: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_msecond: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_nsecond: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_am_pm: object ['am', 'am', 'am', 'am', 'am', 'am ...\n\n\n\n\nCode\nsales_df_dates = sales_df_dates[[\n 'Date'\n ,'Dept'\n , 'Weekly_Sales'\n , 'Date_year'\n , 'Date_month'\n , 'Date_yweek'\n , 'Date_mweek' \n ]]\nsales_df_dates.tail(10)\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\n\n\n\n\n1026\n2012-11-02\n93\nNaN\n2012\n11\n44\n1\n\n\n1027\n2012-11-09\n93\nNaN\n2012\n11\n45\n2\n\n\n1028\n2012-11-16\n93\nNaN\n2012\n11\n46\n3\n\n\n1029\n2012-11-23\n93\nNaN\n2012\n11\n47\n4\n\n\n1030\n2012-11-30\n93\nNaN\n2012\n11\n48\n5\n\n\n1031\n2012-11-02\n95\nNaN\n2012\n11\n44\n1\n\n\n1032\n2012-11-09\n95\nNaN\n2012\n11\n45\n2\n\n\n1033\n2012-11-16\n95\nNaN\n2012\n11\n46\n3\n\n\n1034\n2012-11-23\n95\nNaN\n2012\n11\n47\n4\n\n\n1035\n2012-11-30\n95\nNaN\n2012\n11\n48\n5" + "objectID": "reference/augment_rolling.html#performance", + "href": "reference/augment_rolling.html#performance", + "title": "augment_rolling", + "section": "Performance", + "text": "Performance\nThis function uses parallel processing to speed up computation for large datasets with many time series groups:\nParallel processing has overhead and may not be faster on small datasets.\nTo use parallel processing, set threads = -1 to use all available processors." 
}, { - "objectID": "tutorials/03_demand_forecasting.html#lag-features-with-tk.augment_lags", - "href": "tutorials/03_demand_forecasting.html#lag-features-with-tk.augment_lags", - "title": "Demand Forecasting", - "section": "2.3 Lag Features with tk.augment_lags", - "text": "2.3 Lag Features with tk.augment_lags\nAs previously noted, it’s important to recognize that machine learning models lack inherent awareness of time, a vital consideration in time series modeling. Furthermore, these models operate under the assumption that each row is independent, meaning that the information from last month’s weekly sales is not inherently integrated into the prediction of next month’s sales target. To address this limitation, we incorporate additional features, such as lags, into the models to capture temporal dependencies. You can easily achieve this by employing the tk.augment_lags function.\n\n\nCode\ndf_with_lags = sales_df_dates \\\n .groupby('Dept') \\\n .augment_lags(\n date_column = 'Date',\n value_column = 'Weekly_Sales',\n lags = [5,6,7,8,9]\n )\ndf_with_lags.head(5)\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\n\n\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n2\n2010-02-19\n1\n41595.55\n2010\n2\n7\n3\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n3\n2010-02-26\n1\n19403.54\n2010\n2\n8\n4\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n4\n2010-03-05\n1\n21827.90\n2010\n3\n9\n1\nNaN\nNaN\nNaN\nNaN\nNaN" + "objectID": "reference/augment_rolling.html#examples", + "href": "reference/augment_rolling.html#examples", + "title": "augment_rolling", + "section": "Examples", + "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\ndf = tk.load_dataset(\"m4_daily\", parse_dates = ['date'])\n\n\n# Example 1 - Using a single window size and a single function name, pandas engine\n# This example demonstrates the use of both string-named functions and lambda \n# functions on a rolling window. We specify a list of window sizes: [2,7]. 
\n# As a result, the output will have computations for both window sizes 2 and 7.\n# Note - It's preferred to use built-in or configurable functions instead of \n# lambda functions for performance reasons.\n\nrolled_df = (\n df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = [2,7], # Specifying multiple window sizes\n window_func = [\n 'mean', # Built-in mean function\n ('std', lambda x: x.std()) # Lambda function to compute standard deviation\n ],\n threads = 1, # Disabling parallel processing\n engine = 'pandas' # Using pandas engine\n )\n)\ndisplay(rolled_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_7\nvalue_rolling_std_win_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\nNaN\nNaN\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.80\n1.40\n2074.800000\n1.400000\n\n\n2\nD10\n2014-07-05\n2048.7\n2061.05\n12.35\n2066.100000\n12.356645\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.80\n0.10\n2061.800000\n13.037830\n\n\n4\nD10\n2014-07-07\n2006.4\n2027.65\n21.25\n2050.720000\n25.041038\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9425.35\n6.55\n9382.071429\n74.335988\n\n\n9739\nD500\n2012-09-20\n9365.7\n9392.25\n26.55\n9396.400000\n58.431303\n\n\n9740\nD500\n2012-09-21\n9445.9\n9405.80\n40.10\n9419.114286\n39.184451\n\n\n9741\nD500\n2012-09-22\n9497.9\n9471.90\n26.00\n9438.928571\n38.945336\n\n\n9742\nD500\n2012-09-23\n9545.3\n9521.60\n23.70\n9449.028571\n53.379416\n\n\n\n\n9743 rows × 7 columns\n\n\n\n\n# Example 2 - Multiple groups, pandas engine\n# Example showcasing the use of string function names and lambda functions \n# applied on rolling windows. The `window` tuple (1,3) will generate window \n# sizes of 1, 2, and 3.\n# Note - It's preferred to use built-in or configurable functions instead of \n# lambda functions for performance reasons.\n\nrolled_df = (\n df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,3), # Specifying a range of window sizes\n window_func = [\n 'mean', # Using built-in mean function\n ('std', lambda x: x.std()) # Lambda function for standard deviation\n ],\n threads = 1, # Disabling parallel processing\n engine = 'pandas' # Using pandas engine\n )\n)\ndisplay(rolled_df) \n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_1\nvalue_rolling_std_win_1\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_3\nvalue_rolling_std_win_3\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2076.2\n0.0\n2076.20\n0.00\n2076.200000\n0.000000\n\n\n1\nD10\n2014-07-04\n2073.4\n2073.4\n0.0\n2074.80\n1.40\n2074.800000\n1.400000\n\n\n2\nD10\n2014-07-05\n2048.7\n2048.7\n0.0\n2061.05\n12.35\n2066.100000\n12.356645\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.9\n0.0\n2048.80\n0.10\n2057.000000\n11.596839\n\n\n4\nD10\n2014-07-07\n2006.4\n2006.4\n0.0\n2027.65\n21.25\n2034.666667\n19.987718\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9418.8\n0.0\n9425.35\n6.55\n9429.466667\n7.905413\n\n\n9739\nD500\n2012-09-20\n9365.7\n9365.7\n0.0\n9392.25\n26.55\n9405.466667\n28.623339\n\n\n9740\nD500\n2012-09-21\n9445.9\n9445.9\n0.0\n9405.80\n40.10\n9410.133333\n33.310092\n\n\n9741\nD500\n2012-09-22\n9497.9\n9497.9\n0.0\n9471.90\n26.00\n9436.500000\n54.378182\n\n\n9742\nD500\n2012-09-23\n9545.3\n9545.3\n0.0\n9521.60\n23.70\n9496.366667\n40.594362\n\n\n\n\n9743 rows × 9 columns\n\n\n\n\n# Example 3 - Multiple groups, polars engine\n\nrolled_df = (\n df\n 
.groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,3), # Specifying a range of window sizes\n window_func = [\n 'mean', # Using built-in mean function\n 'std', # Using built-in standard deviation function\n ],\n engine = 'polars' # Using polars engine\n )\n)\ndisplay(rolled_df) \n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_1\nvalue_rolling_std_win_1\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_3\nvalue_rolling_std_win_3\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2076.2\nNaN\n2076.20\nNaN\n2076.200000\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2073.4\nNaN\n2074.80\n1.979899\n2074.800000\n1.979899\n\n\n2\nD10\n2014-07-05\n2048.7\n2048.7\nNaN\n2061.05\n17.465537\n2066.100000\n15.133737\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.9\nNaN\n2048.80\n0.141421\n2057.000000\n14.203169\n\n\n4\nD10\n2014-07-07\n2006.4\n2006.4\nNaN\n2027.65\n30.052038\n2034.666667\n24.479856\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9418.8\nNaN\n9425.35\n9.263099\n9429.466667\n9.682114\n\n\n9739\nD500\n2012-09-20\n9365.7\n9365.7\nNaN\n9392.25\n37.547370\n9405.466667\n35.056288\n\n\n9740\nD500\n2012-09-21\n9445.9\n9445.9\nNaN\n9405.80\n56.709964\n9410.133333\n40.796364\n\n\n9741\nD500\n2012-09-22\n9497.9\n9497.9\nNaN\n9471.90\n36.769553\n9436.500000\n66.599399\n\n\n9742\nD500\n2012-09-23\n9545.3\n9545.3\nNaN\n9521.60\n33.516861\n9496.366667\n49.717737\n\n\n\n\n9743 rows × 9 columns" }, { - "objectID": "tutorials/03_demand_forecasting.html#rolling-lag-features-with-tk.augment_rolling", - "href": "tutorials/03_demand_forecasting.html#rolling-lag-features-with-tk.augment_rolling", - "title": "Demand Forecasting", - "section": "2.4 Rolling Lag Features with tk.augment_rolling", - "text": "2.4 Rolling Lag Features with tk.augment_rolling\nAnother pivotal aspect of time series analysis involves the utilization of rolling lags. These operations facilitate computations within a moving time window, enabling the use of functions such as “mean” and “std” on these rolling windows. This can be achieved by invoking the tk.augment_rolling() function on grouped time series data. To execute this, we will initially gather all columns containing ‘lag’ in their names. We then apply this function to the lag values, as opposed to the weekly sales, since we lack future weekly sales data. 
By applying these functions to the lag values, we ensure the prevention of data leakage and maintain the adaptability of our method to unforeseen future data.\n\n\nCode\nlag_columns = [col for col in df_with_lags.columns if 'lag' in col]\n\ndf_with_rolling = df_with_lags \\\n .groupby('Dept') \\\n .augment_rolling(\n date_column = 'Date',\n value_column = lag_columns,\n window = 4,\n window_func = 'mean',\n threads = 1 # Change to -1 to use all available cores\n ) \ndf_with_rolling[df_with_rolling.Dept ==1].head(10)\n\n\n\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\nWeekly_Sales_lag_5_rolling_mean_win_4\nWeekly_Sales_lag_6_rolling_mean_win_4\nWeekly_Sales_lag_7_rolling_mean_win_4\nWeekly_Sales_lag_8_rolling_mean_win_4\nWeekly_Sales_lag_9_rolling_mean_win_4\n\n\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n\n\n\n\n\nNotice when we add lag values to our dataframe, this creates several NA values. This is because when using lags, there will be some data that is not available early in our dataset.Thus as a result, NA values are introduced.\nTo simplify and clean up the process, we will remove these rows entirely since we already extracted some meaningful information from them (ie. 
lags, rolling lags).\n\n\nCode\nall_lag_columns = [col for col in df_with_rolling.columns if 'lag' in col]\n\ndf_no_nas = df_with_rolling \\\n .dropna(subset=all_lag_columns, inplace=False)\n\ndf_no_nas.head()\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\nWeekly_Sales_lag_5_rolling_mean_win_4\nWeekly_Sales_lag_6_rolling_mean_win_4\nWeekly_Sales_lag_7_rolling_mean_win_4\nWeekly_Sales_lag_8_rolling_mean_win_4\nWeekly_Sales_lag_9_rolling_mean_win_4\n\n\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n\n\n\n\n\nWe can call tk.glimpse() again to quickly see what features we still have available.\n\n\nCode\ndf_no_nas.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 4760 rows of 17 columns\nDate: datetime64[ns] [Timestamp('20 ...\nDept: int64 [1, 1, 1, 1, 1 ...\nWeekly_Sales: float64 [16555.11, 165 ...\nDate_year: int64 [2010, 2010, 2 ...\nDate_month: int64 [4, 4, 4, 4, 4 ...\nDate_yweek: UInt32 [17, 17, 17, 1 ...\nDate_mweek: int64 [5, 5, 5, 5, 5 ...\nWeekly_Sales_lag_5: float64 [26229.21, 262 ...\nWeekly_Sales_lag_6: float64 [22136.64, 221 ...\nWeekly_Sales_lag_7: float64 [21043.39, 210 ...\nWeekly_Sales_lag_8: float64 [21827.9, 2182 ...\nWeekly_Sales_lag_9: float64 [19403.54, 194 ...\nWeekly_Sales_lag_5_rolling_mean_win_4: float64 [22809.285, 22 ...\nWeekly_Sales_lag_6_rolling_mean_win_4: float64 [21102.8675, 2 ...\nWeekly_Sales_lag_7_rolling_mean_win_4: float64 [25967.595, 25 ...\nWeekly_Sales_lag_8_rolling_mean_win_4: float64 [32216.6200000 ...\nWeekly_Sales_lag_9_rolling_mean_win_4: float64 [32990.7700000 ..." + "objectID": "performance/01_speed_comparisons.html", + "href": "performance/01_speed_comparisons.html", + "title": "Speed Comparisons", + "section": "", + "text": "How this guide benefits you\n\n\n\n\n\nThis guide covers speed and performance comparisons using the polars backend.\nBeginning in version 0.2.0 of pytimetk, we introduced new polars engines to many of our functions. This is aimed at leveraging the speed benefits of polars without requiring you (the user) to learn a new data manipulation framework." }, { - "objectID": "tutorials/03_demand_forecasting.html#training-and-future-sets", - "href": "tutorials/03_demand_forecasting.html#training-and-future-sets", - "title": "Demand Forecasting", - "section": "2.5 Training and Future Sets", - "text": "2.5 Training and Future Sets\nNow that we have our training set built, we can start to train our regressor. 
To do so, let’s first do some model cleanup.\nSplit our data in to train and future sets.\n\n\nCode\nfuture = df_no_nas[df_no_nas.Weekly_Sales.isnull()]\ntrain = df_no_nas[df_no_nas.Weekly_Sales.notnull()]" + "objectID": "performance/01_speed_comparisons.html#key-benefits", + "href": "performance/01_speed_comparisons.html#key-benefits", + "title": "Speed Comparisons", + "section": "2.1 Key benefits:", + "text": "2.1 Key benefits:\n\nYou can get between 2X and 500X speed boost on many common time series operations\nYou don’t need to know how to use polars to gain massive speed boosts\nSimply turn engine = 'polars' to get the speed boost." }, { - "objectID": "tutorials/03_demand_forecasting.html#model-with-regressor", - "href": "tutorials/03_demand_forecasting.html#model-with-regressor", - "title": "Demand Forecasting", - "section": "2.6 Model with regressor", - "text": "2.6 Model with regressor\nWe still have a datetime object in our training data. We will need to remove that before passing to our regressor. Let’s subset our column to just the features we want to use for modeling.\n\n\nCode\ntrain_columns = [ \n 'Dept'\n , 'Date_year'\n , 'Date_month'\n , 'Date_yweek'\n , 'Date_mweek'\n , 'Weekly_Sales_lag_5'\n , 'Weekly_Sales_lag_6'\n , 'Weekly_Sales_lag_7'\n , 'Weekly_Sales_lag_8'\n , 'Weekly_Sales_lag_5_rolling_mean_win_4'\n , 'Weekly_Sales_lag_6_rolling_mean_win_4'\n , 'Weekly_Sales_lag_7_rolling_mean_win_4'\n , 'Weekly_Sales_lag_8_rolling_mean_win_4'\n ]\n\nX = train[train_columns]\ny = train[['Weekly_Sales']]\n\nmodel = RandomForestRegressor(random_state=123)\nmodel = model.fit(X, y)\n\n\nNow that we have a trained model, we can pass in our future frame to predict weekly sales.\n\n\nCode\npredicted_values = model.predict(future[train_columns])\nfuture['y_pred'] = 
predicted_values\n\nfuture.head(10)\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\nWeekly_Sales_lag_5_rolling_mean_win_4\nWeekly_Sales_lag_6_rolling_mean_win_4\nWeekly_Sales_lag_7_rolling_mean_win_4\nWeekly_Sales_lag_8_rolling_mean_win_4\nWeekly_Sales_lag_9_rolling_mean_win_4\ny_pred\n\n\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n\n\n\n\n\nLet’s create a label to split up our actuals from our prediction dataset before recombining.\n\n\nCode\ntrain['type'] = 'actuals'\nfuture['type'] = 'prediction'\n\nfull_df = pd.concat([train, 
future])\n\nfull_df.head(10)\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\nWeekly_Sales_lag_5_rolling_mean_win_4\nWeekly_Sales_lag_6_rolling_mean_win_4\nWeekly_Sales_lag_7_rolling_mean_win_4\nWeekly_Sales_lag_8_rolling_mean_win_4\nWeekly_Sales_lag_9_rolling_mean_win_4\ntype\ny_pred\n\n\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN" + "objectID": "performance/01_speed_comparisons.html#what-affects-speed", + "href": "performance/01_speed_comparisons.html#what-affects-speed", + "title": "Speed Comparisons", + "section": "2.2 What affects speed?", + "text": "2.2 What affects speed?\nMany factors can affect speed. Things that are known to slow performance down:\n\nUsing non-optimized “lambda” functions. Lambda Functions are created at runtime. This process is flexible but extremely inefficient. Where possible use “built-in” or “configurable” functions instead.\nNot using polars. Polars is built on top of Rust, which is a low-level language known for performance and optimized for speed. Using polars usually speeds up computation versus Pandas." 
}, { - "objectID": "tutorials/03_demand_forecasting.html#pre-visualization-clean-up", - "href": "tutorials/03_demand_forecasting.html#pre-visualization-clean-up", - "title": "Demand Forecasting", - "section": "2.7 Pre-Visualization Clean-up", - "text": "2.7 Pre-Visualization Clean-up\n\n\nCode\nfull_df['Weekly_Sales'] = np.where(full_df.type =='actuals', full_df.Weekly_Sales, full_df.y_pred)" + "objectID": "performance/01_speed_comparisons.html#summarize-by-time-summarize_by_time", + "href": "performance/01_speed_comparisons.html#summarize-by-time-summarize_by_time", + "title": "Speed Comparisons", + "section": "4.1 Summarize By Time summarize_by_time()", + "text": "4.1 Summarize By Time summarize_by_time()\n\nPolars is 13.1X faster than Pandas\n\n\n\n\n\n\n\nPolarsPandas\n\n\n\n\nCode\n%%timeit -n 10\n\ndf_pytimetk = expedia_df[['site_name', 'date_time', 'cnt', 'is_booking']] \\\n .groupby('site_name') \\\n .summarize_by_time(\n date_column = 'date_time',\n value_column = ['cnt', 'is_booking'],\n freq = 'W',\n agg_func = ['sum', 'count'],\n engine = 'polars'\n )\n\n# 50.8 ms ± 2.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n\n\n\n\n\n\nCode\n%%timeit -n 10\n\ndf_pytimetk = expedia_df[['site_name', 'date_time', 'cnt', 'is_booking']] \\\n .groupby('site_name') \\\n .summarize_by_time(\n date_column = 'date_time',\n value_column = ['cnt', 'is_booking'],\n freq = 'W',\n agg_func = ['sum', 'count'],\n engine = 'pandas'\n )\n\n# 668 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)" }, { - "objectID": "tutorials/03_demand_forecasting.html#plot-predictions", - "href": "tutorials/03_demand_forecasting.html#plot-predictions", - "title": "Demand Forecasting", - "section": "2.8 Plot Predictions", - "text": "2.8 Plot Predictions\n\nPlotlyPlotnine\n\n\n\n\nCode\nfull_df \\\n .groupby('Dept') \\\n .plot_timeseries(\n date_column = 'Date',\n value_column = 'Weekly_Sales',\n color_column = 'type',\n smooth = False,\n smooth_alpha = 0,\n facet_ncol = 2,\n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n engine = 'plotly'\n )\n\n\n\n \n\n\n\n\n\n\nCode\nfull_df \\\n .groupby('Dept') \\\n .plot_timeseries(\n date_column = 'Date',\n value_column = 'Weekly_Sales',\n color_column = 'type',\n smooth = False,\n smooth_alpha = 0,\n facet_ncol = 2,\n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (800 x 600)>\n\n\n\n\n\nOur weekly sales forecasts exhibit a noticeable alignment with historical trends, indicating that our models are effectively capturing essential data signals. 
It’s worth noting that with some additional feature engineering, we have the potential to further enhance the model’s performance.\nHere are some additional techniques that can be explored to elevate its performance:\n\nExperiment with the incorporation of various lags using the versatile tk.augment_lags() function.\nEnhance the model’s capabilities by introducing additional rolling calculations through tk.augment_rolling().\nConsider incorporating cyclic features by utilizing tk.augment_fourier().\nTry different models and build a robust cross-validation strategy for model selection.\n\nThese strategies hold promise for refining the model’s accuracy and predictive power" + "objectID": "performance/01_speed_comparisons.html#rolling-calculations-augment_rolling", + "href": "performance/01_speed_comparisons.html#rolling-calculations-augment_rolling", + "title": "Speed Comparisons", + "section": "4.2 Rolling Calculations augment_rolling()", + "text": "4.2 Rolling Calculations augment_rolling()\n\nPolars is 10.8X faster than Pandas\nPolars is 3,517X faster than Pandas with Lambdas\n\n\n\n\n\n\n\nPolarsPandasPandas (Lambda)\n\n\nUses pl_quantile() configurable function.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,10),\n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in std function\n ('quantile_75', pl_quantile(quantile=0.75)), # Configurable with all parameters found in polars.Expr.rolling_quantile\n ],\n min_periods = 1,\n engine = 'polars',\n )\n)\n# 9.81 ms ± 116 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n\n\n\n\nUses pd_quantile() configurable function.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,10),\n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n # ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # Custom quantile function\n ('quantile_75', pd_quantile(q=0.75))\n ],\n min_periods = 1,\n engine = 'pandas', # Utilize pandas for the underlying computations\n show_progress = False,\n )\n)\n\n# 106 ms ± 2.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n\n\n\n\nUses lambda x: pd.Series(x).quantile(0.75). Lambda functions are extremely inefficient.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,10),\n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # lambda slows things down\n ],\n min_periods = 1,\n engine = 'pandas', \n show_progress = False,\n )\n)\n\n# 34.5 s ± 236 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)" }, { - "objectID": "tutorials/06_correlationfunnel.html", - "href": "tutorials/06_correlationfunnel.html", - "title": "Correlation Funnel", - "section": "", - "text": "We will demonstrate how Correlation Funnel to analyze Expedia Hotel Bookings and which features correlate to a customer making a booking through their website:\n\n\n\nCorrelation Funnel" + "objectID": "performance/01_speed_comparisons.html#augment-expanding-augment_expanding", + "href": "performance/01_speed_comparisons.html#augment-expanding-augment_expanding", + "title": "Speed Comparisons", + "section": "4.3 Augment Expanding augment_expanding()", + "text": "4.3 Augment Expanding augment_expanding()\n\nPolars is 3X faster than Pandas with built-in and configurable functions\nPolars is 515X faster than Pandas with lambda functions\n\n\n\n\n\n\n\nPolarsPandasPandas (Lambda)\n\n\nUses pl_quantile() configurable function.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in std function\n ('quantile_75', pl_quantile(quantile=0.75)), # Configurable with all parameters found in polars.Expr.rolling_quantile\n ],\n min_periods = 1,\n engine = 'polars',\n )\n)\n# 6.95 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n\n\n\n\nUses pd_quantile() configurable function.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n # ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # Custom quantile function\n ('quantile_75', pd_quantile(q=0.75))\n ],\n min_periods = 1,\n engine = 'pandas', # Utilize pandas for the underlying computations\n )\n)\n\n# 20.8 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n\n\n\n\nUses lambda x: pd.Series(x).quantile(0.75). Lambda functions are extremely inefficient.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # lambda slows things down\n ],\n min_periods = 1,\n engine = 'pandas', \n )\n)\n\n# 3.58 s ± 110 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)" }, { - "objectID": "tutorials/06_correlationfunnel.html#setup", - "href": "tutorials/06_correlationfunnel.html#setup", - "title": "Correlation Funnel", - "section": "3.1 Setup", - "text": "3.1 Setup\nTo set up, import the following packages and the expedia_df dataset, Expedia Hotel Time Series Dataset.\n\n# Libraries\nimport pandas as pd \nimport pytimetk as tk\n\n# Data\nexpedia_df = tk.load_dataset(\"expedia\", parse_dates = ['date_time'])\nexpedia_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 100000 rows of 24 columns\ndate_time: datetime64[ns] [Timestamp('2013-07-25 17: ...\nsite_name: int64 [2, 2, 2, 2, 2, 37, 2, 2, ...\nposa_continent: int64 [3, 3, 3, 3, 3, 1, 3, 3, 3 ...\nuser_location_country: int64 [66, 66, 66, 66, 66, 69, 6 ...\nuser_location_region: int64 [174, 174, 174, 220, 351, ...\nuser_location_city: int64 [35675, 31320, 16292, 1760 ...\norig_destination_distance: float64 [0.1203, 108.2251, 763.142 ...\nuser_id: int64 [44735, 794319, 761732, 69 ...\nis_mobile: int64 [0, 0, 1, 0, 0, 0, 0, 0, 0 ...\nis_package: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0 ...\nchannel: int64 [9, 3, 1, 9, 1, 9, 9, 9, 9 ...\nsrch_ci: object ['2013-07-26', '2014-11-27 ...\nsrch_co: object ['2013-07-27', '2014-11-29 ...\nsrch_adults_cnt: int64 [1, 2, 2, 2, 2, 2, 2, 2, 2 ...\nsrch_children_cnt: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0 ...\nsrch_rm_cnt: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1 ...\nsrch_destination_id: int64 [5465, 11620, 23808, 40658 ...\nsrch_destination_type_id: int64 [3, 1, 6, 5, 1, 6, 1, 5, 6 ...\nis_booking: int64 [1, 0, 0, 0, 0, 0, 0, 0, 0 ...\ncnt: int64 [1, 2, 3, 1, 2, 7, 1, 1, 1 ...\nhotel_continent: int64 [2, 2, 2, 2, 2, 6, 4, 2, 4 ...\nhotel_country: int64 [50, 50, 50, 50, 50, 204, ...\nhotel_market: int64 [1230, 369, 1144, 930, 637 ...\nhotel_cluster: int64 [47, 83, 93, 48, 33, 15, 9 ..." + "objectID": "performance/01_speed_comparisons.html#augment-lags-augment_lags", + "href": "performance/01_speed_comparisons.html#augment-lags-augment_lags", + "title": "Speed Comparisons", + "section": "4.4 Augment Lags augment_lags()", + "text": "4.4 Augment Lags augment_lags()\n\nPolars is 1.9X faster than Pandas\nSpeed improvement of Polars (vs Pandas) increases with number of lags\n\n\n\n\n\n\n\nPolarsPandas\n\n\n\n\nCode\n%%timeit -n 25\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_lags(\n date_column = 'date', \n value_column = 'value', \n lags = (2,30),\n engine = 'polars', \n )\n)\n\n# 37.7 ms ± 1.57 ms per loop (mean ± std. dev. of 7 runs, 25 loops each)\n\n\n\n\n\n\nCode\n%%timeit -n 25\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_lags(\n date_column = 'date', \n value_column = 'value', \n lags = (2,30),\n engine = 'pandas', \n )\n)\n\n# 73.3 ms ± 3.29 ms per loop (mean ± std. dev. of 7 runs, 25 loops each)" }, { - "objectID": "tutorials/06_correlationfunnel.html#data-preparation", - "href": "tutorials/06_correlationfunnel.html#data-preparation", - "title": "Correlation Funnel", - "section": "3.2 Data Preparation", - "text": "3.2 Data Preparation\nTo prepare the dataset, we will first perform data preparation:\n\nAdd time series features based on the date_time timestamp column.\nWe will drop any zero variance features\nDrop additional columns that are not an acceptable data type (i.e. 
not numeric, categorical, or string) or contain missing values\nConvert numeric columns that start with “hotel_” that are actually categorical “ID” columns to string\n\n\nexpedia_ts_features_df = expedia_df \\\n .augment_timeseries_signature('date_time') \\\n .drop_zero_variance() \\\n .drop(columns=['date_time', 'orig_destination_distance', 'srch_ci', 'srch_co']) \\\n .transform_columns(\n columns = [r\"hotel_.*\"],\n transform_func = lambda x: x.astype(str)\n )\n \nexpedia_ts_features_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 100000 rows of 46 columns\nsite_name: int64 [2, 2, 2, 2, 2, 37, 2, 2, 2 ...\nposa_continent: int64 [3, 3, 3, 3, 3, 1, 3, 3, 3, ...\nuser_location_country: int64 [66, 66, 66, 66, 66, 69, 66 ...\nuser_location_region: int64 [174, 174, 174, 220, 351, 7 ...\nuser_location_city: int64 [35675, 31320, 16292, 17605 ...\nuser_id: int64 [44735, 794319, 761732, 696 ...\nis_mobile: int64 [0, 0, 1, 0, 0, 0, 0, 0, 0, ...\nis_package: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nchannel: int64 [9, 3, 1, 9, 1, 9, 9, 9, 9, ...\nsrch_adults_cnt: int64 [1, 2, 2, 2, 2, 2, 2, 2, 2, ...\nsrch_children_cnt: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nsrch_rm_cnt: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nsrch_destination_id: int64 [5465, 11620, 23808, 40658, ...\nsrch_destination_type_id: int64 [3, 1, 6, 5, 1, 6, 1, 5, 6, ...\nis_booking: int64 [1, 0, 0, 0, 0, 0, 0, 0, 0, ...\ncnt: int64 [1, 2, 3, 1, 2, 7, 1, 1, 1, ...\nhotel_continent: object ['2', '2', '2', '2', '2', ' ...\nhotel_country: object ['50', '50', '50', '50', '5 ...\nhotel_market: object ['1230', '369', '1144', '93 ...\nhotel_cluster: object ['47', '83', '93', '48', '3 ...\ndate_time_index_num: int64 [1374773055, 1414939784, 14 ...\ndate_time_year: int64 [2013, 2014, 2014, 2014, 20 ...\ndate_time_year_iso: UInt32 [2013, 2014, 2014, 2014, 20 ...\ndate_time_yearstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_yearend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_half: int64 [2, 2, 1, 1, 2, 2, 1, 2, 1, ...\ndate_time_quarter: int64 [3, 4, 2, 1, 3, 4, 1, 3, 2, ...\ndate_time_quarteryear: object ['2013Q3', '2014Q4', '2014Q ...\ndate_time_quarterstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_quarterend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_month: int64 [7, 11, 5, 2, 8, 12, 3, 9, ...\ndate_time_month_lbl: object ['July', 'November', 'May', ...\ndate_time_monthstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_monthend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_yweek: UInt32 [30, 44, 21, 9, 33, 50, 12, ...\ndate_time_mweek: int64 [4, 1, 4, 4, 2, 2, 3, 3, 2, ...\ndate_time_wday: int64 [4, 7, 4, 3, 3, 2, 2, 1, 4, ...\ndate_time_wday_lbl: object ['Thursday', 'Sunday', 'Thu ...\ndate_time_mday: int64 [25, 2, 22, 26, 13, 9, 18, ...\ndate_time_qday: int64 [25, 33, 52, 57, 44, 70, 77 ...\ndate_time_yday: int64 [206, 306, 142, 57, 225, 34 ...\ndate_time_weekend: int64 [0, 1, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_hour: int64 [17, 14, 12, 14, 11, 7, 21, ...\ndate_time_minute: int64 [24, 49, 50, 1, 15, 21, 40, ...\ndate_time_second: int64 [15, 44, 53, 2, 40, 31, 29, ...\ndate_time_am_pm: object ['pm', 'pm', 'am', 'pm', 'a ..." 
+ "objectID": "performance/01_speed_comparisons.html#chande-momentum-oscillator-cmo-augment_cmo", + "href": "performance/01_speed_comparisons.html#chande-momentum-oscillator-cmo-augment_cmo", + "title": "Speed Comparisons", + "section": "5.1 Chande Momentum Oscillator (CMO) augment_cmo()", + "text": "5.1 Chande Momentum Oscillator (CMO) augment_cmo()\n\nPolars is 3.3X faster than Pandas\nSpeed improvement of Polars (vs Pandas) increases with number of CMO periods\n\n\n\n\n\n\n\nPolarsPandas\n\n\n\n\nCode\n%%timeit -n 25\n\ndf = (\n stocks_daily_df\n .groupby('symbol')\n .augment_cmo(\n date_column = 'date', \n value_column = 'adjusted', \n periods = (5,30),\n engine = 'polars', \n )\n)\n\n# 94.4 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 25 loops each)\n\n\n\n\n\n\nCode\n%%timeit -n 25\n\ndf = (\n stocks_daily_df\n .groupby('symbol')\n .augment_cmo(\n date_column = 'date', \n value_column = 'adjusted', \n periods = (5,30),\n engine = 'pandas', \n )\n)\n\n# 73.3 ms ± 3.29 ms per loop (mean ± std. dev. of 7 runs, 25 loops each)" }, { - "objectID": "tutorials/06_correlationfunnel.html#step-correlation-funnel-workflow", - "href": "tutorials/06_correlationfunnel.html#step-correlation-funnel-workflow", - "title": "Correlation Funnel", - "section": "3.3 3-Step Correlation Funnel Workflow", - "text": "3.3 3-Step Correlation Funnel Workflow\nNext, we will perform the Correlation Funnel workflow to explore the Expedia Hotel Time Series dataset. There are 3 steps:\n\nBinarize: Convert the data to binary 0/1\nCorrelate: Detect relationships between the binary features and one of the columns (called the target)\nVisualize the Correlation Funnel: Plotting allows us to assess the top features and their relationship to the target.\n\n\nStep 1: Binarize\nUse binarize() to convert the raw data to binary 0/1. Binarization happens as follows:\n\nNumeric Data: Numeric data is Quantile Binned using the pd.qcut() function. The default is 4 bins, which bins numeric data into a maximum of 4 discrete bins. Fewer bins can be returned if there is insufficient data for 4 bins. The number of bins is controlled with the n_bins parameter.\nCategorical / String Data: Categorical data is first processed to determine the most frequent categories. Categories that are sparse are lumped into an “OTHER” category. 
The lumping can be controlled with the thresh_infreq.\n\n\nexpedia_ts_binarized_df = expedia_ts_features_df.binarize(thresh_infreq = 0.05)\n\nexpedia_ts_binarized_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 100000 rows of 155 columns\nsite_name__2.0_15.0: uint8 [1, 1 ...\nsite_name__15.0_53.0: uint8 [0, 0 ...\nuser_location_country__0.0_66.0: uint8 [1, 1 ...\nuser_location_country__66.0_71.0: uint8 [0, 0 ...\nuser_location_country__71.0_239.0: uint8 [0, 0 ...\nuser_location_region__0.0_174.0: uint8 [1, 1 ...\nuser_location_region__174.0_314.0: uint8 [0, 0 ...\nuser_location_region__314.0_385.0: uint8 [0, 0 ...\nuser_location_region__385.0_1021.0: uint8 [0, 0 ...\nuser_location_city__0.0_13087.0: uint8 [0, 0 ...\nuser_location_city__13087.0_27655.0: uint8 [0, 0 ...\nuser_location_city__27655.0_42563.0: uint8 [1, 1 ...\nuser_location_city__42563.0_56507.0: uint8 [0, 0 ...\nuser_id__13.0_299759.8: uint8 [1, 0 ...\nuser_id__299759.8_605161.5: uint8 [0, 0 ...\nuser_id__605161.5_911811.5: uint8 [0, 1 ...\nuser_id__911811.5_1198780.0: uint8 [0, 0 ...\nchannel__0.0_2.0: uint8 [0, 0 ...\nchannel__2.0_9.0: uint8 [1, 1 ...\nchannel__9.0_10.0: uint8 [0, 0 ...\nsrch_adults_cnt__0.0_2.0: uint8 [1, 1 ...\nsrch_adults_cnt__2.0_9.0: uint8 [0, 0 ...\nsrch_children_cnt__0.0_9.0: uint8 [1, 1 ...\nsrch_rm_cnt__0.0_1.0: uint8 [1, 1 ...\nsrch_rm_cnt__1.0_8.0: uint8 [0, 0 ...\nsrch_destination_id__1.0_8267.0: uint8 [1, 0 ...\nsrch_destination_id__8267.0_9147.0: uint8 [0, 0 ...\nsrch_destination_id__9147.0_18998.0: uint8 [0, 1 ...\nsrch_destination_id__18998.0_65104.0: uint8 [0, 0 ...\nsrch_destination_type_id__1.0_5.0: uint8 [1, 1 ...\nsrch_destination_type_id__5.0_9.0: uint8 [0, 0 ...\ncnt__1.0_2.0: uint8 [1, 1 ...\ncnt__2.0_72.0: uint8 [0, 0 ...\ndate_time_index_num__1357516842.0_1382867237.5: uint8 [1, 0 ...\ndate_time_index_num__1382867237.5_1401387689.0: uint8 [0, 0 ...\ndate_time_index_num__1401387689.0_1410981206.0: uint8 [0, 0 ...\ndate_time_index_num__1410981206.0_1420070302.0: uint8 [0, 1 ...\ndate_time_month__1.0_5.0: uint8 [0, 0 ...\ndate_time_month__5.0_7.0: uint8 [1, 0 ...\ndate_time_month__7.0_10.0: uint8 [0, 0 ...\ndate_time_month__10.0_12.0: uint8 [0, 1 ...\ndate_time_yweek__1.0_17.0: uint8 [0, 0 ...\ndate_time_yweek__17.0_30.0: uint8 [1, 0 ...\ndate_time_yweek__30.0_41.0: uint8 [0, 0 ...\ndate_time_yweek__41.0_52.0: uint8 [0, 1 ...\ndate_time_mday__1.0_8.0: uint8 [0, 1 ...\ndate_time_mday__8.0_16.0: uint8 [0, 0 ...\ndate_time_mday__16.0_23.0: uint8 [0, 0 ...\ndate_time_mday__23.0_31.0: uint8 [1, 0 ...\ndate_time_qday__1.0_24.0: uint8 [0, 0 ...\ndate_time_qday__24.0_48.0: uint8 [1, 1 ...\ndate_time_qday__48.0_70.0: uint8 [0, 0 ...\ndate_time_qday__70.0_92.0: uint8 [0, 0 ...\ndate_time_yday__1.0_121.0: uint8 [0, 0 ...\ndate_time_yday__121.0_209.0: uint8 [1, 0 ...\ndate_time_yday__209.0_286.0: uint8 [0, 0 ...\ndate_time_yday__286.0_365.0: uint8 [0, 1 ...\ndate_time_hour__0.0_10.0: uint8 [0, 0 ...\ndate_time_hour__10.0_14.0: uint8 [0, 1 ...\ndate_time_hour__14.0_18.0: uint8 [1, 0 ...\ndate_time_hour__18.0_23.0: uint8 [0, 0 ...\ndate_time_minute__0.0_15.0: uint8 [0, 0 ...\ndate_time_minute__15.0_30.0: uint8 [1, 0 ...\ndate_time_minute__30.0_45.0: uint8 [0, 0 ...\ndate_time_minute__45.0_59.0: uint8 [0, 1 ...\ndate_time_second__0.0_15.0: uint8 [1, 0 ...\ndate_time_second__15.0_30.0: uint8 [0, 0 ...\ndate_time_second__30.0_45.0: uint8 [0, 1 ...\ndate_time_second__45.0_59.0: uint8 [0, 0 ...\nposa_continent__1: uint8 [0, 0 ...\nposa_continent__2: uint8 [0, 0 ...\nposa_continent__3: uint8 [1, 1 
...\nposa_continent__-OTHER: uint8 [0, 0 ...\nis_mobile__0: uint8 [1, 1 ...\nis_mobile__1: uint8 [0, 0 ...\nis_package__0: uint8 [1, 1 ...\nis_package__1: uint8 [0, 0 ...\nis_booking__0: uint8 [0, 1 ...\nis_booking__1: uint8 [1, 0 ...\nhotel_continent__-OTHER: uint8 [0, 0 ...\nhotel_continent__2: uint8 [1, 1 ...\nhotel_continent__3: uint8 [0, 0 ...\nhotel_continent__4: uint8 [0, 0 ...\nhotel_continent__6: uint8 [0, 0 ...\nhotel_country__-OTHER: uint8 [0, 0 ...\nhotel_country__50: uint8 [1, 1 ...\nhotel_country__8: uint8 [0, 0 ...\nhotel_market__-OTHER: uint8 [1, 1 ...\nhotel_cluster__-OTHER: uint8 [1, 1 ...\ndate_time_year__2013: uint8 [1, 0 ...\ndate_time_year__2014: uint8 [0, 1 ...\ndate_time_year_iso__2013: uint8 [1, 0 ...\ndate_time_year_iso__2014: uint8 [0, 1 ...\ndate_time_year_iso__-OTHER: uint8 [0, 0 ...\ndate_time_yearstart__0: uint8 [1, 1 ...\ndate_time_yearstart__-OTHER: uint8 [0, 0 ...\ndate_time_yearend__0: uint8 [1, 1 ...\ndate_time_yearend__-OTHER: uint8 [0, 0 ...\ndate_time_half__1: uint8 [0, 0 ...\ndate_time_half__2: uint8 [1, 1 ...\ndate_time_quarter__1: uint8 [0, 0 ...\ndate_time_quarter__2: uint8 [0, 0 ...\ndate_time_quarter__3: uint8 [1, 0 ...\ndate_time_quarter__4: uint8 [0, 1 ...\ndate_time_quarteryear__2013Q1: uint8 [0, 0 ...\ndate_time_quarteryear__2013Q2: uint8 [0, 0 ...\ndate_time_quarteryear__2013Q3: uint8 [1, 0 ...\ndate_time_quarteryear__2013Q4: uint8 [0, 0 ...\ndate_time_quarteryear__2014Q1: uint8 [0, 0 ...\ndate_time_quarteryear__2014Q2: uint8 [0, 0 ...\ndate_time_quarteryear__2014Q3: uint8 [0, 0 ...\ndate_time_quarteryear__2014Q4: uint8 [0, 1 ...\ndate_time_quarterstart__0: uint8 [1, 1 ...\ndate_time_quarterstart__-OTHER: uint8 [0, 0 ...\ndate_time_quarterend__0: uint8 [1, 1 ...\ndate_time_quarterend__-OTHER: uint8 [0, 0 ...\ndate_time_month_lbl__April: uint8 [0, 0 ...\ndate_time_month_lbl__August: uint8 [0, 0 ...\ndate_time_month_lbl__December: uint8 [0, 0 ...\ndate_time_month_lbl__February: uint8 [0, 0 ...\ndate_time_month_lbl__January: uint8 [0, 0 ...\ndate_time_month_lbl__July: uint8 [1, 0 ...\ndate_time_month_lbl__June: uint8 [0, 0 ...\ndate_time_month_lbl__March: uint8 [0, 0 ...\ndate_time_month_lbl__May: uint8 [0, 0 ...\ndate_time_month_lbl__November: uint8 [0, 1 ...\ndate_time_month_lbl__October: uint8 [0, 0 ...\ndate_time_month_lbl__September: uint8 [0, 0 ...\ndate_time_monthstart__0: uint8 [1, 1 ...\ndate_time_monthstart__-OTHER: uint8 [0, 0 ...\ndate_time_monthend__0: uint8 [1, 1 ...\ndate_time_monthend__-OTHER: uint8 [0, 0 ...\ndate_time_mweek__1: uint8 [0, 1 ...\ndate_time_mweek__2: uint8 [0, 0 ...\ndate_time_mweek__3: uint8 [0, 0 ...\ndate_time_mweek__4: uint8 [1, 0 ...\ndate_time_mweek__5: uint8 [0, 0 ...\ndate_time_wday__1: uint8 [0, 0 ...\ndate_time_wday__2: uint8 [0, 0 ...\ndate_time_wday__3: uint8 [0, 0 ...\ndate_time_wday__4: uint8 [1, 0 ...\ndate_time_wday__5: uint8 [0, 0 ...\ndate_time_wday__6: uint8 [0, 0 ...\ndate_time_wday__7: uint8 [0, 1 ...\ndate_time_wday_lbl__Friday: uint8 [0, 0 ...\ndate_time_wday_lbl__Monday: uint8 [0, 0 ...\ndate_time_wday_lbl__Saturday: uint8 [0, 0 ...\ndate_time_wday_lbl__Sunday: uint8 [0, 1 ...\ndate_time_wday_lbl__Thursday: uint8 [1, 0 ...\ndate_time_wday_lbl__Tuesday: uint8 [0, 0 ...\ndate_time_wday_lbl__Wednesday: uint8 [0, 0 ...\ndate_time_weekend__0: uint8 [1, 0 ...\ndate_time_weekend__1: uint8 [0, 1 ...\ndate_time_am_pm__am: uint8 [0, 0 ...\ndate_time_am_pm__pm: uint8 [1, 1 ...\n\n\n\n\nStep 2: Correlate the data\nNext, we use correlate() to calculate strength of the relationship. 
The main parameter is target, which should be selected based on the business goal.\nIn this case, we can create a business goal to understand what relates to a website visit count greater than 2. We will select the column: is_booking__1 as the target. This is because we want to know what relates to a hotel room booking via the website search data.\nThis returns a 3 column data frame containing:\n\nfeature: The name of the features\nbin: The bin that corresponds to a bin inside the features\ncorrelation: The strength of the relationship (0 to 1) and the direction of the relationship (+/-)\n\n\nexpedia_ts_correlate_df = expedia_ts_binarized_df.correlate('is_booking__1')\n\nexpedia_ts_correlate_df\n\n\n\n\n\n\n\n\nfeature\nbin\ncorrelation\n\n\n\n\n77\nis_booking\n0\n-1.000000\n\n\n78\nis_booking\n1\n1.000000\n\n\n32\ncnt\n2.0_72.0\n-0.099372\n\n\n31\ncnt\n1.0_2.0\n0.099372\n\n\n75\nis_package\n0\n0.075930\n\n\n...\n...\n...\n...\n\n\n131\ndate_time_monthend\n-OTHER\n0.000182\n\n\n108\ndate_time_quarteryear\n2014Q1\n-0.000041\n\n\n22\nsrch_children_cnt\n0.0_9.0\nNaN\n\n\n87\nhotel_market\n-OTHER\nNaN\n\n\n88\nhotel_cluster\n-OTHER\nNaN\n\n\n\n\n155 rows × 3 columns\n\n\n\n\n\nStep 3: Plot the Correlation funnel\nIt’s in this step where we can visualize review the correlations and determine which features relate to the target, the strength of the relationship (magnitude between 0 and 1), and the direction of the relationship (+/-).\n\nexpedia_ts_correlate_df.plot_correlation_funnel(\n engine = 'plotly',\n height = 800\n)" + "objectID": "tutorials/01_sales_crm.html", + "href": "tutorials/01_sales_crm.html", + "title": "Sales Analysis", + "section": "", + "text": "In this tutorial, we will use pytimetk and its powerful functions to perform a time series analysis on a dataset representing bike sales. Our goal is to understand the patterns in the data and forecast future sales. You will:" }, { - "objectID": "guides/05_augmenting.html", - "href": "guides/05_augmenting.html", - "title": "Adding Features (Augmenting)", - "section": "", - "text": "This section will cover the augment set of functions, use to add many additional time series features to a dataset. 
We’ll cover how to use the following set of functions" + "objectID": "tutorials/01_sales_crm.html#load-packages.", + "href": "tutorials/01_sales_crm.html#load-packages.", + "title": "Sales Analysis", + "section": "1.1 Load Packages.", + "text": "1.1 Load Packages.\nIf you do not have pytimetk installed, you can install by using\npip install pytimetk\nor for the latest features and functionality, you can install the development version.\npip install git+https://github.com/business-science/pytimetk.git\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split" }, { - "objectID": "guides/05_augmenting.html#basic-examples", - "href": "guides/05_augmenting.html#basic-examples", - "title": "Adding Features (Augmenting)", - "section": "1.1 Basic Examples", - "text": "1.1 Basic Examples\nAdd 1 or more lags / leads to a dataset:\n\n\nCode\n# import libraries\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\nimport random\n\n# create sample data\ndates = pd.date_range(start = '2023-09-18', end = '2023-09-24')\nvalues = [random.randint(10, 50) for _ in range(7)]\n\ndf = pd.DataFrame({\n 'date': dates,\n 'value': values\n})\n\ndf\n\n\n\n\n\n\n\n\n\ndate\nvalue\n\n\n\n\n0\n2023-09-18\n25\n\n\n1\n2023-09-19\n50\n\n\n2\n2023-09-20\n49\n\n\n3\n2023-09-21\n45\n\n\n4\n2023-09-22\n48\n\n\n5\n2023-09-23\n18\n\n\n6\n2023-09-24\n18\n\n\n\n\n\n\n\nCreate lag / lead of 3 days:\n\nLagLead\n\n\n\n\nCode\n# augment lag\ndf \\\n .augment_lags(\n date_column = 'date',\n value_column = 'value',\n lags = 3\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_lag_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\n\n\n1\n2023-09-19\n50\nNaN\n\n\n2\n2023-09-20\n49\nNaN\n\n\n3\n2023-09-21\n45\n25.0\n\n\n4\n2023-09-22\n48\n50.0\n\n\n5\n2023-09-23\n18\n49.0\n\n\n6\n2023-09-24\n18\n45.0\n\n\n\n\n\n\n\n\n\n\n\nCode\n# augment leads\ndf \\\n .augment_leads(\n date_column = 'date',\n value_column = 'value',\n leads = 3\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_lead_3\n\n\n\n\n0\n2023-09-18\n25\n45.0\n\n\n1\n2023-09-19\n50\n48.0\n\n\n2\n2023-09-20\n49\n18.0\n\n\n3\n2023-09-21\n45\n18.0\n\n\n4\n2023-09-22\n48\nNaN\n\n\n5\n2023-09-23\n18\nNaN\n\n\n6\n2023-09-24\n18\nNaN\n\n\n\n\n\n\n\n\n\n\nWe can create multiple lag / lead values for a single time series:\n\nLagLead\n\n\n\n\nCode\n# multiple lagged values for a single time series\ndf \\\n .augment_lags(\n date_column = 'date',\n value_column = 'value',\n lags = (1, 3)\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_lag_1\nvalue_lag_2\nvalue_lag_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\nNaN\nNaN\n\n\n1\n2023-09-19\n50\n25.0\nNaN\nNaN\n\n\n2\n2023-09-20\n49\n50.0\n25.0\nNaN\n\n\n3\n2023-09-21\n45\n49.0\n50.0\n25.0\n\n\n4\n2023-09-22\n48\n45.0\n49.0\n50.0\n\n\n5\n2023-09-23\n18\n48.0\n45.0\n49.0\n\n\n6\n2023-09-24\n18\n18.0\n48.0\n45.0\n\n\n\n\n\n\n\n\n\n\n\nCode\n# multiple leads values for a single time series\ndf \\\n .augment_leads(\n date_column = 'date',\n value_column = 'value',\n leads = (1, 3)\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_lead_1\nvalue_lead_2\nvalue_lead_3\n\n\n\n\n0\n2023-09-18\n25\n50.0\n49.0\n45.0\n\n\n1\n2023-09-19\n50\n49.0\n45.0\n48.0\n\n\n2\n2023-09-20\n49\n45.0\n48.0\n18.0\n\n\n3\n2023-09-21\n45\n48.0\n18.0\n18.0\n\n\n4\n2023-09-22\n48\n18.0\n18.0\nNaN\n\n\n5\n2023-09-23\n18\n18.0\nNaN\nNaN\n\n\n6\n2023-09-24\n18\nNaN\nNaN\nNaN" + "objectID": "tutorials/01_sales_crm.html#load-inspect-dataset", + "href": "tutorials/01_sales_crm.html#load-inspect-dataset", + 
"title": "Sales Analysis", + "section": "1.2 Load & inspect dataset", + "text": "1.2 Load & inspect dataset\nTo kick off our analysis, we’ll begin by importing essential libraries and accessing the ‘bike_sales’ dataset available within pytimetk’s suite of built-in datasets.\nThe Bike Sales dataset exemplifies what one might find in a CRM (Customer Relationship Management) system. CRM systems are pivotal for businesses, offering vital insights by tracking sales throughout the entire sales funnel. Such datasets are rich with transaction-level data, encompassing elements like order numbers, individual order lines, customer details, product information, and specific transaction data.\nTransactional data, such as this, inherently holds the essential components for time series analysis:\n\nTime Stamps\nAssociated Values\nDistinct Groups or Categories\n\nGiven these attributes, the Bike Sales dataset emerges as an ideal candidate for analysis using pytimetk." }, { - "objectID": "guides/05_augmenting.html#augment-lags-leads-for-grouped-time-series", - "href": "guides/05_augmenting.html#augment-lags-leads-for-grouped-time-series", - "title": "Adding Features (Augmenting)", - "section": "1.2 Augment Lags / Leads For Grouped Time Series", - "text": "1.2 Augment Lags / Leads For Grouped Time Series\naugment_lags() and augment_leads() also works for grouped time series data. Lets use the m4_daily_df dataset to showcase examples:\n\n\nCode\n# load m4_daily_df\nm4_daily_df = tk.load_dataset('m4_daily', parse_dates = ['date'])\n\n\n\nLagLead\n\n\n\n\nCode\n# agument lags for grouped time series\nm4_daily_df \\\n .groupby(\"id\") \\\n .augment_lags(\n date_column = 'date',\n value_column = 'value',\n lags = (1, 7)\n )\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_lag_1\nvalue_lag_2\nvalue_lag_3\nvalue_lag_4\nvalue_lag_5\nvalue_lag_6\nvalue_lag_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2076.2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n2\nD10\n2014-07-05\n2048.7\n2073.4\n2076.2\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.7\n2073.4\n2076.2\nNaN\nNaN\nNaN\nNaN\n\n\n4\nD10\n2014-07-07\n2006.4\n2048.9\n2048.7\n2073.4\n2076.2\nNaN\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9431.9\n9437.7\n9474.6\n9359.2\n9286.9\n9265.4\n9091.4\n\n\n9739\nD500\n2012-09-20\n9365.7\n9418.8\n9431.9\n9437.7\n9474.6\n9359.2\n9286.9\n9265.4\n\n\n9740\nD500\n2012-09-21\n9445.9\n9365.7\n9418.8\n9431.9\n9437.7\n9474.6\n9359.2\n9286.9\n\n\n9741\nD500\n2012-09-22\n9497.9\n9445.9\n9365.7\n9418.8\n9431.9\n9437.7\n9474.6\n9359.2\n\n\n9742\nD500\n2012-09-23\n9545.3\n9497.9\n9445.9\n9365.7\n9418.8\n9431.9\n9437.7\n9474.6\n\n\n\n\n9743 rows × 10 columns\n\n\n\n\n\n\n\nCode\n# augment leads for grouped time series\nm4_daily_df \\\n .groupby(\"id\") \\\n .augment_leads(\n date_column = 'date',\n value_column = 'value',\n leads = (1, 7)\n 
)\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_lead_1\nvalue_lead_2\nvalue_lead_3\nvalue_lead_4\nvalue_lead_5\nvalue_lead_6\nvalue_lead_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2073.4\n2048.7\n2048.9\n2006.4\n2017.6\n2019.1\n2007.4\n\n\n1\nD10\n2014-07-04\n2073.4\n2048.7\n2048.9\n2006.4\n2017.6\n2019.1\n2007.4\n2010.0\n\n\n2\nD10\n2014-07-05\n2048.7\n2048.9\n2006.4\n2017.6\n2019.1\n2007.4\n2010.0\n2001.5\n\n\n3\nD10\n2014-07-06\n2048.9\n2006.4\n2017.6\n2019.1\n2007.4\n2010.0\n2001.5\n1978.8\n\n\n4\nD10\n2014-07-07\n2006.4\n2017.6\n2019.1\n2007.4\n2010.0\n2001.5\n1978.8\n1988.3\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9365.7\n9445.9\n9497.9\n9545.3\nNaN\nNaN\nNaN\n\n\n9739\nD500\n2012-09-20\n9365.7\n9445.9\n9497.9\n9545.3\nNaN\nNaN\nNaN\nNaN\n\n\n9740\nD500\n2012-09-21\n9445.9\n9497.9\n9545.3\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n9741\nD500\n2012-09-22\n9497.9\n9545.3\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n9742\nD500\n2012-09-23\n9545.3\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n\n\n9743 rows × 10 columns" + "objectID": "tutorials/01_sales_crm.html#initial-inspection-with-tk.glimpse", + "href": "tutorials/01_sales_crm.html#initial-inspection-with-tk.glimpse", + "title": "Sales Analysis", + "section": "2.1 Initial Inspection with tk.glimpse", + "text": "2.1 Initial Inspection with tk.glimpse\nTo get a preliminary understanding of our data, let’s utilize the tk.glimpse() function from pytimetk. This will provide us with a snapshot of the available fields, their respective data types, and a sneak peek into the data entries.\n\n\nCode\ndf = tk.datasets.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 2466 rows of 13 columns\norder_id: int64 [1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, ...\norder_line: int64 [1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, ...\norder_date: datetime64[ns] [Timestamp('2011-01-07 00:00:00'), Ti ...\nquantity: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, ...\nprice: int64 [6070, 5970, 2770, 5970, 10660, 3200, ...\ntotal_price: int64 [6070, 5970, 2770, 5970, 10660, 3200, ...\nmodel: object ['Jekyll Carbon 2', 'Trigger Carbon 2 ...\ncategory_1: object ['Mountain', 'Mountain', 'Mountain', ...\ncategory_2: object ['Over Mountain', 'Over Mountain', 'T ...\nframe_material: object ['Carbon', 'Carbon', 'Aluminum', 'Car ...\nbikeshop_name: object ['Ithaca Mountain Climbers', 'Ithaca ...\ncity: object ['Ithaca', 'Ithaca', 'Kansas City', ' ...\nstate: object ['NY', 'NY', 'KS', 'KS', 'KY', 'KY', ..." 
}, { - "objectID": "guides/05_augmenting.html#basic-examples-1", - "href": "guides/05_augmenting.html#basic-examples-1", - "title": "Adding Features (Augmenting)", - "section": "2.1 Basic Examples", - "text": "2.1 Basic Examples\nWe’ll continue with the use of our sample df created earlier:\n\n\nCode\n# window = 3 days, window function = mean\ndf \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = 3,\n window_func = 'mean'\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_rolling_mean_win_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\n\n\n1\n2023-09-19\n50\nNaN\n\n\n2\n2023-09-20\n49\n41.333333\n\n\n3\n2023-09-21\n45\n48.000000\n\n\n4\n2023-09-22\n48\n47.333333\n\n\n5\n2023-09-23\n18\n37.000000\n\n\n6\n2023-09-24\n18\n28.000000\n\n\n\n\n\n\n\nIt is important to understand how the center parameter in augment_rolling() works.\n\n\n\n\n\n\ncenter\n\n\n\n\n\nWhen set to True (default) the value of the rolling window will be centered, meaning that the value at the center of the window will be used as the result. When set to False (default) the rolling window will not be centered, meaning that the value at the end of the window will be used as the result.\n\n\n\nLets see an example:\n\nAugment Rolling: Center = TrueAugment Rolling: Center = False\n\n\n\n\nCode\n# agument rolling: center = true\ndf \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = 3,\n window_func = 'mean',\n center = True\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_rolling_mean_win_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\n\n\n1\n2023-09-19\n50\n41.333333\n\n\n2\n2023-09-20\n49\n48.000000\n\n\n3\n2023-09-21\n45\n47.333333\n\n\n4\n2023-09-22\n48\n37.000000\n\n\n5\n2023-09-23\n18\n28.000000\n\n\n6\n2023-09-24\n18\nNaN\n\n\n\n\n\n\n\nNote that we are using a 3 day rolling window and applying a mean to value. In simplier terms, value_rolling_mean_win_3 is a 3 day rolling average of value with center set to True. Thus the function starts computing the mean from 2023-09-19\n\n\n\n\nCode\n# agument rolling: center = false\ndf \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = 3,\n window_func = 'mean',\n center = False\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_rolling_mean_win_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\n\n\n1\n2023-09-19\n50\nNaN\n\n\n2\n2023-09-20\n49\n41.333333\n\n\n3\n2023-09-21\n45\n48.000000\n\n\n4\n2023-09-22\n48\n47.333333\n\n\n5\n2023-09-23\n18\n37.000000\n\n\n6\n2023-09-24\n18\n28.000000\n\n\n\n\n\n\n\nNote that we are using a 3 day rolling window and applying a mean to value. In simplier terms, value_rolling_mean_win_3 is a 3 day rolling average of value with center set to False. Thus the function starts computing the mean from 2023-09-20. The same value for 2023-19-18 and 2023-09-19 are returned as value_rolling_mean_win_3 since it did not detected the third to apply the 3 day rolling average." + "objectID": "tutorials/01_sales_crm.html#data-exploration-with-tk.summarize_by_time", + "href": "tutorials/01_sales_crm.html#data-exploration-with-tk.summarize_by_time", + "title": "Sales Analysis", + "section": "2.2 Data Exploration with tk.summarize_by_time", + "text": "2.2 Data Exploration with tk.summarize_by_time\nCRM data is often bustling with activity, reflecting the myriad of transactions happening daily. Due to this high volume, the data can sometimes seem overwhelming or noisy. To derive meaningful insights, it’s essential to aggregate this data over specific time intervals. 
This is where tk.summarize_by_time() comes into play.\nThe tk.summarize_by_time() function offers a streamlined approach to time-based data aggregation. By defining a desired frequency and an aggregation method, this function seamlessly organizes your data. The beauty of it is its versatility; from a broad array of built-in aggregation methods and frequencies to the flexibility of integrating a custom function, it caters to a range of requirements.\n\n\n\n\n\n\nGetting to know tk.summarize_by_time()\n\n\n\n\n\nCurious about the various options it provides?\n\nClick here to see our Data Wrangling Guide\nUse help(tk.summarize_by_time) to review additional helpful documentation. And explore the plethora of possibilities!\n\n\n\n\n\nGetting Weekly Totals\nWe can quickly get totals by week with summarize_byt_time.\n\n\nCode\nweekly_totals = df.summarize_by_time(\n date_column = 'order_date',\n value_column = 'total_price',\n agg_func = ['sum'],\n freq = 'W'\n)\n\nweekly_totals.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\n\n\n\n\n0\n2011-01-09\n12040\n\n\n1\n2011-01-16\n151460\n\n\n2\n2011-01-23\n143850\n\n\n3\n2011-01-30\n175665\n\n\n4\n2011-02-06\n105210\n\n\n5\n2011-02-13\n250390\n\n\n6\n2011-02-20\n410595\n\n\n7\n2011-02-27\n254045\n\n\n8\n2011-03-06\n308420\n\n\n9\n2011-03-13\n45450\n\n\n\n\n\n\n\n\n\nGet Weekly Totals by Group (Category 2)\nTo better understand your data, you might want to add groups to this summary. We can include a groupby before the summarize_by_time and then aggregate our data.\n\n\nCode\n sales_by_week = df \\\n .groupby('category_2') \\\n .summarize_by_time(\n date_column = 'order_date',\n value_column = 'total_price',\n agg_func = ['sum'],\n freq = 'W'\n )\n\nsales_by_week.head(10)\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\n\n\n\n\n0\nCross Country Race\n2011-01-16\n61750\n\n\n1\nCross Country Race\n2011-01-23\n25050\n\n\n2\nCross Country Race\n2011-01-30\n56860\n\n\n3\nCross Country Race\n2011-02-06\n8740\n\n\n4\nCross Country Race\n2011-02-13\n78070\n\n\n5\nCross Country Race\n2011-02-20\n115010\n\n\n6\nCross Country Race\n2011-02-27\n64290\n\n\n7\nCross Country Race\n2011-03-06\n95070\n\n\n8\nCross Country Race\n2011-03-13\n3200\n\n\n9\nCross Country Race\n2011-03-20\n21170\n\n\n\n\n\n\n\n\n\nLong vs Wide Format\nThis long format can make it a little hard to compare the different group values visually, so instead of long-format you might want to pivot wide to view the data.\n\n\nCode\nsales_by_week_wide = df \\\n .groupby('category_2') \\\n .summarize_by_time(\n date_column = 'order_date',\n value_column = 'total_price',\n agg_func = ['sum'],\n freq = 'W',\n wide_format = True\n )\n\nsales_by_week_wide.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum_Cross Country Race\ntotal_price_sum_Cyclocross\ntotal_price_sum_Elite Road\ntotal_price_sum_Endurance Road\ntotal_price_sum_Fat Bike\ntotal_price_sum_Over 
Mountain\ntotal_price_sum_Sport\ntotal_price_sum_Trail\ntotal_price_sum_Triathalon\n\n\n\n\n0\n2011-01-09\n0.0\n0.0\n0.0\n0.0\n0.0\n12040.0\n0.0\n0.0\n0.0\n\n\n1\n2011-01-16\n61750.0\n1960.0\n49540.0\n11110.0\n0.0\n9170.0\n4030.0\n7450.0\n6450.0\n\n\n2\n2011-01-23\n25050.0\n3500.0\n51330.0\n47930.0\n0.0\n3840.0\n0.0\n0.0\n12200.0\n\n\n3\n2011-01-30\n56860.0\n2450.0\n43895.0\n24160.0\n0.0\n10880.0\n3720.0\n26700.0\n7000.0\n\n\n4\n2011-02-06\n8740.0\n7000.0\n35640.0\n22680.0\n3730.0\n14270.0\n980.0\n10220.0\n1950.0\n\n\n5\n2011-02-13\n78070.0\n0.0\n83780.0\n24820.0\n2130.0\n17160.0\n6810.0\n17120.0\n20500.0\n\n\n6\n2011-02-20\n115010.0\n7910.0\n79770.0\n27650.0\n26100.0\n37830.0\n10925.0\n96250.0\n9150.0\n\n\n7\n2011-02-27\n64290.0\n6650.0\n86900.0\n31900.0\n5860.0\n22070.0\n6165.0\n16410.0\n13800.0\n\n\n8\n2011-03-06\n95070.0\n2450.0\n31990.0\n47660.0\n5860.0\n82060.0\n9340.0\n26790.0\n7200.0\n\n\n9\n2011-03-13\n3200.0\n4200.0\n23110.0\n7260.0\n0.0\n5970.0\n1710.0\n0.0\n0.0\n\n\n\n\n\n\n\nYou can now observe the total sales for each product side by side. This streamlined view facilitates easy comparison between product sales." }, { - "objectID": "guides/05_augmenting.html#augment-rolling-with-multiple-windows-and-window-functions", - "href": "guides/05_augmenting.html#augment-rolling-with-multiple-windows-and-window-functions", - "title": "Adding Features (Augmenting)", - "section": "2.2 Augment Rolling with Multiple Windows and Window Functions", - "text": "2.2 Augment Rolling with Multiple Windows and Window Functions\nMultiple window functions can be passed to the window and window_func parameters:\n\n\nCode\n# augment rolling: window of 2 & 7 days, window_func of mean and standard deviation\nm4_daily_df \\\n .query('id == \"D10\"') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = [2,7],\n window_func = ['mean', ('std', lambda x: x.std())]\n )\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_7\nvalue_rolling_std_win_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\nNaN\nNaN\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.80\n1.40\n2074.800000\n1.400000\n\n\n2\nD10\n2014-07-05\n2048.7\n2061.05\n12.35\n2066.100000\n12.356645\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.80\n0.10\n2061.800000\n13.037830\n\n\n4\nD10\n2014-07-07\n2006.4\n2027.65\n21.25\n2050.720000\n25.041038\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n669\nD10\n2016-05-02\n2630.7\n2615.85\n14.85\n2579.471429\n28.868159\n\n\n670\nD10\n2016-05-03\n2649.3\n2640.00\n9.30\n2594.800000\n33.081631\n\n\n671\nD10\n2016-05-04\n2631.8\n2640.55\n8.75\n2601.371429\n35.145563\n\n\n672\nD10\n2016-05-05\n2622.5\n2627.15\n4.65\n2607.457143\n34.584508\n\n\n673\nD10\n2016-05-06\n2620.1\n2621.30\n1.20\n2618.328571\n22.923270\n\n\n\n\n674 rows × 7 columns" + "objectID": "tutorials/01_sales_crm.html#visualize-your-time-series-data-with-tk.plot_timeseries", + "href": "tutorials/01_sales_crm.html#visualize-your-time-series-data-with-tk.plot_timeseries", + "title": "Sales Analysis", + "section": "2.3 Visualize your time series data with tk.plot_timeseries", + "text": "2.3 Visualize your time series data with tk.plot_timeseries\nYou can now visualize the summarized data to gain a clearer insight into the prevailing trends.\n\nPlotlyPlotnine\n\n\n\n\nCode\nsales_by_week \\\n .groupby('category_2') \\\n .plot_timeseries(\n date_column = 'order_date', \n value_column = 'total_price_sum',\n title = 'Bike Sales by Category',\n facet_ncol = 2,\n facet_scales = \"free\",\n 
y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1000,\n height = 800,\n y_lab = 'Total Sales', \n engine = 'plotly'\n )\n\n\n\n \n\n\n\n\n\n\nCode\nsales_by_week \\\n .groupby('category_2') \\\n .plot_timeseries(\n date_column = 'order_date', \n value_column = 'total_price_sum',\n title = 'Bike Sales by Category',\n facet_ncol = 2,\n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1000,\n height = 800,\n y_lab = 'Total Sales', \n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1000 x 800)>\n\n\n\n\n\nThe graph showcases a pronounced uptick in sales for most of the different bike products during the summer. It’s a natural trend, aligning with our understanding that people gravitate towards biking during the balmy summer days. Conversely, as the chill of winter sets in at the year’s start and end, we observe a corresponding dip in sales.\nIt’s worth highlighting the elegance of the plot_timeseries function. Beyond just plotting raw data, it introduces a smoother, accentuating underlying trends and making them more discernible. This enhancement ensures we can effortlessly capture and comprehend the cyclical nature of bike sales throughout the year." }, { - "objectID": "guides/05_augmenting.html#augment-rolling-with-grouped-time-series", - "href": "guides/05_augmenting.html#augment-rolling-with-grouped-time-series", - "title": "Adding Features (Augmenting)", - "section": "2.3 Augment Rolling with Grouped Time Series", - "text": "2.3 Augment Rolling with Grouped Time Series\nagument_rolling can be used on grouped time series data:\n\n\nCode\n## augment rolling on grouped time series: window of 2 & 7 days, window_func of mean and standard deviation\nm4_daily_df \\\n .groupby('id') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = [2,7],\n window_func = ['mean', ('std', lambda x: x.std())]\n )\n\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_7\nvalue_rolling_std_win_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\nNaN\nNaN\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.80\n1.40\n2074.800000\n1.400000\n\n\n2\nD10\n2014-07-05\n2048.7\n2061.05\n12.35\n2066.100000\n12.356645\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.80\n0.10\n2061.800000\n13.037830\n\n\n4\nD10\n2014-07-07\n2006.4\n2027.65\n21.25\n2050.720000\n25.041038\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9425.35\n6.55\n9382.071429\n74.335988\n\n\n9739\nD500\n2012-09-20\n9365.7\n9392.25\n26.55\n9396.400000\n58.431303\n\n\n9740\nD500\n2012-09-21\n9445.9\n9405.80\n40.10\n9419.114286\n39.184451\n\n\n9741\nD500\n2012-09-22\n9497.9\n9471.90\n26.00\n9438.928571\n38.945336\n\n\n9742\nD500\n2012-09-23\n9545.3\n9521.60\n23.70\n9449.028571\n53.379416\n\n\n\n\n9743 rows × 7 columns" + "objectID": "tutorials/01_sales_crm.html#making-irregular-data-regular-with-tk.pad_by_time", + "href": "tutorials/01_sales_crm.html#making-irregular-data-regular-with-tk.pad_by_time", + "title": "Sales Analysis", + "section": "3.1 Making irregular data regular with tk.pad_by_time", + "text": "3.1 Making irregular data regular with tk.pad_by_time\nKicking off our journey, we’ll utilize pytimetk’s tk.pad_by_time() function. For this, grouping by the ‘category_1’ variable is recommended. Moreover, it’s prudent to establish a definitive end date. 
This ensures that all groups are equipped with training data up to the most recent date, accommodating scenarios where certain categories might have seen no sales in the final training week. By doing so, we create a representative observation for every group, capturing the nuances of each category’s sales pattern.\n\n\nCode\nsales_padded = sales_by_week \\\n .groupby('category_2') \\\n .pad_by_time(\n date_column = 'order_date',\n freq = 'W',\n end_date = sales_by_week.order_date.max()\n )\nsales_padded\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\n\n\n\n\n0\nCross Country Race\n2011-01-09\nNaN\n\n\n1\nCross Country Race\n2011-01-16\n61750.0\n\n\n2\nCross Country Race\n2011-01-23\n25050.0\n\n\n3\nCross Country Race\n2011-01-30\n56860.0\n\n\n4\nCross Country Race\n2011-02-06\n8740.0\n\n\n...\n...\n...\n...\n\n\n463\nTriathalon\n2011-12-04\n3200.0\n\n\n464\nTriathalon\n2011-12-11\n28350.0\n\n\n465\nTriathalon\n2011-12-18\n2700.0\n\n\n466\nTriathalon\n2011-12-25\n3900.0\n\n\n467\nTriathalon\n2012-01-01\nNaN\n\n\n\n\n468 rows × 3 columns" }, { - "objectID": "guides/05_augmenting.html#basic-example", - "href": "guides/05_augmenting.html#basic-example", - "title": "Adding Features (Augmenting)", - "section": "3.1 Basic Example", - "text": "3.1 Basic Example\nWe’ll showcase an example using the m4_daily_df dataset by generating 29 additional features from the date column:\n\n\nCode\n# augment time series signature\nm4_daily_df \\\n .query('id == \"D10\"') \\\n .augment_timeseries_signature(\n date_column = 'date'\n ) \\\n .head()\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n5 rows × 32 columns" + "objectID": "tutorials/01_sales_crm.html#making-future-dates-easier-with-tk.future_frame", + "href": "tutorials/01_sales_crm.html#making-future-dates-easier-with-tk.future_frame", + "title": "Sales Analysis", + "section": "3.2 Making Future Dates Easier with tk.future_frame", + "text": "3.2 Making Future Dates Easier with tk.future_frame\nMoving on, let’s set up the future frame, which will serve as our test dataset. To achieve this, employ the tk.future_frame() method. 
This function allows for the specification of a grouping column and a forecast horizon.\nUpon invoking tk.future_frame(), you’ll observe that placeholders (null values) are added for each group, extending 12 weeks into the future.\n\n\nCode\ndf_with_futureframe = sales_padded \\\n .groupby('category_2') \\\n .future_frame(\n date_column = 'order_date',\n length_out = 12\n )\ndf_with_futureframe\n\n\n\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\n\n\n\n\n0\nCross Country Race\n2011-01-09\nNaN\n\n\n1\nCross Country Race\n2011-01-16\n61750.0\n\n\n2\nCross Country Race\n2011-01-23\n25050.0\n\n\n3\nCross Country Race\n2011-01-30\n56860.0\n\n\n4\nCross Country Race\n2011-02-06\n8740.0\n\n\n...\n...\n...\n...\n\n\n571\nTriathalon\n2012-02-26\nNaN\n\n\n572\nTriathalon\n2012-03-04\nNaN\n\n\n573\nTriathalon\n2012-03-11\nNaN\n\n\n574\nTriathalon\n2012-03-18\nNaN\n\n\n575\nTriathalon\n2012-03-25\nNaN\n\n\n\n\n576 rows × 3 columns" }, { - "objectID": "guides/05_augmenting.html#basic-example-1", - "href": "guides/05_augmenting.html#basic-example-1", - "title": "Adding Features (Augmenting)", - "section": "4.1 Basic Example", - "text": "4.1 Basic Example\nWe’ll showcase an example using some sample data:\n\n\nCode\n# create sample data\ndates = pd.date_range(start = '2022-12-25', end = '2023-01-05')\n\ndf = pd.DataFrame({'date': dates})\n\n# augment time series signature: USA\ndf \\\n .augment_holiday_signature(\n date_column = 'date',\n country_name = 'UnitedStates'\n )\n\n\n\n\n\n\n\n\n\ndate\nis_holiday\nbefore_holiday\nafter_holiday\nholiday_name\n\n\n\n\n0\n2022-12-25\n1\n1\n0\nChristmas Day\n\n\n1\n2022-12-26\n1\n0\n1\nChristmas Day (Observed)\n\n\n2\n2022-12-27\n0\n0\n1\nNaN\n\n\n3\n2022-12-28\n0\n0\n0\nNaN\n\n\n4\n2022-12-29\n0\n0\n0\nNaN\n\n\n5\n2022-12-30\n0\n0\n0\nNaN\n\n\n6\n2022-12-31\n0\n1\n0\nNaN\n\n\n7\n2023-01-01\n1\n1\n0\nNew Year's Day\n\n\n8\n2023-01-02\n1\n0\n1\nNew Year's Day (Observed)\n\n\n9\n2023-01-03\n0\n0\n1\nNaN\n\n\n10\n2023-01-04\n0\n0\n0\nNaN\n\n\n11\n2023-01-05\n0\n0\n0\nNaN" + "objectID": "tutorials/01_sales_crm.html#lag-values-with-tk.augment_lags", + "href": "tutorials/01_sales_crm.html#lag-values-with-tk.augment_lags", + "title": "Sales Analysis", + "section": "3.3 Lag Values with tk.augment_lags", + "text": "3.3 Lag Values with tk.augment_lags\nCrafting features from time series data can be intricate, but thanks to the suite of feature engineering tools in pytimetk, the process is streamlined and intuitive.\nIn this guide, we’ll focus on the basics: introducing a few lag variables and incorporating some date-related features.\nFirstly, let’s dive into creating lag features.\nGiven our forecasting objective of a 12-week horizon, to ensure we have lag data available for every future point, we should utilize a lag of 12 or more. The beauty of the toolkit is that it supports the addition of multiple lags simultaneously.\nLag features play a pivotal role in machine learning for time series. Often, recent data offers valuable insights into future trends. To capture this recency effect, it’s crucial to integrate lag values. 
For this purpose, tk.augment_lags() comes in handy.\n\n\nCode\ndf_with_lags = df_with_futureframe \\\n .groupby('category_2') \\\n .augment_lags(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n lags = [12,24]\n\n )\ndf_with_lags.head(25)\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\n\n\n\n\n0\nCross Country Race\n2011-01-09\nNaN\nNaN\nNaN\n\n\n1\nCross Country Race\n2011-01-16\n61750.0\nNaN\nNaN\n\n\n2\nCross Country Race\n2011-01-23\n25050.0\nNaN\nNaN\n\n\n3\nCross Country Race\n2011-01-30\n56860.0\nNaN\nNaN\n\n\n4\nCross Country Race\n2011-02-06\n8740.0\nNaN\nNaN\n\n\n5\nCross Country Race\n2011-02-13\n78070.0\nNaN\nNaN\n\n\n6\nCross Country Race\n2011-02-20\n115010.0\nNaN\nNaN\n\n\n7\nCross Country Race\n2011-02-27\n64290.0\nNaN\nNaN\n\n\n8\nCross Country Race\n2011-03-06\n95070.0\nNaN\nNaN\n\n\n9\nCross Country Race\n2011-03-13\n3200.0\nNaN\nNaN\n\n\n10\nCross Country Race\n2011-03-20\n21170.0\nNaN\nNaN\n\n\n11\nCross Country Race\n2011-03-27\n28990.0\nNaN\nNaN\n\n\n12\nCross Country Race\n2011-04-03\n51860.0\nNaN\nNaN\n\n\n13\nCross Country Race\n2011-04-10\n85910.0\n61750.0\nNaN\n\n\n14\nCross Country Race\n2011-04-17\n138230.0\n25050.0\nNaN\n\n\n15\nCross Country Race\n2011-04-24\n138350.0\n56860.0\nNaN\n\n\n16\nCross Country Race\n2011-05-01\n136090.0\n8740.0\nNaN\n\n\n17\nCross Country Race\n2011-05-08\n32110.0\n78070.0\nNaN\n\n\n18\nCross Country Race\n2011-05-15\n139010.0\n115010.0\nNaN\n\n\n19\nCross Country Race\n2011-05-22\n2060.0\n64290.0\nNaN\n\n\n20\nCross Country Race\n2011-05-29\n26130.0\n95070.0\nNaN\n\n\n21\nCross Country Race\n2011-06-05\n30360.0\n3200.0\nNaN\n\n\n22\nCross Country Race\n2011-06-12\n88280.0\n21170.0\nNaN\n\n\n23\nCross Country Race\n2011-06-19\n109470.0\n28990.0\nNaN\n\n\n24\nCross Country Race\n2011-06-26\n107280.0\n51860.0\nNaN\n\n\n\n\n\n\n\nObserve that lag values of 12 and 24 introduce missing entries at the dataset’s outset. This occurs because there isn’t available data from 12 or 24 weeks prior. To address these gaps, you can adopt one of two strategies:\n\nDiscard the Affected Rows: This is a recommended approach if your dataset is sufficiently large. Removing a few initial rows might not significantly impact the training process.\nBackfill Missing Values: In situations with limited data, you might consider backfilling these nulls using the first available values from lag 12 and 24. However, the appropriateness of this technique hinges on your specific context and objectives.\n\nFor the scope of this tutorial, we’ll opt to remove these rows. 
However, it’s worth pointing out that our dataset is quite small with limited historical data, so this might impact our model.\n\n\nCode\nlag_columns = [col for col in df_with_lags.columns if 'lag' in col]\ndf_no_nas = df_with_lags \\\n .dropna(subset=lag_columns, inplace=False)\n\ndf_no_nas.head()\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\n\n\n\n\n25\nCross Country Race\n2011-07-03\n56430.0\n85910.0\n61750.0\n\n\n26\nCross Country Race\n2011-07-10\n62320.0\n138230.0\n25050.0\n\n\n27\nCross Country Race\n2011-07-17\n141620.0\n138350.0\n56860.0\n\n\n28\nCross Country Race\n2011-07-24\n75720.0\n136090.0\n8740.0\n\n\n29\nCross Country Race\n2011-07-31\n21240.0\n32110.0\n78070.0" }, { - "objectID": "guides/05_augmenting.html#basic-example-2", - "href": "guides/05_augmenting.html#basic-example-2", - "title": "Adding Features (Augmenting)", - "section": "5.1 Basic Example", - "text": "5.1 Basic Example\n\n\nCode\n# augment fourier with 7 periods and max order of 1\n#m4_daily_df \\\n# .query('id == \"D10\"') \\\n# .augment_fourier(\n# date_column = 'date',\n# value_column = 'value',\n# num_periods = 7,\n# max_order = 1\n# ) \\\n# .head(20)\n\n\nNotice the additional value_fourier_1_1 to value_fourier_1_7 colums that have been added to the data." + "objectID": "tutorials/01_sales_crm.html#date-features-with-tk.augment_timeseries_signature", + "href": "tutorials/01_sales_crm.html#date-features-with-tk.augment_timeseries_signature", + "title": "Sales Analysis", + "section": "3.4 Date Features with tk.augment_timeseries_signature", + "text": "3.4 Date Features with tk.augment_timeseries_signature\nNow, let’s enrich our dataset with date-related features.\nWith the function tk.augment_timeseries_signature(), you can effortlessly append 29 date attributes to a timestamp. Given that our dataset captures weekly intervals, certain attributes like ‘hour’ may not be pertinent. 
Thus, it’s prudent to refine our columns, retaining only those that truly matter to our analysis.\n\n\nCode\ndf_with_datefeatures = df_no_nas \\\n .augment_timeseries_signature(date_column='order_date')\n\ndf_with_datefeatures.head(10)\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_index_num\norder_date_year\norder_date_year_iso\norder_date_yearstart\norder_date_yearend\n...\norder_date_mday\norder_date_qday\norder_date_yday\norder_date_weekend\norder_date_hour\norder_date_minute\norder_date_second\norder_date_msecond\norder_date_nsecond\norder_date_am_pm\n\n\n\n\n25\nCross Country Race\n2011-07-03\n56430.0\n85910.0\n61750.0\n1309651200\n2011\n2011\n0\n0\n...\n3\n3\n184\n1\n0\n0\n0\n0\n0\nam\n\n\n26\nCross Country Race\n2011-07-10\n62320.0\n138230.0\n25050.0\n1310256000\n2011\n2011\n0\n0\n...\n10\n10\n191\n1\n0\n0\n0\n0\n0\nam\n\n\n27\nCross Country Race\n2011-07-17\n141620.0\n138350.0\n56860.0\n1310860800\n2011\n2011\n0\n0\n...\n17\n17\n198\n1\n0\n0\n0\n0\n0\nam\n\n\n28\nCross Country Race\n2011-07-24\n75720.0\n136090.0\n8740.0\n1311465600\n2011\n2011\n0\n0\n...\n24\n24\n205\n1\n0\n0\n0\n0\n0\nam\n\n\n29\nCross Country Race\n2011-07-31\n21240.0\n32110.0\n78070.0\n1312070400\n2011\n2011\n0\n0\n...\n31\n31\n212\n1\n0\n0\n0\n0\n0\nam\n\n\n30\nCross Country Race\n2011-08-07\n11620.0\n139010.0\n115010.0\n1312675200\n2011\n2011\n0\n0\n...\n7\n38\n219\n1\n0\n0\n0\n0\n0\nam\n\n\n31\nCross Country Race\n2011-08-14\n9730.0\n2060.0\n64290.0\n1313280000\n2011\n2011\n0\n0\n...\n14\n45\n226\n1\n0\n0\n0\n0\n0\nam\n\n\n32\nCross Country Race\n2011-08-21\n22780.0\n26130.0\n95070.0\n1313884800\n2011\n2011\n0\n0\n...\n21\n52\n233\n1\n0\n0\n0\n0\n0\nam\n\n\n33\nCross Country Race\n2011-08-28\n53680.0\n30360.0\n3200.0\n1314489600\n2011\n2011\n0\n0\n...\n28\n59\n240\n1\n0\n0\n0\n0\n0\nam\n\n\n34\nCross Country Race\n2011-09-04\n38360.0\n88280.0\n21170.0\n1315094400\n2011\n2011\n0\n0\n...\n4\n66\n247\n1\n0\n0\n0\n0\n0\nam\n\n\n\n\n10 rows × 34 columns\n\n\n\nWe can quickly get a sense of what features were just created using tk.glimpse.\n\n\nCode\ndf_with_datefeatures.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 341 rows of 34 columns\ncategory_2: object ['Cross Country Race', 'Cros ...\norder_date: datetime64[ns] [Timestamp('2011-07-03 00:00 ...\ntotal_price_sum: float64 [56430.0, 62320.0, 141620.0, ...\ntotal_price_sum_lag_12: float64 [85910.0, 138230.0, 138350.0 ...\ntotal_price_sum_lag_24: float64 [61750.0, 25050.0, 56860.0, ...\norder_date_index_num: int64 [1309651200, 1310256000, 131 ...\norder_date_year: int64 [2011, 2011, 2011, 2011, 201 ...\norder_date_year_iso: UInt32 [2011, 2011, 2011, 2011, 201 ...\norder_date_yearstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_yearend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_leapyear: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_half: int64 [2, 2, 2, 2, 2, 2, 2, 2, 2, ...\norder_date_quarter: int64 [3, 3, 3, 3, 3, 3, 3, 3, 3, ...\norder_date_quarteryear: object ['2011Q3', '2011Q3', '2011Q3 ...\norder_date_quarterstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_quarterend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_month: int64 [7, 7, 7, 7, 7, 8, 8, 8, 8, ...\norder_date_month_lbl: object ['July', 'July', 'July', 'Ju ...\norder_date_monthstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_monthend: uint8 [0, 0, 0, 0, 1, 0, 0, 0, 0, ...\norder_date_yweek: UInt32 [26, 27, 28, 29, 30, 31, 32, ...\norder_date_mweek: int64 [1, 2, 3, 4, 5, 1, 2, 3, 
4, ...\norder_date_wday: int64 [7, 7, 7, 7, 7, 7, 7, 7, 7, ...\norder_date_wday_lbl: object ['Sunday', 'Sunday', 'Sunday ...\norder_date_mday: int64 [3, 10, 17, 24, 31, 7, 14, 2 ...\norder_date_qday: int64 [3, 10, 17, 24, 31, 38, 45, ...\norder_date_yday: int64 [184, 191, 198, 205, 212, 21 ...\norder_date_weekend: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, ...\norder_date_hour: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_minute: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_second: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_msecond: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_nsecond: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_am_pm: object ['am', 'am', 'am', 'am', 'am ...\n\n\nLet’s subset to just a few of the relevant date features. Let’s use tk.glimpse again.\n\n\nCode\ndf_with_datefeatures_narrom = df_with_datefeatures[[\n 'order_date', \n 'category_2', \n 'total_price_sum',\n 'total_price_sum_lag_12',\n 'total_price_sum_lag_24',\n 'order_date_year', \n 'order_date_half', \n 'order_date_quarter', \n 'order_date_month',\n 'order_date_yweek'\n]]\n\ndf_with_datefeatures_narrom.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 341 rows of 10 columns\norder_date: datetime64[ns] [Timestamp('2011-07-03 00:00: ...\ncategory_2: object ['Cross Country Race', 'Cross ...\ntotal_price_sum: float64 [56430.0, 62320.0, 141620.0, ...\ntotal_price_sum_lag_12: float64 [85910.0, 138230.0, 138350.0, ...\ntotal_price_sum_lag_24: float64 [61750.0, 25050.0, 56860.0, 8 ...\norder_date_year: int64 [2011, 2011, 2011, 2011, 2011 ...\norder_date_half: int64 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ...\norder_date_quarter: int64 [3, 3, 3, 3, 3, 3, 3, 3, 3, 3 ...\norder_date_month: int64 [7, 7, 7, 7, 7, 8, 8, 8, 8, 9 ...\norder_date_yweek: UInt32 [26, 27, 28, 29, 30, 31, 32, ...\n\n\n\nOne-Hot Encoding\nThe final phase in our feature engineering journey is one-hot encoding our categorical variables. While certain machine learning models like CatBoost can natively handle categorical data, many cannot. 
Enter one-hot encoding, a technique that transforms each category within a column into its separate column, marking its presence with a ‘1’ or absence with a ‘0’.\nFor this transformation, the handy pd.get_dummies() function from pandas comes to the rescue.\n\n\nCode\ndf_encoded = pd.get_dummies(df_with_datefeatures_narrom, columns=['category_2'])\n\ndf_encoded.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 341 rows of 18 columns\norder_date: datetime64[ns] [Timestamp('2011-07-03 ...\ntotal_price_sum: float64 [56430.0, 62320.0, 141 ...\ntotal_price_sum_lag_12: float64 [85910.0, 138230.0, 13 ...\ntotal_price_sum_lag_24: float64 [61750.0, 25050.0, 568 ...\norder_date_year: int64 [2011, 2011, 2011, 201 ...\norder_date_half: int64 [2, 2, 2, 2, 2, 2, 2, ...\norder_date_quarter: int64 [3, 3, 3, 3, 3, 3, 3, ...\norder_date_month: int64 [7, 7, 7, 7, 7, 8, 8, ...\norder_date_yweek: UInt32 [26, 27, 28, 29, 30, 3 ...\ncategory_2_Cross Country Race: uint8 [1, 1, 1, 1, 1, 1, 1, ...\ncategory_2_Cyclocross: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Elite Road: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Endurance Road: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Fat Bike: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Over Mountain: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Sport: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Trail: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Triathalon: uint8 [0, 0, 0, 0, 0, 0, 0, ...\n\n\n\n\nTraining and Future Feature Sets\nPytimetk offers an extensive array of feature engineering tools and augmentation functions, giving you a broad spectrum of possibilities. However, for the purposes of this tutorial, let’s shift our focus to modeling.\nLet’s proceed by segmenting our dataframe into training and future sets.\n\n\nCode\nfuture = df_encoded[df_encoded.total_price_sum.isnull()]\ntrain = df_encoded[df_encoded.total_price_sum.notnull()]\n\n\nLet’s focus on the columns essential for training. You’ll observe that we’ve excluded the ‘order_date’ column. This is because numerous machine learning models struggle with date data types. This is precisely why we utilized the tk.augment_timeseries_signature earlier—to transform date features into a format that’s compatible with ML models.\nWe can quickly see what features we have available with tk.glimpse().\n\n\nCode\ntrain.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 233 rows of 18 columns\norder_date: datetime64[ns] [Timestamp('2011-07-03 ...\ntotal_price_sum: float64 [56430.0, 62320.0, 141 ...\ntotal_price_sum_lag_12: float64 [85910.0, 138230.0, 13 ...\ntotal_price_sum_lag_24: float64 [61750.0, 25050.0, 568 ...\norder_date_year: int64 [2011, 2011, 2011, 201 ...\norder_date_half: int64 [2, 2, 2, 2, 2, 2, 2, ...\norder_date_quarter: int64 [3, 3, 3, 3, 3, 3, 3, ...\norder_date_month: int64 [7, 7, 7, 7, 7, 8, 8, ...\norder_date_yweek: UInt32 [26, 27, 28, 29, 30, 3 ...\ncategory_2_Cross Country Race: uint8 [1, 1, 1, 1, 1, 1, 1, ...\ncategory_2_Cyclocross: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Elite Road: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Endurance Road: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Fat Bike: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Over Mountain: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Sport: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Trail: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Triathalon: uint8 [0, 0, 0, 0, 0, 0, 0, ..." 
}, { - "objectID": "guides/05_augmenting.html#augment-fourier-with-grouped-time-series", - "href": "guides/05_augmenting.html#augment-fourier-with-grouped-time-series", - "title": "Adding Features (Augmenting)", - "section": "5.2 Augment Fourier with Grouped Time Series", - "text": "5.2 Augment Fourier with Grouped Time Series\naugment_fourier also works with grouped time series:\n\n\nCode\n# augment fourier with grouped time series\nm4_daily_df \\\n .groupby('id') \\\n .augment_fourier(\n date_column = 'date',\n value_column = 'value',\n num_periods = 7,\n max_order = 1\n ) \\\n .head(20)\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_fourier_1_1\nvalue_fourier_1_2\nvalue_fourier_1_3\nvalue_fourier_1_4\nvalue_fourier_1_5\nvalue_fourier_1_6\nvalue_fourier_1_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n0.394510\n-0.725024\n0.937927\n-0.998682\n0.897435\n-0.650609\n0.298243\n\n\n1\nD10\n2014-07-04\n2073.4\n-0.980653\n0.383931\n0.830342\n-0.709015\n-0.552759\n0.925423\n0.190450\n\n\n2\nD10\n2014-07-05\n2048.7\n0.011484\n0.022967\n0.034446\n0.045921\n0.057390\n0.068852\n0.080304\n\n\n3\nD10\n2014-07-06\n2048.9\n0.975899\n-0.425928\n-0.790004\n0.770723\n0.453624\n-0.968706\n-0.030835\n\n\n4\nD10\n2014-07-07\n2006.4\n-0.415510\n0.755886\n-0.959581\n0.989762\n-0.840972\n0.540115\n-0.141593\n\n\n5\nD10\n2014-07-08\n2017.6\n-0.803876\n-0.956286\n-0.333715\n0.559301\n0.999055\n0.629169\n-0.250600\n\n\n6\nD10\n2014-07-09\n2019.1\n0.748318\n0.992779\n0.568784\n-0.238184\n-0.884778\n-0.935635\n-0.356511\n\n\n7\nD10\n2014-07-10\n2007.4\n0.494070\n-0.859111\n0.999790\n-0.879368\n0.529294\n-0.040992\n-0.458015\n\n\n8\nD10\n2014-07-11\n2010.0\n-0.952864\n0.578192\n0.602021\n-0.943494\n-0.029515\n0.961404\n-0.553858\n\n\n9\nD10\n2014-07-12\n2001.5\n-0.099581\n-0.198171\n-0.294792\n-0.388482\n-0.478310\n-0.563384\n-0.642856\n\n\n10\nD10\n2014-07-13\n1978.8\n0.994091\n-0.215816\n-0.947238\n0.421459\n0.855740\n-0.607239\n-0.723909\n\n\n11\nD10\n2014-07-14\n1988.3\n-0.311977\n0.592812\n-0.814472\n0.954831\n-0.999879\n0.945118\n-0.796015\n\n\n12\nD10\n2014-07-15\n2000.7\n-0.864932\n-0.868201\n-0.006551\n0.861625\n0.871433\n0.013101\n-0.858282\n\n\n13\nD10\n2014-07-16\n2010.5\n0.670062\n0.994781\n0.806801\n0.203005\n-0.505418\n-0.953354\n-0.909941\n\n\n14\nD10\n2014-07-17\n2014.5\n0.587524\n-0.950856\n0.951356\n-0.588831\n0.001617\n0.586214\n-0.950354\n\n\n15\nD10\n2014-07-18\n1962.6\n-0.913299\n0.743956\n0.307286\n-0.994265\n0.502625\n0.584837\n-0.979022\n\n\n16\nD10\n2014-07-19\n1948.0\n-0.209415\n-0.409542\n-0.591509\n-0.747244\n-0.869842\n-0.953865\n-0.995589\n\n\n17\nD10\n2014-07-20\n1943.0\n0.999997\n0.004934\n-0.999973\n-0.009867\n0.999924\n0.014800\n-0.999851\n\n\n18\nD10\n2014-07-21\n1933.3\n-0.204588\n0.400521\n-0.579511\n0.733985\n-0.857409\n0.944561\n-0.991756\n\n\n19\nD10\n2014-07-22\n1891.0\n-0.915297\n-0.737326\n0.321336\n0.996182\n0.481148\n-0.608588\n-0.971403" + "objectID": "tutorials/01_sales_crm.html#scikit-learn-model", + "href": "tutorials/01_sales_crm.html#scikit-learn-model", + "title": "Sales Analysis", + "section": "3.5 Scikit Learn Model", + "text": "3.5 Scikit Learn Model\nNow for some machine learning.\n\nFitting a Random Forest Regressor\nLet’s create a RandomForestRegressor to predict future sales patterns.\n\ntrain_columns = [ 'total_price_sum_lag_12',\n 'total_price_sum_lag_24', 'order_date_year', 'order_date_half',\n 'order_date_quarter', 'order_date_month', 'order_date_yweek','category_2_Cross Country Race', 'category_2_Cyclocross',\n 'category_2_Elite Road', 'category_2_Endurance Road',\n 
'category_2_Fat Bike', 'category_2_Over Mountain', 'category_2_Sport',\n 'category_2_Trail', 'category_2_Triathalon']\nX = train[train_columns]\ny = train[['total_price_sum']]\n\nmodel = RandomForestRegressor(random_state=123)\nmodel = model.fit(X, y)\n\n\n\nPrediction\nWe now have a fitted model, and can use this to predict sales from our future frame.\n\n\nCode\npredicted_values = model.predict(future[train_columns])\nfuture['y_pred'] = predicted_values\n\nfuture.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_year\norder_date_half\norder_date_quarter\norder_date_month\norder_date_yweek\ncategory_2_Cross Country Race\ncategory_2_Cyclocross\ncategory_2_Elite Road\ncategory_2_Endurance Road\ncategory_2_Fat Bike\ncategory_2_Over Mountain\ncategory_2_Sport\ncategory_2_Trail\ncategory_2_Triathalon\ny_pred\n\n\n\n\n468\n2012-01-08\nNaN\n51820.0\n75720.0\n2012\n1\n1\n1\n1\n1\n0\n0\n0\n0\n0\n0\n0\n0\n59462.00\n\n\n469\n2012-01-15\nNaN\n62940.0\n21240.0\n2012\n1\n1\n1\n2\n1\n0\n0\n0\n0\n0\n0\n0\n0\n59149.45\n\n\n470\n2012-01-22\nNaN\n9060.0\n11620.0\n2012\n1\n1\n1\n3\n1\n0\n0\n0\n0\n0\n0\n0\n0\n20458.40\n\n\n471\n2012-01-29\nNaN\n15980.0\n9730.0\n2012\n1\n1\n1\n4\n1\n0\n0\n0\n0\n0\n0\n0\n0\n31914.00\n\n\n472\n2012-02-05\nNaN\n59180.0\n22780.0\n2012\n1\n1\n2\n5\n1\n0\n0\n0\n0\n0\n0\n0\n0\n59128.95\n\n\n473\n2012-02-12\nNaN\n132550.0\n53680.0\n2012\n1\n1\n2\n6\n1\n0\n0\n0\n0\n0\n0\n0\n0\n76397.50\n\n\n474\n2012-02-19\nNaN\n68430.0\n38360.0\n2012\n1\n1\n2\n7\n1\n0\n0\n0\n0\n0\n0\n0\n0\n63497.80\n\n\n475\n2012-02-26\nNaN\n29470.0\n90290.0\n2012\n1\n1\n2\n8\n1\n0\n0\n0\n0\n0\n0\n0\n0\n57332.00\n\n\n476\n2012-03-04\nNaN\n71080.0\n7380.0\n2012\n1\n1\n3\n9\n1\n0\n0\n0\n0\n0\n0\n0\n0\n60981.30\n\n\n477\n2012-03-11\nNaN\n9800.0\n0.0\n2012\n1\n1\n3\n10\n1\n0\n0\n0\n0\n0\n0\n0\n0\n18738.15\n\n\n\n\n\n\n\n\n\nCleaning Up\nNow let us do a little cleanup. For ease in plotting later, let’s add a column to track the actuals vs. 
the predicted values.\n\n\nCode\ntrain['type'] = 'actuals'\nfuture['type'] = 'prediction'\n\nfull_df = pd.concat([train, future])\n\nfull_df.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_year\norder_date_half\norder_date_quarter\norder_date_month\norder_date_yweek\ncategory_2_Cross Country Race\ncategory_2_Cyclocross\ncategory_2_Elite Road\ncategory_2_Endurance Road\ncategory_2_Fat Bike\ncategory_2_Over Mountain\ncategory_2_Sport\ncategory_2_Trail\ncategory_2_Triathalon\ntype\ny_pred\n\n\n\n\n25\n2011-07-03\n56430.0\n85910.0\n61750.0\n2011\n2\n3\n7\n26\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n26\n2011-07-10\n62320.0\n138230.0\n25050.0\n2011\n2\n3\n7\n27\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n27\n2011-07-17\n141620.0\n138350.0\n56860.0\n2011\n2\n3\n7\n28\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n28\n2011-07-24\n75720.0\n136090.0\n8740.0\n2011\n2\n3\n7\n29\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n29\n2011-07-31\n21240.0\n32110.0\n78070.0\n2011\n2\n3\n7\n30\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n30\n2011-08-07\n11620.0\n139010.0\n115010.0\n2011\n2\n3\n8\n31\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n31\n2011-08-14\n9730.0\n2060.0\n64290.0\n2011\n2\n3\n8\n32\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n32\n2011-08-21\n22780.0\n26130.0\n95070.0\n2011\n2\n3\n8\n33\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n33\n2011-08-28\n53680.0\n30360.0\n3200.0\n2011\n2\n3\n8\n34\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n34\n2011-09-04\n38360.0\n88280.0\n21170.0\n2011\n2\n3\n9\n35\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n\n\n\n\n\nYou can get the grouping category back from the one-hot encoding for easier plotting. For simplicity, we will search for any column with ‘category’ in its name.\n\n\nCode\n# Extract dummy columns\ndummy_cols = [col for col in full_df.columns if 'category' in col.lower() ]\nfull_df_reverted = full_df.copy()\n\n# Convert dummy columns back to categorical column\nfull_df_reverted['category'] = full_df_reverted[dummy_cols].idxmax(axis=1).str.replace(\"A_\", \"\")\n\n# Drop dummy columns\nfull_df_reverted = full_df_reverted.drop(columns=dummy_cols)\n\nfull_df_reverted.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_year\norder_date_half\norder_date_quarter\norder_date_month\norder_date_yweek\ntype\ny_pred\ncategory\n\n\n\n\n25\n2011-07-03\n56430.0\n85910.0\n61750.0\n2011\n2\n3\n7\n26\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n26\n2011-07-10\n62320.0\n138230.0\n25050.0\n2011\n2\n3\n7\n27\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n27\n2011-07-17\n141620.0\n138350.0\n56860.0\n2011\n2\n3\n7\n28\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n28\n2011-07-24\n75720.0\n136090.0\n8740.0\n2011\n2\n3\n7\n29\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n29\n2011-07-31\n21240.0\n32110.0\n78070.0\n2011\n2\n3\n7\n30\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n30\n2011-08-07\n11620.0\n139010.0\n115010.0\n2011\n2\n3\n8\n31\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n31\n2011-08-14\n9730.0\n2060.0\n64290.0\n2011\n2\n3\n8\n32\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n32\n2011-08-21\n22780.0\n26130.0\n95070.0\n2011\n2\n3\n8\n33\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n33\n2011-08-28\n53680.0\n30360.0\n3200.0\n2011\n2\n3\n8\n34\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n34\n2011-09-04\n38360.0\n88280.0\n21170.0\n2011\n2\n3\n9\n35\nactuals\nNaN\ncategory_2_Cross Country 
Race\n\n\n\n\n\n\n\n\n\nPre-Visualization Wrangling\nBefore we proceed to visualization, let’s streamline our dataset by aligning our predicted values with the actuals. This approach will simplify the plotting process. Given that our DataFrame columns are already labeled as ‘actuals’ and ‘predictions’, a brief conditional check will allow us to consolidate the necessary values.\n\n\nCode\nfull_df_reverted['total_price_sum'] = np.where(full_df_reverted.type =='actuals', full_df_reverted.total_price_sum, full_df_reverted.y_pred)\n\nfull_df_reverted.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_year\norder_date_half\norder_date_quarter\norder_date_month\norder_date_yweek\ntype\ny_pred\ncategory\n\n\n\n\n25\n2011-07-03\n56430.0\n85910.0\n61750.0\n2011\n2\n3\n7\n26\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n26\n2011-07-10\n62320.0\n138230.0\n25050.0\n2011\n2\n3\n7\n27\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n27\n2011-07-17\n141620.0\n138350.0\n56860.0\n2011\n2\n3\n7\n28\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n28\n2011-07-24\n75720.0\n136090.0\n8740.0\n2011\n2\n3\n7\n29\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n29\n2011-07-31\n21240.0\n32110.0\n78070.0\n2011\n2\n3\n7\n30\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n30\n2011-08-07\n11620.0\n139010.0\n115010.0\n2011\n2\n3\n8\n31\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n31\n2011-08-14\n9730.0\n2060.0\n64290.0\n2011\n2\n3\n8\n32\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n32\n2011-08-21\n22780.0\n26130.0\n95070.0\n2011\n2\n3\n8\n33\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n33\n2011-08-28\n53680.0\n30360.0\n3200.0\n2011\n2\n3\n8\n34\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n34\n2011-09-04\n38360.0\n88280.0\n21170.0\n2011\n2\n3\n9\n35\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n\n\n\n\n\n\n\nVisualize the Forecast\nLet’s again use tk.plot_timeseries() to visually inspect the forecasts.\n\nPlotlyPlotnine\n\n\n\n\nCode\nfull_df_reverted \\\n .groupby('category') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n color_column = 'type',\n smooth = False,\n smooth_alpha = 0,\n facet_ncol = 2,\n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n engine = 'plotly'\n )\n\n\n\n \n\n\n\n\n\n\nCode\nfull_df_reverted \\\n .groupby('category') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n color_column = 'type',\n smooth = False,\n smooth_alpha = 0,\n facet_ncol = 2, \n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1000,\n height = 800,\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1000 x 800)>\n\n\n\n\n\nUpon examining the graph, our models look alright given the length of time for training. Important points:\n\nFor effective time series forecasting, having multiple years of data is pivotal. 
This provides the model ample opportunities to recognize and adapt to seasonal variations.\nGiven our dataset spanned less than a year, the model lacked the depth of historical context to discern such patterns.\nAlthough our feature engineering was kept basic to introduce various pytimetk capabilities, there’s room for enhancement.\nFor a more refined analysis, consider experimenting with different machine learning models and diving deeper into feature engineering.\nPytimetk’s tk.augment_fourier() might assist in discerning seasonal trends, but with the dataset’s limited historical scope, capturing intricate patterns could remain a challenge." }, { - "objectID": "guides/06_anomalize.html", - "href": "guides/06_anomalize.html", - "title": "Anomaly Detection", + "objectID": "tutorials/05_clustering.html", + "href": "tutorials/05_clustering.html", + "title": "Clustering", "section": "", - "text": "Anomaly detection in time series analysis is a crucial process for identifying unusual patterns that deviate from expected behavior. These anomalies can signify critical, often unforeseen events in time series data. Effective anomaly detection helps in maintaining the quality and reliability of data, ensuring accurate forecasting and decision-making. The challenge lies in distinguishing between true anomalies and natural fluctuations, which demands sophisticated analytical techniques and a deep understanding of the underlying time series patterns. As a result, anomaly detection is an essential component of time series analysis, driving the proactive management of risks and opportunities in dynamic environments.\nPytimetk uses the following methods to determine anomalies in time series data;\nThere are 2 common techniques for seasonal decomposition; STL and Twitter;" + "text": "Coming soon…\n\n1 More Coming Soon…\nWe are in the early stages of development. But it’s obvious the potential for pytimetk now in Python. 🐍\n\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." }, { - "objectID": "guides/06_anomalize.html#setup", - "href": "guides/06_anomalize.html#setup", - "title": "Anomaly Detection", - "section": "1.1 Setup", - "text": "1.1 Setup\nTo setup, import the necessary packages and the m4_daily_df dataset;\n\n# libraries\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Import Data\nm4_daily_df = tk.load_dataset('m4_daily', parse_dates = ['date'])\n\nLet’s first demonstrate with a single time series. We’ll filter m4_daily_df for id = D10 and date within the year 2015.\n\n# Data filtering\ndf = (\n m4_daily_df\n .query(\"id == 'D10'\")\n .query(\"date.dt.year == 2015\")\n)\n\nWe can plot this data to see the trend\n\n# Plot data\ntk.plot_timeseries(\n data = df,\n date_column = 'date',\n value_column = 'value'\n)" + "objectID": "tutorials/04_anomaly_detection.html", + "href": "tutorials/04_anomaly_detection.html", + "title": "Anomaly Detection in Website Traffic", + "section": "", + "text": "Anomalize: Breakdown, identify, and clean anomalies in 1 easy step\nAnomalies, often called outliers, are data points that deviate significantly from the general trend or pattern in the data. In the context of time series, they can appear as sudden spikes, drops, or any abrupt change in a sequence of values.\nAnomaly detection for time series is a technique used to identify unusual patterns that do not conform to expected behavior. 
It is especially relevant for sequential data (like stock prices, sensor data, sales data, etc.) where the temporal aspect is crucial. Anomalies can identify important events or be the cause of noise that can hinder forecasting performance." }, { - "objectID": "guides/06_anomalize.html#seasonal-decomposition-remainder", - "href": "guides/06_anomalize.html#seasonal-decomposition-remainder", - "title": "Anomaly Detection", - "section": "1.2 Seasonal Decomposition & Remainder", - "text": "1.2 Seasonal Decomposition & Remainder\nFirst we perform seasonal decomposition and on the data and generate remainders using anomalize().\n\n\n\n\n\n\nHelp Doc Info: anomalize()\n\n\n\n\n\nUse help(tk.anomalize) to review additional helpful documentation.\n\n\n\n\n# Anomalize\nanomalize_df = tk.anomalize(\n data = df,\n date_column = 'date',\n value_column = 'value',\n period = 7,\n iqr_alpha = 0.05, # using the default\n clean_alpha = 0.75, # using the default\n clean = \"min_max\"\n)\n\nanomalize_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 365 rows of 12 columns\ndate: datetime64[ns] [Timestamp('2015-01-01 00:00:00'), ...\nobserved: float64 [2351.0, 2302.7, 2300.7, 2341.2, 2 ...\nseasonal: float64 [14.163009085035995, -17.341946034 ...\nseasadj: float64 [2336.836990914964, 2320.041946034 ...\ntrend: float64 [2323.900317851228, 2322.996460334 ...\nremainder: float64 [12.93667306373618, -2.95451429904 ...\nanomaly: object ['No', 'No', 'No', 'No', 'No', 'No ...\nanomaly_score: float64 [19.42215274680143, 35.31334010958 ...\nanomaly_direction: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nrecomposed_l1: float64 [2179.860403909094, 2147.451591271 ...\nrecomposed_l2: float64 [2560.9839015845087, 2528.57508894 ...\nobserved_clean: float64 [2351.0, 2302.7, 2300.7, 2341.2, 2 ..." + "objectID": "tutorials/04_anomaly_detection.html#anomalize-breakdown-identify-and-clean-in-1-easy-step", + "href": "tutorials/04_anomaly_detection.html#anomalize-breakdown-identify-and-clean-in-1-easy-step", + "title": "Anomaly Detection in Website Traffic", + "section": "2.1 Anomalize: breakdown, identify, and clean in 1 easy step", + "text": "2.1 Anomalize: breakdown, identify, and clean in 1 easy step\nThe anomalize() function is a feature rich tool for performing anomaly detection. Anomalize is group-aware, so we can use this as part of a normal pandas groupby chain. 
In one easy step:\n\nWe breakdown (decompose) the time series\nAnalyze it’s remainder (residuals) for spikes (anomalies)\nClean the anomalies if desired\n\n\n\nCode\nanomalize_df = df \\\n .groupby('Page', sort = False) \\\n .anomalize(\n date_column = \"date\", \n value_column = \"value\", \n )\n\nanomalize_df.glimpse()\n\n\n\n\n\n<class 'pandas.core.frame.DataFrame'>: 5500 rows of 13 columns\nPage: object ['Death_of_Freddie_Gray_en.wikiped ...\ndate: datetime64[ns] [Timestamp('2015-07-01 00:00:00'), ...\nobserved: int64 [791, 704, 903, 732, 558, 504, 543 ...\nseasonal: float64 [206.78723511550484, 4.04332698700 ...\nseasadj: float64 [584.2127648844952, 699.9566730129 ...\ntrend: float64 [729.0301895900458, 726.0497757616 ...\nremainder: float64 [-144.8174247055506, -26.093102748 ...\nanomaly: object ['No', 'No', 'No', 'No', 'No', 'No ...\nanomaly_score: float64 [266.9421236324138, 148.2178016755 ...\nanomaly_direction: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nrecomposed_l1: float64 [266.05095141435606, 60.3266294574 ...\nrecomposed_l2: float64 [1849.8332958504716, 1644.10897389 ...\nobserved_clean: float64 [791.0, 704.0, 903.0, 732.0, 558.0 ...\n\n\n\n\n\n\n\n\nThe anomalize() function returns:\n\n\n\n\n\n\nThe original grouping and datetime columns.\nThe seasonal decomposition: observed, seasonal, seasadj, trend, and remainder. The objective is to remove trend and seasonality such that the remainder is stationary and representative of normal variation and anomalous variations.\nAnomaly identification and scoring: anomaly, anomaly_score, anomaly_direction. These identify the anomaly decision (Yes/No), score the anomaly as a distance from the centerline, and label the direction (-1 (down), zero (not anomalous), +1 (up)).\nRecomposition: recomposed_l1 and recomposed_l2. Think of these as the lower and upper bands. Any observed data that is below l1 or above l2 is anomalous.\nCleaned data: observed_clean. Cleaned data is automatically provided, which has the outliers replaced with data that is within the recomposed l1/l2 boundaries. With that said, you should always first seek to understand why data is being considered anomalous before simply removing outliers and using the cleaned data.\n\n\n\n\nThe most important aspect is that this data is ready to be visualized, inspected, and modifications can then be made to address any tweaks you would like to make." 
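Before moving on to the visualizations, it is often worth looking at the flagged rows themselves, since the guidance above is to understand why a point is considered anomalous before cleaning it. Below is a minimal sketch, not part of the original tutorial, that assumes `anomalize_df` is the result produced by the `groupby('Page').anomalize(...)` call above; the column names are taken from its `glimpse()` output.

```python
# A minimal sketch (assumption: `anomalize_df` exists as created above):
# isolate the rows anomalize() flagged and rank them by how far they sit
# from the centerline.
outliers = (
    anomalize_df
    .query("anomaly == 'Yes'")                       # keep only flagged rows
    .sort_values("anomaly_score", ascending=False)   # largest deviations first
    [["Page", "date", "observed", "anomaly_score", "anomaly_direction"]]
)

print(outliers.head(10))
```

From here, `anomaly_direction` tells you whether each flagged point was a spike up (+1) or down (-1), which is usually the first clue when tracing an anomaly back to a real-world event.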
}, { - "objectID": "guides/06_anomalize.html#plot-seasonal-decomposition", - "href": "guides/06_anomalize.html#plot-seasonal-decomposition", - "title": "Anomaly Detection", - "section": "1.3 Plot Seasonal Decomposition", - "text": "1.3 Plot Seasonal Decomposition\nWe plot the seaonal decomposition to get a visual representation;\n\n\n\n\n\n\nHelp Doc Info: plot_anomalies_decomp()\n\n\n\n\n\nUse help(tk.plot_anomalies_decomp) to review additional helpful documentation.\n\n\n\n\n# Plot seasonal decomposition\ntk.plot_anomalies_decomp(\n data = anomalize_df,\n date_column = 'date',\n engine = 'plotly',\n title = 'Seasonal Decomposition'\n)" - }, - { - "objectID": "guides/06_anomalize.html#plot-anomalies", - "href": "guides/06_anomalize.html#plot-anomalies", - "title": "Anomaly Detection", - "section": "1.4 Plot Anomalies", - "text": "1.4 Plot Anomalies\nNext we can plot the anomalies using tk.plot_anomalies();\n\n\n\n\n\n\nHelp Doc Info: plot_anomalies()\n\n\n\n\n\nUse help(tk.plot_anomalies) to review additional helpful documentation.\n\n\n\n\n# Plot anomalies\ntk.plot_anomalies(\n data = anomalize_df,\n date_column = 'date',\n engine = 'plotly',\n title = 'Plot Anomaly Bands'\n)" - }, - { - "objectID": "guides/06_anomalize.html#plot-cleaned-anomalies", - "href": "guides/06_anomalize.html#plot-cleaned-anomalies", - "title": "Anomaly Detection", - "section": "1.5 Plot Cleaned Anomalies", - "text": "1.5 Plot Cleaned Anomalies\nFinally we can also see a plot of the data with cleaned anomalies using plot_anomalies_cleaned();\n\n\n\n\n\n\nHelp Doc Info: plot_anomalies_cleaned()\n\n\n\n\n\nUse help(tk.plot_anomalies_cleaned) to review additional helpful documentation.\n\n\n\n\n# Plot cleaned anomalies\ntk.plot_anomalies_cleaned(\n data = anomalize_df,\n date_column = 'date'\n)" - }, - { - "objectID": "guides/06_anomalize.html#changing-parameters", - "href": "guides/06_anomalize.html#changing-parameters", - "title": "Anomaly Detection", - "section": "1.6 Changing Parameters", - "text": "1.6 Changing Parameters\nSome important parameters to hightlight in the anomalize() function include iqr_alpha.\n\n\n\n\n\n\nImportant\n\n\n\n\n\niqr_alpha controls the threshold for detecting outliers. It is the significance level used in the interquartile range (IQR) method for outlier detection. The default value is 0.05, which corresponds to a 5% significance level. A lower significance level will result in a higher threshold, which means fewer outliers will be detected. 
A higher significance level will result in a lower threshold, which means more outliers will be detected.\n\n\n\nLets visualize the effect of changing the iqr_alpha parameter;\n\nChanging iqr_alpha\nFirst, lets get a dataframe with multiple values for iqr_alpha;\n\n# Anomalized data with multiple iqr_alpha values\n\n# - Alpha values\niqr_alpha_values = [0.05, 0.10, 0.15, 0.20]\n\n# - Empty dataframes list\ndfs = []\n\nfor alpha in iqr_alpha_values:\n\n # - Run anomalize function\n anomalize_df = tk.anomalize(\n data = df,\n date_column = 'date',\n value_column = 'value',\n period = 7,\n iqr_alpha = alpha\n )\n\n # - Add the iqr_alpha column\n anomalize_df['iqr_alpha'] = f'iqr_alpha value of {alpha}'\n\n # - Append to the list\n dfs.append(anomalize_df)\n\n# - Concatenate all dataframes\nfinal_df = pd.concat(dfs)\n\nNow we can visualize the anomalies:\n\n\nVisualizing Grouped Anomalies (Facets)\n\n# Visualize\n(\n final_df\n .groupby('iqr_alpha')\n .plot_anomalies(\n date_column = 'date',\n engine = 'plotly',\n facet_ncol = 2\n )\n)\n\n\n \n\n\n\n\nVisualizing Grouped Anomalies (Plotly Dropdown)\n\n# Visualize\n(\n final_df\n .groupby('iqr_alpha')\n .plot_anomalies(\n date_column = 'date',\n engine = 'plotly',\n plotly_dropdown = True,\n plotly_dropdown_x = 1,\n plotly_dropdown_y = 0.60\n )\n)" - }, - { - "objectID": "guides/03_pandas_frequency.html", - "href": "guides/03_pandas_frequency.html", - "title": "Pandas Frequencies", - "section": "", - "text": "How this guide benefits you\n\n\n\n\n\nThis guide covers how to use the pandas frequency strings within pytimetk. Once you understand key frequencies, you can apply them to manipulate time series data like a pro.\n\n\n\n\n1 Pandas Frequencies\nPandas offers a variety of frequency strings, also known as offset aliases, to define the frequency of a time series. 
Here are some common frequency strings used in pandas:\n\n‘B’: Business Day\n‘D’: Calendar day\n‘W’: Weekly\n‘M’: Month end\n‘BM’: Business month end\n‘MS’: Month start\n‘BMS’: Business month start\n‘Q’: Quarter end\n‘BQ’: Business quarter end\n‘QS’: Quarter start\n‘BQS’: Business quarter start\n‘A’ or ‘Y’: Year end\n‘BA’ or ‘BY’: Business year end\n‘AS’ or ‘YS’: Year start\n‘BAS’ or ‘BYS’: Business year start\n‘H’: Hourly\n‘T’ or ‘min’: Minutely\n‘S’: Secondly\n‘L’ or ‘ms’: Milliseconds\n‘U’: Microseconds\n‘N’: Nanoseconds\n\n\nCustom Frequencies:\n\nYou can also create custom frequencies by combining base frequencies, like:\n\n‘2D’: Every 2 days\n‘3W’: Every 3 weeks\n‘4H’: Every 4 hours\n‘1H30T’: Every 1 hour and 30 minutes\n\n\n\n\nCompound Frequencies:\n\nYou can combine multiple frequencies by adding them together.\n\n‘1D1H’: 1 day and 1 hour\n‘1H30T’: 1 hour and 30 minutes\n\n\n\n\nExample:\n\n\nCode\nimport pandas as pd\n\n# Creating a date range with daily frequency\ndate_range_daily = pd.date_range(start='2023-01-01', end='2023-01-10', freq='D')\n\ndate_range_daily\n\n\nDatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',\n '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',\n '2023-01-09', '2023-01-10'],\n dtype='datetime64[ns]', freq='D')\n\n\n\n\nCode\n# Creating a date range with 2 days frequency\ndate_range_two_days = pd.date_range(start='2023-01-01', end='2023-01-10', freq='2D')\n\ndate_range_two_days\n\n\nDatetimeIndex(['2023-01-01', '2023-01-03', '2023-01-05', '2023-01-07',\n '2023-01-09'],\n dtype='datetime64[ns]', freq='2D')\n\n\nThese frequency strings help in resampling, creating date ranges, and handling time-series data efficiently in pandas.\n\n\n\n2 Timetk Incorporates Pandas Frequencies\nNow that you’ve seen pandas frequencies, you’ll see them pop up in many of the pytimetk functions.\n\nExample: Padding Dates\nThis example shows how to use Pandas frequencies inside of pytimetk functions.\nWe’ll use pad_by_time to show how to use freq to fill in missing dates.\n\n\nCode\n# DataFrame with missing dates\nimport pandas as pd\n\ndata = {\n # '2023-09-05' is missing\n 'datetime': ['2023-09-01', '2023-09-02', '2023-09-03', '2023-09-04', '2023-09-06'], \n 'value': [10, 30, 40, 50, 60]\n}\n\ndf = pd.DataFrame(data)\ndf['datetime'] = pd.to_datetime(df['datetime'])\ndf\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01\n10\n\n\n1\n2023-09-02\n30\n\n\n2\n2023-09-03\n40\n\n\n3\n2023-09-04\n50\n\n\n4\n2023-09-06\n60\n\n\n\n\n\n\n\nWe can resample to fill in the missing day using pad_by_time with freq = 'D'.\n\n\nCode\nimport pytimetk as tk\n\ndf.pad_by_time('datetime', freq = 'D')\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01\n10.0\n\n\n1\n2023-09-02\n30.0\n\n\n2\n2023-09-03\n40.0\n\n\n3\n2023-09-04\n50.0\n\n\n4\n2023-09-05\nNaN\n\n\n5\n2023-09-06\n60.0\n\n\n\n\n\n\n\nWhat about resampling every 12 hours? 
Just set `freq = ‘12H’.\n\n\nCode\nimport pytimetk as tk\n\ndf.pad_by_time('datetime', freq = '12H')\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01 00:00:00\n10.0\n\n\n1\n2023-09-01 12:00:00\nNaN\n\n\n2\n2023-09-02 00:00:00\n30.0\n\n\n3\n2023-09-02 12:00:00\nNaN\n\n\n4\n2023-09-03 00:00:00\n40.0\n\n\n5\n2023-09-03 12:00:00\nNaN\n\n\n6\n2023-09-04 00:00:00\n50.0\n\n\n7\n2023-09-04 12:00:00\nNaN\n\n\n8\n2023-09-05 00:00:00\nNaN\n\n\n9\n2023-09-05 12:00:00\nNaN\n\n\n10\n2023-09-06 00:00:00\n60.0\n\n\n\n\n\n\n\nYou’ll see these pandas frequencies come up as the parameter freq in many pytimetk functions.\n\n\n\n3 Next Steps\nCheck out the Data Wrangling Guide next.\n\n\n4 More Coming Soon…\nWe are in the early stages of development. But it’s obvious the potential for pytimetk now in Python. 🐍\n\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." - }, - { - "objectID": "getting-started/01_installation.html", - "href": "getting-started/01_installation.html", - "title": "Install", - "section": "", - "text": "1 Quick Install\nLet’s get you up and running with pytimetk fast with the latest stable release.\npip install pytimetk\nYou can install from GitHub with this code.\npip install git+https://github.com/business-science/pytimetk.git\n\n\n2 Next steps\nCheck out the Quick Start Guide Next.\n\n\n3 More Coming Soon…\nWe are in the early stages of development. But it’s obvious the potential for pytimetk now in Python. 🐍\n\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." - }, - { - "objectID": "contributing.html", - "href": "contributing.html", - "title": "Contributing (Developer Setup)", - "section": "", - "text": "Interested in contributing?\n\n\n\n\n\nMake sure to Fork the GitHub Repo. Clone your fork. Then use poetry to install the pytimetk package.\n\n\n\n\n1 GitHub\nTo contribute, you’ll need to have a GitHub account. Then:\n\n1. Fork our pytimetk repository\nHead to our GitHub Repo and select “fork”. This makes a copied version of pytimetk for your personal use.\n\n\n2. Clone your forked version\nCloning will put your own personal version of pytimetk on your local machine. Make sure to replace [your_user_name] with your user name.\ngit clone https://github.com/[your_user_name]/pytimetk\n\n\n\n2 Poetry Environment Setup\nTo install pytimetk using Poetry, follow these steps:\n\n1. Prerequisites\nMake sure you have Python 3.9 or later installed on your system.\n\n\n2. Install Poetry\nTo install Poetry, you can use the official installer provided by Poetry. Do not use pip.\n\n\n3. Install Dependencies\nUse Poetry to install the package and its dependencies:\npoetry install\nor you can create a virtualenv with poetry and install the dependencies\npoetry shell\npoetry install\n\n\n\n3 Submit a Pull Request\n\n1. Make changes on a Branch\nMake changes in your local version on a branch where my-feature-branch is a branch you’d like to create that contains modifications.\ngit checkout -b my-feature-branch\n\n\n2. Push to your forked version of pytimetk\ngit push origin my-feature-branch\n\n\n3. 
Create a Pull Request\n\nGo to your forked repository on GitHub and switch to your branch.\nClick on “New pull request” and compare the changes you made with the original repository.\nFill out the pull request template with the necessary information, explaining your changes, the reason for them, and any other relevant information.\n\n\n\n4. Submit the Pull Request\n\nReview your changes and submit the pull request.\n\n\n\n\n4 Next Steps 🍻\nWe will review your PR. If all goes well, we’ll merge! And then you’ve just helped the community. 🍻" - }, - { - "objectID": "index.html", - "href": "index.html", - "title": "PyTimeTK ", - "section": "", - "text": "Time series easier, faster, more fun. Pytimetk.\n\nPyTimetk’s Mission: To make time series analysis easier, faster, and more enjoyable in Python.\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\n\n1 Introducing pytimetk: Simplifying Time Series Analysis for Everyone\nTime series analysis is fundamental in many fields, from business forecasting to scientific research. While the Python ecosystem offers tools like pandas, they sometimes can be verbose and not optimized for all operations, especially for complex time-based aggregations and visualizations.\nEnter pytimetk. Crafted with a blend of ease-of-use and computational efficiency, pytimetk significantly simplifies the process of time series manipulation and visualization. By leveraging the polars backend, you can experience speed improvements ranging from 3X to a whopping 3500X. Let’s dive into a comparative analysis.\n\n\n\n\n\n\n\n\nFeatures/Properties\npytimetk\npandas (+matplotlib)\n\n\n\n\nSpeed\n🚀 3X to 500X Faster\n🐢 Standard\n\n\nCode Simplicity\n🎉 Concise, readable syntax\n📜 Often verbose\n\n\nplot_timeseries()\n🎨 2 lines, no customization\n🎨 16 lines, customization needed\n\n\nsummarize_by_time()\n🕐 2 lines, 13.4X faster\n🕐 6 lines, 2 for-loops\n\n\npad_by_time()\n⛳ 2 lines, fills gaps in timeseries\n❌ No equivalent\n\n\nanomalize()\n📈 2 lines, detects and corrects anomalies\n❌ No equivalent\n\n\naugment_timeseries_signature()\n📅 1 line, all calendar features\n🕐 30 lines of dt extractors\n\n\naugment_rolling()\n🏎️ 10X to 3500X faster\n🐢 Slow Rolling Operations\n\n\n\nAs evident from the table, pytimetk is not just about speed; it also simplifies your codebase. For example, summarize_by_time(), converts a 6-line, double for-loop routine in pandas into a concise 2-line operation. And with the polars engine, get results 13.4X faster than pandas!\nSimilarly, plot_timeseries() dramatically streamlines the plotting process, encapsulating what would typically require 16 lines of matplotlib code into a mere 2-line command in pytimetk, without sacrificing customization or quality. And with plotly and plotnine engines, you can create interactive plots and beautiful static visualizations with just a few lines of code.\nFor calendar features, pytimetk offers augment_timeseries_signature() which cuts down on over 30 lines of pandas dt extractions. For rolling features, pytimetk offers augment_rolling(), which is 10X to 3500X faster than pandas. It also offers pad_by_time() to fill gaps in your time series data, and anomalize() to detect and correct anomalies in your time series data.\nJoin the revolution in time series analysis. 
Reduce your code complexity, increase your productivity, and harness the speed that pytimetk brings to your workflows.\nExplore more at our pytimetk homepage.\n\n\n2 🚀 Installation\nInstall the Latest Stable Version:\npip install pytimetk\nAlternatively, install the Development GitHub Version:\npip install git+https://github.com/business-science/pytimetk.git\n\n\n3 🏁 Quick Start: A Monthly Sales Analysis\nThis is a simple exercise to showcase the power of summarize_by_time():\n\nImport Libraries & Data\nFirst, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the “bike_sales_sample” dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains “orderlines” for orders recieved. The order_date column contains timestamps. We can use this column to peform sales aggregations (e.g. total revenue).\n\n\n\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows × 13 columns\n\n\n\n\n\nUsing summarize_by_time() for a Sales Analysis\nYour company might be interested in sales patterns for various categories of bicycles. We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandas’s groupby() method to group the DataFrame on category_1\nNext, use timetk’s summarize_by_time() method to apply the sum function my month start (“MS”) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default). The default engine is \"pandas\". 
Selecting engine = \"polars\" allows us to improve the speed of the function.\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False,\n engine = \"polars\"\n )\n\n# Quickly examine each column\nsummary_category_1_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 24 rows of 3 columns\ncategory_1: object ['Mountain', 'Mountain', 'Mountain', ...\norder_date: datetime64[ns] [Timestamp('2011-01-01 00:00:00'), T ...\ntotal_price_sum: int64 [221490, 660555, 358855, 1075975, 45 ...\n\n\n\n\nVisualizing Sales Patterns\n\n\n\n\n\n\nNow available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. This is useful in time series where we often deal with 100s of time series groups.\nThe default engine in “plotnine” for static plotting. Setting the engine = \"plotly\" returns an interactive plot.\n\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n smooth_frac = 0.8,\n engine = \"plotly\"\n )\n\n\n \n\n\n\n\n\n4 📚 Documentation\nNext step? Learn more with the pytimetk documentation\n\n📈 Overview\n🏁 Getting Started\n🗺️ Beginner Guides\n📘Applied Data Science Tutorials with PyTimeTK\n🏎️Speed Comparisons\n📄 API Reference\n\n\n\n5 🍻 Contributing\nInterested in helping us make this the best Python package for time series analysis? We’d love your help.\nFollow these instructions to Contribute.\n\n\n6 🏆 More Coming Soon…\nWe are in the early stages of development. But it’s obvious the potential for pytimetk now in Python. 🐍\n\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." - }, - { - "objectID": "getting-started/02_quick_start.html", - "href": "getting-started/02_quick_start.html", - "title": "Quick Start", - "section": "", - "text": "This is a simple exercise to showcase the power of our 2 most popular function:\n\nsummarize_by_time()\nplot_timeseries()\n\n\n\nFirst, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the “bike_sales_sample” dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains “orderlines” for orders recieved. The order_date column contains timestamps. We can use this column to peform sales aggregations (e.g. 
total revenue).\n\n\n\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows × 13 columns\n\n\n\n\n\n\nYour company might be interested in sales patterns for various categories of bicycles. We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandas’s groupby() method to group the DataFrame on category_1\nNext, use timetk’s summarize_by_time() method to apply the sum function my month start (“MS”) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default).\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\n\nCode\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False\n )\n\n# First 5 rows shown\nsummary_category_1_df.head()\n\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n\n\n1\nMountain\n2011-02-01\n660555\n\n\n2\nMountain\n2011-03-01\n358855\n\n\n3\nMountain\n2011-04-01\n1075975\n\n\n4\nMountain\n2011-05-01\n450440\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNow available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. 
This is useful in time series where we often deal with 100s of time series groups.\n\n\nCode\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price',\n smooth_frac = 0.8\n )" - }, - { - "objectID": "getting-started/02_quick_start.html#import-libraries-data", - "href": "getting-started/02_quick_start.html#import-libraries-data", - "title": "Quick Start", - "section": "", - "text": "First, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the “bike_sales_sample” dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains “orderlines” for orders recieved. The order_date column contains timestamps. We can use this column to peform sales aggregations (e.g. total revenue).\n\n\n\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows × 13 columns" - }, - { - "objectID": "getting-started/02_quick_start.html#using-summarize_by_time-for-a-sales-analysis", - "href": "getting-started/02_quick_start.html#using-summarize_by_time-for-a-sales-analysis", - "title": "Quick Start", - "section": "", - "text": "Your company might be interested in sales patterns for various categories of bicycles. 
We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandas’s groupby() method to group the DataFrame on category_1\nNext, use timetk’s summarize_by_time() method to apply the sum function my month start (“MS”) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default).\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\n\nCode\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False\n )\n\n# First 5 rows shown\nsummary_category_1_df.head()\n\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n\n\n1\nMountain\n2011-02-01\n660555\n\n\n2\nMountain\n2011-03-01\n358855\n\n\n3\nMountain\n2011-04-01\n1075975\n\n\n4\nMountain\n2011-05-01\n450440" - }, - { - "objectID": "getting-started/02_quick_start.html#visualizing-sales-patterns", - "href": "getting-started/02_quick_start.html#visualizing-sales-patterns", - "title": "Quick Start", - "section": "", - "text": "Now available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. This is useful in time series where we often deal with 100s of time series groups.\n\n\nCode\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price',\n smooth_frac = 0.8\n )" - }, - { - "objectID": "guides/02_timetk_concepts.html", - "href": "guides/02_timetk_concepts.html", - "title": "PyTimeTK Basics", - "section": "", - "text": "PyTimeTK has one mission: To make time series analysis simpler, easier, and faster in Python. This goal requires some opinionated ways of treating time series in Python. We will conceptually lay out how pytimetk can help.\nLet’s first start with how to think about time series data conceptually. Time series data has 3 core properties." - }, - { - "objectID": "guides/02_timetk_concepts.html#type-1-pandas-dataframe-operations", - "href": "guides/02_timetk_concepts.html#type-1-pandas-dataframe-operations", - "title": "PyTimeTK Basics", - "section": "2.1 Type 1: Pandas DataFrame Operations", - "text": "2.1 Type 1: Pandas DataFrame Operations\nBefore we start using pytimetk, let’s make sure our data is set up properly.\n\nTimetk Data Format Compliance\n\n\n\n\n\n\n3 Core Properties Must Be Upheald\n\n\n\n\n\nA pytimetk-Compliant Pandas DataFrame must have:\n\nTime Series Index: A Time Stamp column containing datetime64 values\nValue Column(s): The value column(s) containing float or int values\nGroup Column(s): Optionally for grouped time series analysis, one or more columns containg str or categorical values (shown as an object)\n\nIf these are NOT upheld, this will impact your ability to use pytimetk DataFrame operations.\n\n\n\n\n\n\n\n\n\nInspect the DataFrame\n\n\n\n\n\nUse the tk.glimpse() method to check compliance.\n\n\n\nUsing pytimetk glimpse() method, we can see that we have a compliant data frame with a date column containing datetime64 and a value column containing float64. 
For grouped analysis we have the id column containing object dtype.\n\n\nCode\n# Tip: Inspect for compliance with glimpse()\nm4_daily_df.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 9743 rows of 3 columns\nid: object ['D10', 'D10', 'D10', 'D10', 'D10', 'D10', 'D1 ...\ndate: datetime64[ns] [Timestamp('2014-07-03 00:00:00'), Timestamp(' ...\nvalue: float64 [2076.2, 2073.4, 2048.7, 2048.9, 2006.4, 2017. ...\n\n\n\n\nGrouped Time Series Analysis with Summarize By Time\nFirst, inspect how the summarize_by_time function works by calling help().\n\n\nCode\n# Review the summarize_by_time documentation (output not shown)\nhelp(tk.summarize_by_time)\n\n\n\n\n\n\n\n\nHelp Doc Info: summarize_by_time()\n\n\n\n\n\n\nThe first parameter is data, indicating this is a DataFrame operation.\nThe Examples show different use cases for how to apply the function on a DataFrame\n\n\n\n\nLet’s test the summarize_by_time() DataFrame operation out using the grouped approach with method chaining. DataFrame operations can be used as Pandas methods with method-chaining, which allows us to more succinctly apply time series operations.\n\n\nCode\n# Grouped Summarize By Time with Method Chaining\ndf_summarized = (\n m4_daily_df\n .groupby('id')\n .summarize_by_time(\n date_column = 'date',\n value_column = 'value',\n freq = 'QS', # QS = Quarter Start\n agg_func = [\n 'mean', \n 'median', \n 'min',\n ('q25', lambda x: np.quantile(x, 0.25)),\n ('q75', lambda x: np.quantile(x, 0.75)),\n 'max',\n ('range',lambda x: x.max() - x.min()),\n ],\n )\n)\n\ndf_summarized\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue_mean\nvalue_median\nvalue_min\nvalue_q25\nvalue_q75\nvalue_max\nvalue_range\n\n\n\n\n0\nD10\n2014-07-01\n1960.078889\n1979.90\n1781.6\n1915.225\n2002.575\n2076.2\n294.6\n\n\n1\nD10\n2014-10-01\n2184.586957\n2154.05\n2022.8\n2125.075\n2274.150\n2344.9\n322.1\n\n\n2\nD10\n2015-01-01\n2309.830000\n2312.30\n2209.6\n2284.575\n2342.150\n2392.4\n182.8\n\n\n3\nD10\n2015-04-01\n2344.481319\n2333.00\n2185.1\n2301.750\n2391.000\n2499.8\n314.7\n\n\n4\nD10\n2015-07-01\n2156.754348\n2186.70\n1856.6\n1997.250\n2289.425\n2368.1\n511.5\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n105\nD500\n2011-07-01\n9727.321739\n9745.55\n8964.5\n9534.125\n10003.900\n10463.9\n1499.4\n\n\n106\nD500\n2011-10-01\n8175.565217\n7897.00\n6755.0\n7669.875\n8592.575\n9860.0\n3105.0\n\n\n107\nD500\n2012-01-01\n8291.317582\n8412.60\n7471.5\n7814.800\n8677.850\n8980.7\n1509.2\n\n\n108\nD500\n2012-04-01\n8654.020879\n8471.10\n8245.6\n8389.850\n9017.250\n9349.2\n1103.6\n\n\n109\nD500\n2012-07-01\n8770.502353\n8690.50\n8348.1\n8604.400\n8846.000\n9545.3\n1197.2\n\n\n\n\n110 rows × 9 columns\n\n\n\n\n\n\n\n\n\nKey Takeaways: summarize_by_time()\n\n\n\n\n\n\nThe data must comply with the 3 core properties (date column, value column(s), and group column(s))\nThe aggregation functions were applied by combination of group (id) and resample (Quarter Start)\nThe result was a pandas DataFrame with group column, resampled date column, and summary values (mean, median, min, 25th-quantile, etc)\n\n\n\n\n\n\nAnother DataFrame Example: Creating 29 Engineered Features\nLet’s examine another DataFrame function, tk.augment_timeseries_signature(). 
Feel free to inspect the documentation with help(tk.augment_timeseries_signature).\n\n\nCode\n# Creating 29 engineered features from the date column\n# Not run: help(tk.augment_timeseries_signature)\ndf_augmented = (\n m4_daily_df\n .augment_timeseries_signature(date_column = 'date')\n)\n\ndf_augmented.head()\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n5 rows × 32 columns\n\n\n\n\n\n\n\n\n\nKey Takeaways: augment_timeseries_signature()\n\n\n\n\n\n\nThe data must comply with the 1 of the 3 core properties (date column)\nThe result was a pandas DataFrame with 29 time series features that can be used for Machine Learning and Forecasting\n\n\n\n\n\n\nMaking Future Dates with Future Frame\nA common time series task before forecasting with machine learning models is to make a future DataFrame some length_out into the future. You can do this with tk.future_frame(). Here’s how.\n\n\nCode\n# Preparing a time series data set for Machine Learning Forecasting\nfull_augmented_df = (\n m4_daily_df \n .groupby('id')\n .future_frame('date', length_out = 365)\n .augment_timeseries_signature('date')\n)\nfull_augmented_df\n\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n11198\nD500\n2013-09-19\nNaN\n1379548800\n2013\n2013\n0\n0\n0\n2\n...\n19\n81\n262\n0\n0\n0\n0\n0\n0\nam\n\n\n11199\nD500\n2013-09-20\nNaN\n1379635200\n2013\n2013\n0\n0\n0\n2\n...\n20\n82\n263\n0\n0\n0\n0\n0\n0\nam\n\n\n11200\nD500\n2013-09-21\nNaN\n1379721600\n2013\n2013\n0\n0\n0\n2\n...\n21\n83\n264\n0\n0\n0\n0\n0\n0\nam\n\n\n11201\nD500\n2013-09-22\nNaN\n1379808000\n2013\n2013\n0\n0\n0\n2\n...\n22\n84\n265\n1\n0\n0\n0\n0\n0\nam\n\n\n11202\nD500\n2013-09-23\nNaN\n1379894400\n2013\n2013\n0\n0\n0\n2\n...\n23\n85\n266\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n11203 rows × 32 columns\n\n\n\nWe can then get the future data by keying in on the data with value column that is missing (np.nan).\n\n\nCode\n# Get the future data (just the observations that 
haven't happened yet)\nfuture_df = (\n full_augmented_df\n .query('value.isna()')\n)\nfuture_df\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n9743\nD10\n2016-05-07\nNaN\n1462579200\n2016\n2016\n0\n0\n1\n1\n...\n7\n37\n128\n0\n0\n0\n0\n0\n0\nam\n\n\n9744\nD10\n2016-05-08\nNaN\n1462665600\n2016\n2016\n0\n0\n1\n1\n...\n8\n38\n129\n1\n0\n0\n0\n0\n0\nam\n\n\n9745\nD10\n2016-05-09\nNaN\n1462752000\n2016\n2016\n0\n0\n1\n1\n...\n9\n39\n130\n0\n0\n0\n0\n0\n0\nam\n\n\n9746\nD10\n2016-05-10\nNaN\n1462838400\n2016\n2016\n0\n0\n1\n1\n...\n10\n40\n131\n0\n0\n0\n0\n0\n0\nam\n\n\n9747\nD10\n2016-05-11\nNaN\n1462924800\n2016\n2016\n0\n0\n1\n1\n...\n11\n41\n132\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n11198\nD500\n2013-09-19\nNaN\n1379548800\n2013\n2013\n0\n0\n0\n2\n...\n19\n81\n262\n0\n0\n0\n0\n0\n0\nam\n\n\n11199\nD500\n2013-09-20\nNaN\n1379635200\n2013\n2013\n0\n0\n0\n2\n...\n20\n82\n263\n0\n0\n0\n0\n0\n0\nam\n\n\n11200\nD500\n2013-09-21\nNaN\n1379721600\n2013\n2013\n0\n0\n0\n2\n...\n21\n83\n264\n0\n0\n0\n0\n0\n0\nam\n\n\n11201\nD500\n2013-09-22\nNaN\n1379808000\n2013\n2013\n0\n0\n0\n2\n...\n22\n84\n265\n1\n0\n0\n0\n0\n0\nam\n\n\n11202\nD500\n2013-09-23\nNaN\n1379894400\n2013\n2013\n0\n0\n0\n2\n...\n23\n85\n266\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n1460 rows × 32 columns" - }, - { - "objectID": "guides/02_timetk_concepts.html#type-2-pandas-series-operations", - "href": "guides/02_timetk_concepts.html#type-2-pandas-series-operations", - "title": "PyTimeTK Basics", - "section": "2.2 Type 2: Pandas Series Operations", - "text": "2.2 Type 2: Pandas Series Operations\nThe main difference between a DataFrame operation and a Series operation is that we are operating on an array of values from typically one of the following dtypes:\n\nTimestamps (datetime64)\nNumeric (float64 or int64)\n\nThe first argument of Series operations that operate on Timestamps will always be idx.\nLet’s take a look at one shall we? We’ll start with a common action: Making future time series from an existing time series with a regular frequency.\n\nThe Make Future Time Series Function\nSay we have a monthly sequence of timestamps. What if we want to create a forecast where we predict 12 months into the future? Well, we will need to create 12 future timestamps. 
Here’s how.\nFirst create a pd.date_range() with dates starting at the beginning of each month.\n\n\nCode\n# Make a monthly date range\ndates_dt = pd.date_range(\"2023-01\", \"2024-01\", freq=\"MS\")\ndates_dt\n\n\nDatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',\n '2023-05-01', '2023-06-01', '2023-07-01', '2023-08-01',\n '2023-09-01', '2023-10-01', '2023-11-01', '2023-12-01',\n '2024-01-01'],\n dtype='datetime64[ns]', freq='MS')\n\n\nNext, use tk.make_future_timeseries() to create the next 12 timestamps in the sequence.\n\nPandas SeriesDateTimeIndex\n\n\n\n\nCode\n# Pandas Series: Future Dates\nfuture_series = pd.Series(dates_dt).make_future_timeseries(12)\nfuture_series\n\n\n0 2024-02-01\n1 2024-03-01\n2 2024-04-01\n3 2024-05-01\n4 2024-06-01\n5 2024-07-01\n6 2024-08-01\n7 2024-09-01\n8 2024-10-01\n9 2024-11-01\n10 2024-12-01\n11 2025-01-01\ndtype: datetime64[ns]\n\n\n\n\n\n\nCode\n# DateTimeIndex: Future Dates\nfuture_dt = tk.make_future_timeseries(\n idx = dates_dt,\n length_out = 12\n)\nfuture_dt\n\n\n0 2024-02-01\n1 2024-03-01\n2 2024-04-01\n3 2024-05-01\n4 2024-06-01\n5 2024-07-01\n6 2024-08-01\n7 2024-09-01\n8 2024-10-01\n9 2024-11-01\n10 2024-12-01\n11 2025-01-01\ndtype: datetime64[ns]\n\n\n\n\n\nWe can combine the actual and future timestamps into one combined timeseries.\n\n\nCode\n# Combining the 2 series and resetting the index\ncombined_timeseries = (\n pd.concat(\n [pd.Series(dates_dt), pd.Series(future_dt)],\n axis=0\n )\n .reset_index(drop = True)\n)\n\ncombined_timeseries\n\n\n0 2023-01-01\n1 2023-02-01\n2 2023-03-01\n3 2023-04-01\n4 2023-05-01\n5 2023-06-01\n6 2023-07-01\n7 2023-08-01\n8 2023-09-01\n9 2023-10-01\n10 2023-11-01\n11 2023-12-01\n12 2024-01-01\n13 2024-02-01\n14 2024-03-01\n15 2024-04-01\n16 2024-05-01\n17 2024-06-01\n18 2024-07-01\n19 2024-08-01\n20 2024-09-01\n21 2024-10-01\n22 2024-11-01\n23 2024-12-01\n24 2025-01-01\ndtype: datetime64[ns]\n\n\nNext, we’ll take a look at how to go from an irregular time series to a regular time series.\n\n\nFlooring Dates\nAn example is tk.floor_date, which is used to round down dates. See help(tk.floor_date).\nFlooring dates is often used as part of a strategy to go from an irregular time series to regular by combining with an aggregation. Often summarize_by_time() is used (I’ll share why shortly). But conceptually, date flooring is the secret.\n\nWith FlooringWithout Flooring\n\n\n\n\nCode\n# Monthly flooring rounds dates down to 1st of the month\nm4_daily_df['date'].floor_date(unit = \"M\")\n\n\n0 2014-07-01\n1 2014-07-01\n2 2014-07-01\n3 2014-07-01\n4 2014-07-01\n ... \n9738 2014-07-01\n9739 2014-07-01\n9740 2014-07-01\n9741 2014-07-01\n9742 2014-07-01\nName: date, Length: 9743, dtype: datetime64[ns]\n\n\n\n\n\n\nCode\n# Before Flooring\nm4_daily_df['date']\n\n\n0 2014-07-03\n1 2014-07-03\n2 2014-07-03\n3 2014-07-03\n4 2014-07-03\n ... 
\n9738 2014-07-03\n9739 2014-07-03\n9740 2014-07-03\n9741 2014-07-03\n9742 2014-07-03\nName: date, Length: 9743, dtype: datetime64[ns]\n\n\n\n\n\nThis “date flooring” operation can be useful for creating date groupings.\n\n\nCode\n# Adding a date group with floor_date()\ndates_grouped_by_month = (\n m4_daily_df\n .assign(date_group = lambda x: x['date'].floor_date(\"M\"))\n)\n\ndates_grouped_by_month\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_group\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2014-07-01\n\n\n1\nD10\n2014-07-03\n2073.4\n2014-07-01\n\n\n2\nD10\n2014-07-03\n2048.7\n2014-07-01\n\n\n3\nD10\n2014-07-03\n2048.9\n2014-07-01\n\n\n4\nD10\n2014-07-03\n2006.4\n2014-07-01\n\n\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2014-07-03\n9418.8\n2014-07-01\n\n\n9739\nD500\n2014-07-03\n9365.7\n2014-07-01\n\n\n9740\nD500\n2014-07-03\n9445.9\n2014-07-01\n\n\n9741\nD500\n2014-07-03\n9497.9\n2014-07-01\n\n\n9742\nD500\n2014-07-03\n9545.3\n2014-07-01\n\n\n\n\n9743 rows × 4 columns\n\n\n\nWe can then do grouped operations.\n\n\nCode\n# Example of a grouped operation with floored dates\nsummary_df = (\n dates_grouped_by_month\n .drop('date', axis=1) \\\n .groupby(['id', 'date_group'])\n .mean() \\\n .reset_index()\n)\n\nsummary_df\n\n\n\n\n\n\n\n\n\nid\ndate_group\nvalue\n\n\n\n\n0\nD10\n2014-07-01\n2261.606825\n\n\n1\nD160\n2014-07-01\n9243.155254\n\n\n2\nD410\n2014-07-01\n8259.786346\n\n\n3\nD500\n2014-07-01\n8287.728789\n\n\n\n\n\n\n\nOf course for this operation, we can do it faster with summarize_by_time() (and it’s much more flexible).\n\n\nCode\n# Summarize by time is less code and more flexible\n(\n m4_daily_df \n .groupby('id')\n .summarize_by_time(\n 'date', 'value', \n freq = \"MS\",\n agg_func = ['mean', 'median', 'min', 'max']\n )\n)\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue_mean\nvalue_median\nvalue_min\nvalue_max\n\n\n\n\n0\nD10\n2014-07-01\n2261.606825\n2302.30\n1781.60\n2649.30\n\n\n1\nD160\n2014-07-01\n9243.155254\n10097.30\n1734.90\n19432.50\n\n\n2\nD410\n2014-07-01\n8259.786346\n8382.81\n6309.38\n9540.62\n\n\n3\nD500\n2014-07-01\n8287.728789\n7662.10\n4172.10\n14954.10\n\n\n\n\n\n\n\nAnd that’s the core idea behind pytimetk, writing less code and getting more.\nNext, let’s do one more function. The brother of augment_timeseries_signature()…\n\n\nThe Get Time Series Signature Function\nThis function takes a pandas Series or DateTimeIndex and returns a DataFrame containing the 29 engineered features.\nStart with either a DateTimeIndex…\n\n\nCode\ntimestamps_dt = pd.date_range(\"2023\", \"2024\", freq = \"D\")\ntimestamps_dt\n\n\nDatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',\n '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',\n '2023-01-09', '2023-01-10',\n ...\n '2023-12-23', '2023-12-24', '2023-12-25', '2023-12-26',\n '2023-12-27', '2023-12-28', '2023-12-29', '2023-12-30',\n '2023-12-31', '2024-01-01'],\n dtype='datetime64[ns]', length=366, freq='D')\n\n\n… Or a Pandas Series.\n\n\nCode\ntimestamps_series = pd.Series(timestamps_dt)\ntimestamps_series\n\n\n0 2023-01-01\n1 2023-01-02\n2 2023-01-03\n3 2023-01-04\n4 2023-01-05\n ... 
\n361 2023-12-28\n362 2023-12-29\n363 2023-12-30\n364 2023-12-31\n365 2024-01-01\nLength: 366, dtype: datetime64[ns]\n\n\nAnd you can use the pandas Series function, tk.get_timeseries_signature() to create 29 features from the date sequence.\n\nPandas SeriesDateTimeIndex\n\n\n\n\nCode\n# Pandas series: get_timeseries_signature\ntimestamps_series.get_timeseries_signature()\n\n\n\n\n\n\n\n\n\nidx\nidx_index_num\nidx_year\nidx_year_iso\nidx_yearstart\nidx_yearend\nidx_leapyear\nidx_half\nidx_quarter\nidx_quarteryear\n...\nidx_mday\nidx_qday\nidx_yday\nidx_weekend\nidx_hour\nidx_minute\nidx_second\nidx_msecond\nidx_nsecond\nidx_am_pm\n\n\n\n\n0\n2023-01-01\n1672531200\n2023\n2022\n1\n0\n0\n1\n1\n2023Q1\n...\n1\n1\n1\n1\n0\n0\n0\n0\n0\nam\n\n\n1\n2023-01-02\n1672617600\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n2\n2\n2\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n2023-01-03\n1672704000\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n3\n3\n3\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n2023-01-04\n1672790400\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n4\n4\n4\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n2023-01-05\n1672876800\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n5\n5\n5\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n361\n2023-12-28\n1703721600\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n28\n89\n362\n0\n0\n0\n0\n0\n0\nam\n\n\n362\n2023-12-29\n1703808000\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n29\n90\n363\n0\n0\n0\n0\n0\n0\nam\n\n\n363\n2023-12-30\n1703894400\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n30\n91\n364\n0\n0\n0\n0\n0\n0\nam\n\n\n364\n2023-12-31\n1703980800\n2023\n2023\n0\n1\n0\n2\n4\n2023Q4\n...\n31\n92\n365\n1\n0\n0\n0\n0\n0\nam\n\n\n365\n2024-01-01\n1704067200\n2024\n2024\n1\n0\n1\n1\n1\n2024Q1\n...\n1\n1\n1\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n366 rows × 30 columns\n\n\n\n\n\n\n\nCode\n# DateTimeIndex: get_timeseries_signature\ntk.get_timeseries_signature(timestamps_dt)\n\n\n\n\n\n\n\n\n\nidx\nidx_index_num\nidx_year\nidx_year_iso\nidx_yearstart\nidx_yearend\nidx_leapyear\nidx_half\nidx_quarter\nidx_quarteryear\n...\nidx_mday\nidx_qday\nidx_yday\nidx_weekend\nidx_hour\nidx_minute\nidx_second\nidx_msecond\nidx_nsecond\nidx_am_pm\n\n\n\n\n0\n2023-01-01\n1672531200\n2023\n2022\n1\n0\n0\n1\n1\n2023Q1\n...\n1\n1\n1\n1\n0\n0\n0\n0\n0\nam\n\n\n1\n2023-01-02\n1672617600\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n2\n2\n2\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n2023-01-03\n1672704000\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n3\n3\n3\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n2023-01-04\n1672790400\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n4\n4\n4\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n2023-01-05\n1672876800\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n5\n5\n5\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n361\n2023-12-28\n1703721600\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n28\n89\n362\n0\n0\n0\n0\n0\n0\nam\n\n\n362\n2023-12-29\n1703808000\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n29\n90\n363\n0\n0\n0\n0\n0\n0\nam\n\n\n363\n2023-12-30\n1703894400\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n30\n91\n364\n0\n0\n0\n0\n0\n0\nam\n\n\n364\n2023-12-31\n1703980800\n2023\n2023\n0\n1\n0\n2\n4\n2023Q4\n...\n31\n92\n365\n1\n0\n0\n0\n0\n0\nam\n\n\n365\n2024-01-01\n1704067200\n2024\n2024\n1\n0\n1\n1\n1\n2024Q1\n...\n1\n1\n1\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n366 rows × 30 columns" - }, - { - "objectID": "guides/01_visualization.html", - "href": "guides/01_visualization.html", - "title": "Data Visualization", - "section": "", - 
"text": "How this guide benefits you\n\n\n\n\n\nThis guide covers how to use the plot_timeseries() for data visualization. Once you understand how it works, you can apply explore time series data easier than ever.\nThis tutorial focuses on, plot_timeseries(), a workhorse time-series plotting function that:" + "objectID": "tutorials/04_anomaly_detection.html#visualization-1-seasonal-decomposition-plot", + "href": "tutorials/04_anomaly_detection.html#visualization-1-seasonal-decomposition-plot", + "title": "Anomaly Detection in Website Traffic", + "section": "2.2 Visualization 1: Seasonal Decomposition Plot", + "text": "2.2 Visualization 1: Seasonal Decomposition Plot\nThe first step in my normal process is to analyze the seasonal decomposition. I want to see what the remainders look like, and make sure that the trend and seasonality are being removed such that the remainder is centered around zero.\n\n\n\n\n\n\nWhat to do when the remainders have trend or seasonality?\n\n\n\n\n\nWe’ll cover how to tweak the nobs of anomalize() in the next section aptly named “How to tweak the nobs on anomalize”.\n\n\n\n\nPlotlyPlotnine\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies_decomp(\n date_column = \"date\", \n width = 1800,\n height = 1000,\n engine = 'plotly'\n )\n\n\n\n \n\n\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies_decomp(\n date_column = \"date\", \n width = 1800,\n height = 1000,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1800 x 1000)>" }, { - "objectID": "guides/01_visualization.html#plotting-a-single-time-series", - "href": "guides/01_visualization.html#plotting-a-single-time-series", - "title": "Data Visualization", - "section": "2.1 Plotting a Single Time Series", - "text": "2.1 Plotting a Single Time Series\nLet’s start with a popular time series, taylor_30_min, which includes energy demand in megawatts at a sampling interval of 30-minutes. This is a single time series.\n\n\nCode\n# Import a Time Series Data Set\ntaylor_30_min = tk.load_dataset(\"taylor_30_min\", parse_dates = ['date'])\ntaylor_30_min\n\n\n\n\n\n\n\n\n\ndate\nvalue\n\n\n\n\n0\n2000-06-05 00:00:00+00:00\n22262\n\n\n1\n2000-06-05 00:30:00+00:00\n21756\n\n\n2\n2000-06-05 01:00:00+00:00\n22247\n\n\n3\n2000-06-05 01:30:00+00:00\n22759\n\n\n4\n2000-06-05 02:00:00+00:00\n22549\n\n\n...\n...\n...\n\n\n4027\n2000-08-27 21:30:00+00:00\n27946\n\n\n4028\n2000-08-27 22:00:00+00:00\n27133\n\n\n4029\n2000-08-27 22:30:00+00:00\n25996\n\n\n4030\n2000-08-27 23:00:00+00:00\n24610\n\n\n4031\n2000-08-27 23:30:00+00:00\n23132\n\n\n\n\n4032 rows × 2 columns\n\n\n\nThe plot_timeseries() function generates an interactive plotly chart by default.\n\nSimply provide the date variable (time-based column, date_column) and the numeric variable (value_column) that changes over time as the first 2 arguments.\nBy default, the plotting engine is plotly, which is interactive and excellent for data exploration and apps. 
However, if you require static plots for reports, you can set the engine to engine = ‘plotnine’ or engine = ‘matplotlib’.\n\nInteractive plot\n\n\nCode\ntaylor_30_min.plot_timeseries('date', 'value')\n\n\n\n \n\n\nStatic plot\n\n\nCode\ntaylor_30_min.plot_timeseries(\n 'date', 'value',\n engine = 'plotnine'\n)\n\n\n\n\n\n<Figure Size: (700 x 500)>" + "objectID": "tutorials/04_anomaly_detection.html#visualization-2-anomaly-detection-plot", + "href": "tutorials/04_anomaly_detection.html#visualization-2-anomaly-detection-plot", + "title": "Anomaly Detection in Website Traffic", + "section": "2.3 Visualization 2: Anomaly Detection Plot", + "text": "2.3 Visualization 2: Anomaly Detection Plot\nOnce I’m satisfied with the remainders, my next step is to visualize the anomalies. Here I’m looking to see if I need to grow or shrink the remainder l1 and l2 bands, which classify anomalies.\n\nPlotlyPlotnine\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies(\n date_column = \"date\", \n facet_ncol = 2, \n width = 1000,\n height = 1000,\n )\n\n\n\n \n\n\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies(\n date_column = \"date\", \n facet_ncol = 2, \n width = 1000,\n height = 1000,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1000 x 1000)>" }, { - "objectID": "guides/01_visualization.html#plotting-groups", - "href": "guides/01_visualization.html#plotting-groups", - "title": "Data Visualization", - "section": "2.2 Plotting Groups", - "text": "2.2 Plotting Groups\nNext, let’s move on to a dataset with time series groups, m4_monthly, which is a sample of 4 time series from the M4 competition that are sampled at a monthly frequency.\n\n\nCode\n# Import a Time Series Data Set\nm4_monthly = tk.load_dataset(\"m4_monthly\", parse_dates = ['date'])\nm4_monthly\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nM1\n1976-06-01\n8000\n\n\n1\nM1\n1976-07-01\n8350\n\n\n2\nM1\n1976-08-01\n8570\n\n\n3\nM1\n1976-09-01\n7700\n\n\n4\nM1\n1976-10-01\n7080\n\n\n...\n...\n...\n...\n\n\n1569\nM1000\n2015-02-01\n880\n\n\n1570\nM1000\n2015-03-01\n800\n\n\n1571\nM1000\n2015-04-01\n1140\n\n\n1572\nM1000\n2015-05-01\n970\n\n\n1573\nM1000\n2015-06-01\n1430\n\n\n\n\n1574 rows × 3 columns\n\n\n\nVisualizing grouped data is as simple as grouping the data set with groupby() before run it into the plot_timeseries() function. There are 2 methods:\n\nFacets\nPlotly Dropdown\n\n\nFacets (Subgroups on one plot)\nThis is great to see all time series in one plot. Here are the key points:\n\nGroups can be added using the pandas groupby().\nThese groups are then converted into facets.\nUsing facet_ncol = 2 returns a 2-column faceted plot.\nSetting facet_scales = \"free\" allows the x and y-axes of each plot to scale independently of the other plots.\n\n\n\nCode\nm4_monthly.groupby('id').plot_timeseries(\n 'date', 'value', \n facet_ncol = 2, \n facet_scales = \"free\"\n)\n\n\n\n \n\n\n\n\nPlotly Dropdown\nSometimes you have many groups and would prefer to see one plot per group. This can be accomplished with plotly_dropdown. You can adjust the x and y position as follows:\n\n\nCode\nm4_monthly.groupby('id').plot_timeseries(\n 'date', 'value', \n plotly_dropdown=True,\n plotly_dropdown_x=0,\n plotly_dropdown_y=1\n)\n\n\n\n \n\n\nThe groups can also be vizualized in the same plot using color_column paramenter. 
Let’s come back to taylor_30_min dataframe.\n\n\nCode\n# load data\ntaylor_30_min = tk.load_dataset(\"taylor_30_min\", parse_dates = ['date'])\n\n# extract the month using pandas\ntaylor_30_min['month'] = pd.to_datetime(taylor_30_min['date']).dt.month\n\n# plot groups\ntaylor_30_min.plot_timeseries(\n 'date', 'value', \n color_column = 'month'\n)" + "objectID": "tutorials/04_anomaly_detection.html#visualization-3-anomalies-cleaned-plot", + "href": "tutorials/04_anomaly_detection.html#visualization-3-anomalies-cleaned-plot", + "title": "Anomaly Detection in Website Traffic", + "section": "2.4 Visualization 3: Anomalies Cleaned Plot", + "text": "2.4 Visualization 3: Anomalies Cleaned Plot\nThere are pros and cons to cleaning anomalies. I’ll leave that discussion for another time. But, should you be interested in seeing what your data looks like cleaned (with outliers removed), this plot will help you compare before and after.\n\nPlotlyPlotnine\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies_cleaned(\n date_column = \"date\", \n facet_ncol = 2, \n width = 1000,\n height = 1000,\n engine = \"plotly\"\n )\n\n\n\n \n\n\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies_cleaned(\n date_column = \"date\", \n facet_ncol = 2, \n width = 1000,\n height = 1000,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1000 x 1000)>" }, { "objectID": "guides/04_wrangling.html", @@ -1540,760 +1400,949 @@ "text": "3.2 Pad by Time with Grouped Time Series\npad_by_time() can also be used with grouped time series. Let’s use the stocks_daily dataset to showcase an example:\n\n\nCode\n# load dataset\nstocks_df = tk.load_dataset('stocks_daily', parse_dates = ['date'])\n\n# pad by time\nstocks_df \\\n .groupby('symbol') \\\n .pad_by_time(\n date_column = 'date',\n freq = 'D'\n ) \\\n .assign(id = lambda x: x['symbol'].ffill())\n\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nid\n\n\n\n\n0\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\nAAPL\n\n\n1\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\nAAPL\n\n\n2\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\nAAPL\n\n\n3\nAAPL\n2013-01-05\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nAAPL\n\n\n4\nAAPL\n2013-01-06\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nAAPL\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n23485\nNVDA\n2023-09-17\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNVDA\n\n\n23486\nNVDA\n2023-09-18\n427.480011\n442.420013\n420.000000\n439.660004\n50027100.0\n439.660004\nNVDA\n\n\n23487\nNVDA\n2023-09-19\n438.329987\n439.660004\n430.019989\n435.200012\n37306400.0\n435.200012\nNVDA\n\n\n23488\nNVDA\n2023-09-20\n436.000000\n439.029999\n422.230011\n422.390015\n36710800.0\n422.390015\nNVDA\n\n\n23489\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000.0\n410.170013\nNVDA\n\n\n\n\n23490 rows × 9 columns\n\n\n\nTo replace NaN with 0 in a dataframe with multiple columns:\n\n\nCode\nfrom functools import partial\n\n# columns to replace NaN with 0\ncols_to_fill = ['open', 'high', 'low', 'close', 'volume', 'adjusted']\n\n# define a function to fillna\ndef fill_na_col(df, col):\n return df[col].fillna(0)\n\n# pad by time and replace NaN with 0\nstocks_df \\\n .groupby('symbol') \\\n .pad_by_time(\n date_column = 'date',\n freq = 'D'\n ) \\\n .assign(id = lambda x: x['symbol'].ffill()) \\\n .assign(**{col: partial(fill_na_col, col=col) for col in 
cols_to_fill})\n\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nid\n\n\n\n\n0\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\nAAPL\n\n\n1\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\nAAPL\n\n\n2\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\nAAPL\n\n\n3\nAAPL\n2013-01-05\n0.000000\n0.000000\n0.000000\n0.000000\n0.0\n0.000000\nAAPL\n\n\n4\nAAPL\n2013-01-06\n0.000000\n0.000000\n0.000000\n0.000000\n0.0\n0.000000\nAAPL\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n23485\nNVDA\n2023-09-17\n0.000000\n0.000000\n0.000000\n0.000000\n0.0\n0.000000\nNVDA\n\n\n23486\nNVDA\n2023-09-18\n427.480011\n442.420013\n420.000000\n439.660004\n50027100.0\n439.660004\nNVDA\n\n\n23487\nNVDA\n2023-09-19\n438.329987\n439.660004\n430.019989\n435.200012\n37306400.0\n435.200012\nNVDA\n\n\n23488\nNVDA\n2023-09-20\n436.000000\n439.029999\n422.230011\n422.390015\n36710800.0\n422.390015\nNVDA\n\n\n23489\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000.0\n410.170013\nNVDA\n\n\n\n\n23490 rows × 9 columns" }, { - "objectID": "tutorials/04_anomaly_detection.html", - "href": "tutorials/04_anomaly_detection.html", - "title": "Anomaly Detection in Website Traffic", + "objectID": "guides/01_visualization.html", + "href": "guides/01_visualization.html", + "title": "Data Visualization", "section": "", - "text": "Anomalize: Breakdown, identify, and clean anomalies in 1 easy step\nAnomalies, often called outliers, are data points that deviate significantly from the general trend or pattern in the data. In the context of time series, they can appear as sudden spikes, drops, or any abrupt change in a sequence of values.\nAnomaly detection for time series is a technique used to identify unusual patterns that do not conform to expected behavior. It is especially relevant for sequential data (like stock prices, sensor data, sales data, etc.) where the temporal aspect is crucial. Anomalies can identify important events or be the cause of noise that can hinder forecasting performance." + "text": "How this guide benefits you\n\n\n\n\n\nThis guide covers how to use the plot_timeseries() for data visualization. Once you understand how it works, you can apply explore time series data easier than ever.\nThis tutorial focuses on, plot_timeseries(), a workhorse time-series plotting function that:" }, { - "objectID": "tutorials/04_anomaly_detection.html#anomalize-breakdown-identify-and-clean-in-1-easy-step", - "href": "tutorials/04_anomaly_detection.html#anomalize-breakdown-identify-and-clean-in-1-easy-step", - "title": "Anomaly Detection in Website Traffic", - "section": "2.1 Anomalize: breakdown, identify, and clean in 1 easy step", - "text": "2.1 Anomalize: breakdown, identify, and clean in 1 easy step\nThe anomalize() function is a feature rich tool for performing anomaly detection. Anomalize is group-aware, so we can use this as part of a normal pandas groupby chain. 
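(A quick aside before stepping through anomalize(): the NaN-filling pattern in the pad_by_time() example above can be written without functools.partial. The snippet below is only a sketch of an alternative, not part of the guide; it assumes the stocks_df frame and the cols_to_fill list defined above and relies on pandas' built-in support for passing a column-to-value dict to fillna().)

Code
# Alternative to the functools.partial approach: fillna() accepts a
# dict mapping column names to fill values (assumes stocks_df from above).
cols_to_fill = ['open', 'high', 'low', 'close', 'volume', 'adjusted']

stocks_filled = (
    stocks_df
    .groupby('symbol')
    .pad_by_time(date_column='date', freq='D')
    .assign(id=lambda x: x['symbol'].ffill())
    .fillna({col: 0 for col in cols_to_fill})
)

Back to anomalize().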
In one easy step:\n\nWe breakdown (decompose) the time series\nAnalyze it’s remainder (residuals) for spikes (anomalies)\nClean the anomalies if desired\n\n\n\nCode\nanomalize_df = df \\\n .groupby('Page', sort = False) \\\n .anomalize(\n date_column = \"date\", \n value_column = \"value\", \n )\n\nanomalize_df.glimpse()\n\n\n\n\n\n<class 'pandas.core.frame.DataFrame'>: 5500 rows of 13 columns\nPage: object ['Death_of_Freddie_Gray_en.wikiped ...\ndate: datetime64[ns] [Timestamp('2015-07-01 00:00:00'), ...\nobserved: int64 [791, 704, 903, 732, 558, 504, 543 ...\nseasonal: float64 [206.78723511550484, 4.04332698700 ...\nseasadj: float64 [584.2127648844952, 699.9566730129 ...\ntrend: float64 [729.0301895900458, 726.0497757616 ...\nremainder: float64 [-144.8174247055506, -26.093102748 ...\nanomaly: object ['No', 'No', 'No', 'No', 'No', 'No ...\nanomaly_score: float64 [266.9421236324138, 148.2178016755 ...\nanomaly_direction: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nrecomposed_l1: float64 [266.05095141435606, 60.3266294574 ...\nrecomposed_l2: float64 [1849.8332958504716, 1644.10897389 ...\nobserved_clean: float64 [791.0, 704.0, 903.0, 732.0, 558.0 ...\n\n\n\n\n\n\n\n\nThe anomalize() function returns:\n\n\n\n\n\n\nThe original grouping and datetime columns.\nThe seasonal decomposition: observed, seasonal, seasadj, trend, and remainder. The objective is to remove trend and seasonality such that the remainder is stationary and representative of normal variation and anomalous variations.\nAnomaly identification and scoring: anomaly, anomaly_score, anomaly_direction. These identify the anomaly decision (Yes/No), score the anomaly as a distance from the centerline, and label the direction (-1 (down), zero (not anomalous), +1 (up)).\nRecomposition: recomposed_l1 and recomposed_l2. Think of these as the lower and upper bands. Any observed data that is below l1 or above l2 is anomalous.\nCleaned data: observed_clean. Cleaned data is automatically provided, which has the outliers replaced with data that is within the recomposed l1/l2 boundaries. With that said, you should always first seek to understand why data is being considered anomalous before simply removing outliers and using the cleaned data.\n\n\n\n\nThe most important aspect is that this data is ready to be visualized, inspected, and modifications can then be made to address any tweaks you would like to make." + "objectID": "guides/01_visualization.html#plotting-a-single-time-series", + "href": "guides/01_visualization.html#plotting-a-single-time-series", + "title": "Data Visualization", + "section": "2.1 Plotting a Single Time Series", + "text": "2.1 Plotting a Single Time Series\nLet’s start with a popular time series, taylor_30_min, which includes energy demand in megawatts at a sampling interval of 30-minutes. 
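(One more aside on the anomaly-detection output before returning to plotting: once anomalize() has run, it is often handy to pull out only the rows that were flagged. The snippet below is a sketch, not from the tutorial; it assumes the anomalize_df created above and uses the column names shown in its glimpse output, with the anomaly column holding the Yes/No decision described there.)

Code
# Review only the flagged points, highest anomaly score first
# (assumes anomalize_df from the anomalize() step above).
anomalies_only = (
    anomalize_df
    .query("anomaly == 'Yes'")
    .sort_values('anomaly_score', ascending=False)
    [['Page', 'date', 'observed', 'anomaly_score', 'anomaly_direction']]
)
anomalies_only.head()

Now, back to the taylor_30_min energy demand data.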
This is a single time series.\n\n\nCode\n# Import a Time Series Data Set\ntaylor_30_min = tk.load_dataset(\"taylor_30_min\", parse_dates = ['date'])\ntaylor_30_min\n\n\n\n\n\n\n\n\n\ndate\nvalue\n\n\n\n\n0\n2000-06-05 00:00:00+00:00\n22262\n\n\n1\n2000-06-05 00:30:00+00:00\n21756\n\n\n2\n2000-06-05 01:00:00+00:00\n22247\n\n\n3\n2000-06-05 01:30:00+00:00\n22759\n\n\n4\n2000-06-05 02:00:00+00:00\n22549\n\n\n...\n...\n...\n\n\n4027\n2000-08-27 21:30:00+00:00\n27946\n\n\n4028\n2000-08-27 22:00:00+00:00\n27133\n\n\n4029\n2000-08-27 22:30:00+00:00\n25996\n\n\n4030\n2000-08-27 23:00:00+00:00\n24610\n\n\n4031\n2000-08-27 23:30:00+00:00\n23132\n\n\n\n\n4032 rows × 2 columns\n\n\n\nThe plot_timeseries() function generates an interactive plotly chart by default.\n\nSimply provide the date variable (time-based column, date_column) and the numeric variable (value_column) that changes over time as the first 2 arguments.\nBy default, the plotting engine is plotly, which is interactive and excellent for data exploration and apps. However, if you require static plots for reports, you can set the engine to engine = ‘plotnine’ or engine = ‘matplotlib’.\n\nInteractive plot\n\n\nCode\ntaylor_30_min.plot_timeseries('date', 'value')\n\n\n\n \n\n\nStatic plot\n\n\nCode\ntaylor_30_min.plot_timeseries(\n 'date', 'value',\n engine = 'plotnine'\n)\n\n\n\n\n\n<Figure Size: (700 x 500)>" }, { - "objectID": "tutorials/04_anomaly_detection.html#visualization-1-seasonal-decomposition-plot", - "href": "tutorials/04_anomaly_detection.html#visualization-1-seasonal-decomposition-plot", - "title": "Anomaly Detection in Website Traffic", - "section": "2.2 Visualization 1: Seasonal Decomposition Plot", - "text": "2.2 Visualization 1: Seasonal Decomposition Plot\nThe first step in my normal process is to analyze the seasonal decomposition. 
I want to see what the remainders look like, and make sure that the trend and seasonality are being removed such that the remainder is centered around zero.\n\n\n\n\n\n\nWhat to do when the remainders have trend or seasonality?\n\n\n\n\n\nWe’ll cover how to tweak the nobs of anomalize() in the next section aptly named “How to tweak the nobs on anomalize”.\n\n\n\n\nPlotlyPlotnine\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies_decomp(\n date_column = \"date\", \n width = 1800,\n height = 1000,\n engine = 'plotly'\n )\n\n\n\n \n\n\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies_decomp(\n date_column = \"date\", \n width = 1800,\n height = 1000,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1800 x 1000)>" + "objectID": "guides/01_visualization.html#plotting-groups", + "href": "guides/01_visualization.html#plotting-groups", + "title": "Data Visualization", + "section": "2.2 Plotting Groups", + "text": "2.2 Plotting Groups\nNext, let’s move on to a dataset with time series groups, m4_monthly, which is a sample of 4 time series from the M4 competition that are sampled at a monthly frequency.\n\n\nCode\n# Import a Time Series Data Set\nm4_monthly = tk.load_dataset(\"m4_monthly\", parse_dates = ['date'])\nm4_monthly\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nM1\n1976-06-01\n8000\n\n\n1\nM1\n1976-07-01\n8350\n\n\n2\nM1\n1976-08-01\n8570\n\n\n3\nM1\n1976-09-01\n7700\n\n\n4\nM1\n1976-10-01\n7080\n\n\n...\n...\n...\n...\n\n\n1569\nM1000\n2015-02-01\n880\n\n\n1570\nM1000\n2015-03-01\n800\n\n\n1571\nM1000\n2015-04-01\n1140\n\n\n1572\nM1000\n2015-05-01\n970\n\n\n1573\nM1000\n2015-06-01\n1430\n\n\n\n\n1574 rows × 3 columns\n\n\n\nVisualizing grouped data is as simple as grouping the data set with groupby() before run it into the plot_timeseries() function. There are 2 methods:\n\nFacets\nPlotly Dropdown\n\n\nFacets (Subgroups on one plot)\nThis is great to see all time series in one plot. Here are the key points:\n\nGroups can be added using the pandas groupby().\nThese groups are then converted into facets.\nUsing facet_ncol = 2 returns a 2-column faceted plot.\nSetting facet_scales = \"free\" allows the x and y-axes of each plot to scale independently of the other plots.\n\n\n\nCode\nm4_monthly.groupby('id').plot_timeseries(\n 'date', 'value', \n facet_ncol = 2, \n facet_scales = \"free\"\n)\n\n\n\n \n\n\n\n\nPlotly Dropdown\nSometimes you have many groups and would prefer to see one plot per group. This can be accomplished with plotly_dropdown. You can adjust the x and y position as follows:\n\n\nCode\nm4_monthly.groupby('id').plot_timeseries(\n 'date', 'value', \n plotly_dropdown=True,\n plotly_dropdown_x=0,\n plotly_dropdown_y=1\n)\n\n\n\n \n\n\nThe groups can also be vizualized in the same plot using color_column paramenter. 
Let’s come back to taylor_30_min dataframe.\n\n\nCode\n# load data\ntaylor_30_min = tk.load_dataset(\"taylor_30_min\", parse_dates = ['date'])\n\n# extract the month using pandas\ntaylor_30_min['month'] = pd.to_datetime(taylor_30_min['date']).dt.month\n\n# plot groups\ntaylor_30_min.plot_timeseries(\n 'date', 'value', \n color_column = 'month'\n)" }, { - "objectID": "tutorials/04_anomaly_detection.html#visualization-2-anomaly-detection-plot", - "href": "tutorials/04_anomaly_detection.html#visualization-2-anomaly-detection-plot", - "title": "Anomaly Detection in Website Traffic", - "section": "2.3 Visualization 2: Anomaly Detection Plot", - "text": "2.3 Visualization 2: Anomaly Detection Plot\nOnce I’m satisfied with the remainders, my next step is to visualize the anomalies. Here I’m looking to see if I need to grow or shrink the remainder l1 and l2 bands, which classify anomalies.\n\nPlotlyPlotnine\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies(\n date_column = \"date\", \n facet_ncol = 2, \n width = 1000,\n height = 1000,\n )\n\n\n\n \n\n\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies(\n date_column = \"date\", \n facet_ncol = 2, \n width = 1000,\n height = 1000,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1000 x 1000)>" + "objectID": "guides/02_timetk_concepts.html", + "href": "guides/02_timetk_concepts.html", + "title": "PyTimeTK Basics", + "section": "", + "text": "PyTimeTK has one mission: To make time series analysis simpler, easier, and faster in Python. This goal requires some opinionated ways of treating time series in Python. We will conceptually lay out how pytimetk can help.\nLet’s first start with how to think about time series data conceptually. Time series data has 3 core properties." }, { - "objectID": "tutorials/04_anomaly_detection.html#visualization-3-anomalies-cleaned-plot", - "href": "tutorials/04_anomaly_detection.html#visualization-3-anomalies-cleaned-plot", - "title": "Anomaly Detection in Website Traffic", - "section": "2.4 Visualization 3: Anomalies Cleaned Plot", - "text": "2.4 Visualization 3: Anomalies Cleaned Plot\nThere are pros and cons to cleaning anomalies. I’ll leave that discussion for another time. 
But, should you be interested in seeing what your data looks like cleaned (with outliers removed), this plot will help you compare before and after.\n\nPlotlyPlotnine\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies_cleaned(\n date_column = \"date\", \n facet_ncol = 2, \n width = 1000,\n height = 1000,\n engine = \"plotly\"\n )\n\n\n\n \n\n\n\n\n\n\nCode\nanomalize_df \\\n .groupby(\"Page\") \\\n .plot_anomalies_cleaned(\n date_column = \"date\", \n facet_ncol = 2, \n width = 1000,\n height = 1000,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1000 x 1000)>" + "objectID": "guides/02_timetk_concepts.html#type-1-pandas-dataframe-operations", + "href": "guides/02_timetk_concepts.html#type-1-pandas-dataframe-operations", + "title": "PyTimeTK Basics", + "section": "2.1 Type 1: Pandas DataFrame Operations", + "text": "2.1 Type 1: Pandas DataFrame Operations\nBefore we start using pytimetk, let’s make sure our data is set up properly.\n\nTimetk Data Format Compliance\n\n\n\n\n\n\n3 Core Properties Must Be Upheald\n\n\n\n\n\nA pytimetk-Compliant Pandas DataFrame must have:\n\nTime Series Index: A Time Stamp column containing datetime64 values\nValue Column(s): The value column(s) containing float or int values\nGroup Column(s): Optionally for grouped time series analysis, one or more columns containg str or categorical values (shown as an object)\n\nIf these are NOT upheld, this will impact your ability to use pytimetk DataFrame operations.\n\n\n\n\n\n\n\n\n\nInspect the DataFrame\n\n\n\n\n\nUse the tk.glimpse() method to check compliance.\n\n\n\nUsing pytimetk glimpse() method, we can see that we have a compliant data frame with a date column containing datetime64 and a value column containing float64. For grouped analysis we have the id column containing object dtype.\n\n\nCode\n# Tip: Inspect for compliance with glimpse()\nm4_daily_df.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 9743 rows of 3 columns\nid: object ['D10', 'D10', 'D10', 'D10', 'D10', 'D10', 'D1 ...\ndate: datetime64[ns] [Timestamp('2014-07-03 00:00:00'), Timestamp(' ...\nvalue: float64 [2076.2, 2073.4, 2048.7, 2048.9, 2006.4, 2017. ...\n\n\n\n\nGrouped Time Series Analysis with Summarize By Time\nFirst, inspect how the summarize_by_time function works by calling help().\n\n\nCode\n# Review the summarize_by_time documentation (output not shown)\nhelp(tk.summarize_by_time)\n\n\n\n\n\n\n\n\nHelp Doc Info: summarize_by_time()\n\n\n\n\n\n\nThe first parameter is data, indicating this is a DataFrame operation.\nThe Examples show different use cases for how to apply the function on a DataFrame\n\n\n\n\nLet’s test the summarize_by_time() DataFrame operation out using the grouped approach with method chaining. 
DataFrame operations can be used as Pandas methods with method-chaining, which allows us to more succinctly apply time series operations.\n\n\nCode\n# Grouped Summarize By Time with Method Chaining\ndf_summarized = (\n m4_daily_df\n .groupby('id')\n .summarize_by_time(\n date_column = 'date',\n value_column = 'value',\n freq = 'QS', # QS = Quarter Start\n agg_func = [\n 'mean', \n 'median', \n 'min',\n ('q25', lambda x: np.quantile(x, 0.25)),\n ('q75', lambda x: np.quantile(x, 0.75)),\n 'max',\n ('range',lambda x: x.max() - x.min()),\n ],\n )\n)\n\ndf_summarized\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue_mean\nvalue_median\nvalue_min\nvalue_q25\nvalue_q75\nvalue_max\nvalue_range\n\n\n\n\n0\nD10\n2014-07-01\n1960.078889\n1979.90\n1781.6\n1915.225\n2002.575\n2076.2\n294.6\n\n\n1\nD10\n2014-10-01\n2184.586957\n2154.05\n2022.8\n2125.075\n2274.150\n2344.9\n322.1\n\n\n2\nD10\n2015-01-01\n2309.830000\n2312.30\n2209.6\n2284.575\n2342.150\n2392.4\n182.8\n\n\n3\nD10\n2015-04-01\n2344.481319\n2333.00\n2185.1\n2301.750\n2391.000\n2499.8\n314.7\n\n\n4\nD10\n2015-07-01\n2156.754348\n2186.70\n1856.6\n1997.250\n2289.425\n2368.1\n511.5\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n105\nD500\n2011-07-01\n9727.321739\n9745.55\n8964.5\n9534.125\n10003.900\n10463.9\n1499.4\n\n\n106\nD500\n2011-10-01\n8175.565217\n7897.00\n6755.0\n7669.875\n8592.575\n9860.0\n3105.0\n\n\n107\nD500\n2012-01-01\n8291.317582\n8412.60\n7471.5\n7814.800\n8677.850\n8980.7\n1509.2\n\n\n108\nD500\n2012-04-01\n8654.020879\n8471.10\n8245.6\n8389.850\n9017.250\n9349.2\n1103.6\n\n\n109\nD500\n2012-07-01\n8770.502353\n8690.50\n8348.1\n8604.400\n8846.000\n9545.3\n1197.2\n\n\n\n\n110 rows × 9 columns\n\n\n\n\n\n\n\n\n\nKey Takeaways: summarize_by_time()\n\n\n\n\n\n\nThe data must comply with the 3 core properties (date column, value column(s), and group column(s))\nThe aggregation functions were applied by combination of group (id) and resample (Quarter Start)\nThe result was a pandas DataFrame with group column, resampled date column, and summary values (mean, median, min, 25th-quantile, etc)\n\n\n\n\n\n\nAnother DataFrame Example: Creating 29 Engineered Features\nLet’s examine another DataFrame function, tk.augment_timeseries_signature(). 
Feel free to inspect the documentation with help(tk.augment_timeseries_signature).\n\n\nCode\n# Creating 29 engineered features from the date column\n# Not run: help(tk.augment_timeseries_signature)\ndf_augmented = (\n m4_daily_df\n .augment_timeseries_signature(date_column = 'date')\n)\n\ndf_augmented.head()\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n5 rows × 32 columns\n\n\n\n\n\n\n\n\n\nKey Takeaways: augment_timeseries_signature()\n\n\n\n\n\n\nThe data must comply with the 1 of the 3 core properties (date column)\nThe result was a pandas DataFrame with 29 time series features that can be used for Machine Learning and Forecasting\n\n\n\n\n\n\nMaking Future Dates with Future Frame\nA common time series task before forecasting with machine learning models is to make a future DataFrame some length_out into the future. You can do this with tk.future_frame(). Here’s how.\n\n\nCode\n# Preparing a time series data set for Machine Learning Forecasting\nfull_augmented_df = (\n m4_daily_df \n .groupby('id')\n .future_frame('date', length_out = 365)\n .augment_timeseries_signature('date')\n)\nfull_augmented_df\n\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n11198\nD500\n2013-09-19\nNaN\n1379548800\n2013\n2013\n0\n0\n0\n2\n...\n19\n81\n262\n0\n0\n0\n0\n0\n0\nam\n\n\n11199\nD500\n2013-09-20\nNaN\n1379635200\n2013\n2013\n0\n0\n0\n2\n...\n20\n82\n263\n0\n0\n0\n0\n0\n0\nam\n\n\n11200\nD500\n2013-09-21\nNaN\n1379721600\n2013\n2013\n0\n0\n0\n2\n...\n21\n83\n264\n0\n0\n0\n0\n0\n0\nam\n\n\n11201\nD500\n2013-09-22\nNaN\n1379808000\n2013\n2013\n0\n0\n0\n2\n...\n22\n84\n265\n1\n0\n0\n0\n0\n0\nam\n\n\n11202\nD500\n2013-09-23\nNaN\n1379894400\n2013\n2013\n0\n0\n0\n2\n...\n23\n85\n266\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n11203 rows × 32 columns\n\n\n\nWe can then get the future data by keying in on the data with value column that is missing (np.nan).\n\n\nCode\n# Get the future data (just the observations that 
haven't happened yet)\nfuture_df = (\n full_augmented_df\n .query('value.isna()')\n)\nfuture_df\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n9743\nD10\n2016-05-07\nNaN\n1462579200\n2016\n2016\n0\n0\n1\n1\n...\n7\n37\n128\n0\n0\n0\n0\n0\n0\nam\n\n\n9744\nD10\n2016-05-08\nNaN\n1462665600\n2016\n2016\n0\n0\n1\n1\n...\n8\n38\n129\n1\n0\n0\n0\n0\n0\nam\n\n\n9745\nD10\n2016-05-09\nNaN\n1462752000\n2016\n2016\n0\n0\n1\n1\n...\n9\n39\n130\n0\n0\n0\n0\n0\n0\nam\n\n\n9746\nD10\n2016-05-10\nNaN\n1462838400\n2016\n2016\n0\n0\n1\n1\n...\n10\n40\n131\n0\n0\n0\n0\n0\n0\nam\n\n\n9747\nD10\n2016-05-11\nNaN\n1462924800\n2016\n2016\n0\n0\n1\n1\n...\n11\n41\n132\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n11198\nD500\n2013-09-19\nNaN\n1379548800\n2013\n2013\n0\n0\n0\n2\n...\n19\n81\n262\n0\n0\n0\n0\n0\n0\nam\n\n\n11199\nD500\n2013-09-20\nNaN\n1379635200\n2013\n2013\n0\n0\n0\n2\n...\n20\n82\n263\n0\n0\n0\n0\n0\n0\nam\n\n\n11200\nD500\n2013-09-21\nNaN\n1379721600\n2013\n2013\n0\n0\n0\n2\n...\n21\n83\n264\n0\n0\n0\n0\n0\n0\nam\n\n\n11201\nD500\n2013-09-22\nNaN\n1379808000\n2013\n2013\n0\n0\n0\n2\n...\n22\n84\n265\n1\n0\n0\n0\n0\n0\nam\n\n\n11202\nD500\n2013-09-23\nNaN\n1379894400\n2013\n2013\n0\n0\n0\n2\n...\n23\n85\n266\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n1460 rows × 32 columns" }, { - "objectID": "tutorials/05_clustering.html", - "href": "tutorials/05_clustering.html", - "title": "Clustering", + "objectID": "guides/02_timetk_concepts.html#type-2-pandas-series-operations", + "href": "guides/02_timetk_concepts.html#type-2-pandas-series-operations", + "title": "PyTimeTK Basics", + "section": "2.2 Type 2: Pandas Series Operations", + "text": "2.2 Type 2: Pandas Series Operations\nThe main difference between a DataFrame operation and a Series operation is that we are operating on an array of values from typically one of the following dtypes:\n\nTimestamps (datetime64)\nNumeric (float64 or int64)\n\nThe first argument of Series operations that operate on Timestamps will always be idx.\nLet’s take a look at one shall we? We’ll start with a common action: Making future time series from an existing time series with a regular frequency.\n\nThe Make Future Time Series Function\nSay we have a monthly sequence of timestamps. What if we want to create a forecast where we predict 12 months into the future? Well, we will need to create 12 future timestamps. 
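(A short aside, since the future frame is now ready: one way this sets up machine-learning forecasting. This is a sketch rather than part of the guide; it assumes scikit-learn is installed and reuses the full_augmented_df and future_df frames from above, keeping only the numeric signature columns as model features.)

Code
# Fit a quick model on the historical rows and predict the future rows
# created by future_frame() (assumes scikit-learn is available).
from sklearn.ensemble import RandomForestRegressor

past_df = full_augmented_df.dropna(subset=['value'])

# Numeric signature features only (drops id, date, and text-like columns)
feature_cols = past_df.drop(columns=['value']).select_dtypes('number').columns

model = RandomForestRegressor(random_state=123)
model.fit(past_df[feature_cols], past_df['value'])

future_predictions = model.predict(future_df[feature_cols])

Returning to the Series operations: we still need a way to generate those 12 future timestamps.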
Here’s how.\nFirst create a pd.date_range() with dates starting at the beginning of each month.\n\n\nCode\n# Make a monthly date range\ndates_dt = pd.date_range(\"2023-01\", \"2024-01\", freq=\"MS\")\ndates_dt\n\n\nDatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',\n '2023-05-01', '2023-06-01', '2023-07-01', '2023-08-01',\n '2023-09-01', '2023-10-01', '2023-11-01', '2023-12-01',\n '2024-01-01'],\n dtype='datetime64[ns]', freq='MS')\n\n\nNext, use tk.make_future_timeseries() to create the next 12 timestamps in the sequence.\n\nPandas SeriesDateTimeIndex\n\n\n\n\nCode\n# Pandas Series: Future Dates\nfuture_series = pd.Series(dates_dt).make_future_timeseries(12)\nfuture_series\n\n\n0 2024-02-01\n1 2024-03-01\n2 2024-04-01\n3 2024-05-01\n4 2024-06-01\n5 2024-07-01\n6 2024-08-01\n7 2024-09-01\n8 2024-10-01\n9 2024-11-01\n10 2024-12-01\n11 2025-01-01\ndtype: datetime64[ns]\n\n\n\n\n\n\nCode\n# DateTimeIndex: Future Dates\nfuture_dt = tk.make_future_timeseries(\n idx = dates_dt,\n length_out = 12\n)\nfuture_dt\n\n\n0 2024-02-01\n1 2024-03-01\n2 2024-04-01\n3 2024-05-01\n4 2024-06-01\n5 2024-07-01\n6 2024-08-01\n7 2024-09-01\n8 2024-10-01\n9 2024-11-01\n10 2024-12-01\n11 2025-01-01\ndtype: datetime64[ns]\n\n\n\n\n\nWe can combine the actual and future timestamps into one combined timeseries.\n\n\nCode\n# Combining the 2 series and resetting the index\ncombined_timeseries = (\n pd.concat(\n [pd.Series(dates_dt), pd.Series(future_dt)],\n axis=0\n )\n .reset_index(drop = True)\n)\n\ncombined_timeseries\n\n\n0 2023-01-01\n1 2023-02-01\n2 2023-03-01\n3 2023-04-01\n4 2023-05-01\n5 2023-06-01\n6 2023-07-01\n7 2023-08-01\n8 2023-09-01\n9 2023-10-01\n10 2023-11-01\n11 2023-12-01\n12 2024-01-01\n13 2024-02-01\n14 2024-03-01\n15 2024-04-01\n16 2024-05-01\n17 2024-06-01\n18 2024-07-01\n19 2024-08-01\n20 2024-09-01\n21 2024-10-01\n22 2024-11-01\n23 2024-12-01\n24 2025-01-01\ndtype: datetime64[ns]\n\n\nNext, we’ll take a look at how to go from an irregular time series to a regular time series.\n\n\nFlooring Dates\nAn example is tk.floor_date, which is used to round down dates. See help(tk.floor_date).\nFlooring dates is often used as part of a strategy to go from an irregular time series to regular by combining with an aggregation. Often summarize_by_time() is used (I’ll share why shortly). But conceptually, date flooring is the secret.\n\nWith FlooringWithout Flooring\n\n\n\n\nCode\n# Monthly flooring rounds dates down to 1st of the month\nm4_daily_df['date'].floor_date(unit = \"M\")\n\n\n0 2014-07-01\n1 2014-07-01\n2 2014-07-01\n3 2014-07-01\n4 2014-07-01\n ... \n9738 2014-07-01\n9739 2014-07-01\n9740 2014-07-01\n9741 2014-07-01\n9742 2014-07-01\nName: date, Length: 9743, dtype: datetime64[ns]\n\n\n\n\n\n\nCode\n# Before Flooring\nm4_daily_df['date']\n\n\n0 2014-07-03\n1 2014-07-03\n2 2014-07-03\n3 2014-07-03\n4 2014-07-03\n ... 
\n9738 2014-07-03\n9739 2014-07-03\n9740 2014-07-03\n9741 2014-07-03\n9742 2014-07-03\nName: date, Length: 9743, dtype: datetime64[ns]\n\n\n\n\n\nThis “date flooring” operation can be useful for creating date groupings.\n\n\nCode\n# Adding a date group with floor_date()\ndates_grouped_by_month = (\n m4_daily_df\n .assign(date_group = lambda x: x['date'].floor_date(\"M\"))\n)\n\ndates_grouped_by_month\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_group\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2014-07-01\n\n\n1\nD10\n2014-07-03\n2073.4\n2014-07-01\n\n\n2\nD10\n2014-07-03\n2048.7\n2014-07-01\n\n\n3\nD10\n2014-07-03\n2048.9\n2014-07-01\n\n\n4\nD10\n2014-07-03\n2006.4\n2014-07-01\n\n\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2014-07-03\n9418.8\n2014-07-01\n\n\n9739\nD500\n2014-07-03\n9365.7\n2014-07-01\n\n\n9740\nD500\n2014-07-03\n9445.9\n2014-07-01\n\n\n9741\nD500\n2014-07-03\n9497.9\n2014-07-01\n\n\n9742\nD500\n2014-07-03\n9545.3\n2014-07-01\n\n\n\n\n9743 rows × 4 columns\n\n\n\nWe can then do grouped operations.\n\n\nCode\n# Example of a grouped operation with floored dates\nsummary_df = (\n dates_grouped_by_month\n .drop('date', axis=1) \\\n .groupby(['id', 'date_group'])\n .mean() \\\n .reset_index()\n)\n\nsummary_df\n\n\n\n\n\n\n\n\n\nid\ndate_group\nvalue\n\n\n\n\n0\nD10\n2014-07-01\n2261.606825\n\n\n1\nD160\n2014-07-01\n9243.155254\n\n\n2\nD410\n2014-07-01\n8259.786346\n\n\n3\nD500\n2014-07-01\n8287.728789\n\n\n\n\n\n\n\nOf course for this operation, we can do it faster with summarize_by_time() (and it’s much more flexible).\n\n\nCode\n# Summarize by time is less code and more flexible\n(\n m4_daily_df \n .groupby('id')\n .summarize_by_time(\n 'date', 'value', \n freq = \"MS\",\n agg_func = ['mean', 'median', 'min', 'max']\n )\n)\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue_mean\nvalue_median\nvalue_min\nvalue_max\n\n\n\n\n0\nD10\n2014-07-01\n2261.606825\n2302.30\n1781.60\n2649.30\n\n\n1\nD160\n2014-07-01\n9243.155254\n10097.30\n1734.90\n19432.50\n\n\n2\nD410\n2014-07-01\n8259.786346\n8382.81\n6309.38\n9540.62\n\n\n3\nD500\n2014-07-01\n8287.728789\n7662.10\n4172.10\n14954.10\n\n\n\n\n\n\n\nAnd that’s the core idea behind pytimetk, writing less code and getting more.\nNext, let’s do one more function. The brother of augment_timeseries_signature()…\n\n\nThe Get Time Series Signature Function\nThis function takes a pandas Series or DateTimeIndex and returns a DataFrame containing the 29 engineered features.\nStart with either a DateTimeIndex…\n\n\nCode\ntimestamps_dt = pd.date_range(\"2023\", \"2024\", freq = \"D\")\ntimestamps_dt\n\n\nDatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',\n '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',\n '2023-01-09', '2023-01-10',\n ...\n '2023-12-23', '2023-12-24', '2023-12-25', '2023-12-26',\n '2023-12-27', '2023-12-28', '2023-12-29', '2023-12-30',\n '2023-12-31', '2024-01-01'],\n dtype='datetime64[ns]', length=366, freq='D')\n\n\n… Or a Pandas Series.\n\n\nCode\ntimestamps_series = pd.Series(timestamps_dt)\ntimestamps_series\n\n\n0 2023-01-01\n1 2023-01-02\n2 2023-01-03\n3 2023-01-04\n4 2023-01-05\n ... 
\n361 2023-12-28\n362 2023-12-29\n363 2023-12-30\n364 2023-12-31\n365 2024-01-01\nLength: 366, dtype: datetime64[ns]\n\n\nAnd you can use the pandas Series function, tk.get_timeseries_signature() to create 29 features from the date sequence.\n\nPandas SeriesDateTimeIndex\n\n\n\n\nCode\n# Pandas series: get_timeseries_signature\ntimestamps_series.get_timeseries_signature()\n\n\n\n\n\n\n\n\n\nidx\nidx_index_num\nidx_year\nidx_year_iso\nidx_yearstart\nidx_yearend\nidx_leapyear\nidx_half\nidx_quarter\nidx_quarteryear\n...\nidx_mday\nidx_qday\nidx_yday\nidx_weekend\nidx_hour\nidx_minute\nidx_second\nidx_msecond\nidx_nsecond\nidx_am_pm\n\n\n\n\n0\n2023-01-01\n1672531200\n2023\n2022\n1\n0\n0\n1\n1\n2023Q1\n...\n1\n1\n1\n1\n0\n0\n0\n0\n0\nam\n\n\n1\n2023-01-02\n1672617600\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n2\n2\n2\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n2023-01-03\n1672704000\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n3\n3\n3\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n2023-01-04\n1672790400\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n4\n4\n4\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n2023-01-05\n1672876800\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n5\n5\n5\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n361\n2023-12-28\n1703721600\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n28\n89\n362\n0\n0\n0\n0\n0\n0\nam\n\n\n362\n2023-12-29\n1703808000\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n29\n90\n363\n0\n0\n0\n0\n0\n0\nam\n\n\n363\n2023-12-30\n1703894400\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n30\n91\n364\n0\n0\n0\n0\n0\n0\nam\n\n\n364\n2023-12-31\n1703980800\n2023\n2023\n0\n1\n0\n2\n4\n2023Q4\n...\n31\n92\n365\n1\n0\n0\n0\n0\n0\nam\n\n\n365\n2024-01-01\n1704067200\n2024\n2024\n1\n0\n1\n1\n1\n2024Q1\n...\n1\n1\n1\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n366 rows × 30 columns\n\n\n\n\n\n\n\nCode\n# DateTimeIndex: get_timeseries_signature\ntk.get_timeseries_signature(timestamps_dt)\n\n\n\n\n\n\n\n\n\nidx\nidx_index_num\nidx_year\nidx_year_iso\nidx_yearstart\nidx_yearend\nidx_leapyear\nidx_half\nidx_quarter\nidx_quarteryear\n...\nidx_mday\nidx_qday\nidx_yday\nidx_weekend\nidx_hour\nidx_minute\nidx_second\nidx_msecond\nidx_nsecond\nidx_am_pm\n\n\n\n\n0\n2023-01-01\n1672531200\n2023\n2022\n1\n0\n0\n1\n1\n2023Q1\n...\n1\n1\n1\n1\n0\n0\n0\n0\n0\nam\n\n\n1\n2023-01-02\n1672617600\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n2\n2\n2\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n2023-01-03\n1672704000\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n3\n3\n3\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n2023-01-04\n1672790400\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n4\n4\n4\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n2023-01-05\n1672876800\n2023\n2023\n0\n0\n0\n1\n1\n2023Q1\n...\n5\n5\n5\n0\n0\n0\n0\n0\n0\nam\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n361\n2023-12-28\n1703721600\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n28\n89\n362\n0\n0\n0\n0\n0\n0\nam\n\n\n362\n2023-12-29\n1703808000\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n29\n90\n363\n0\n0\n0\n0\n0\n0\nam\n\n\n363\n2023-12-30\n1703894400\n2023\n2023\n0\n0\n0\n2\n4\n2023Q4\n...\n30\n91\n364\n0\n0\n0\n0\n0\n0\nam\n\n\n364\n2023-12-31\n1703980800\n2023\n2023\n0\n1\n0\n2\n4\n2023Q4\n...\n31\n92\n365\n1\n0\n0\n0\n0\n0\nam\n\n\n365\n2024-01-01\n1704067200\n2024\n2024\n1\n0\n1\n1\n1\n2024Q1\n...\n1\n1\n1\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n366 rows × 30 columns" + }, + { + "objectID": "getting-started/02_quick_start.html", + "href": "getting-started/02_quick_start.html", + "title": "Quick Start", "section": "", - 
"text": "Coming soon…\n\n1 More Coming Soon…\nWe are in the early stages of development. But it’s obvious the potential for pytimetk now in Python. 🐍\n\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." + "text": "This is a simple exercise to showcase the power of our 2 most popular function:\n\nsummarize_by_time()\nplot_timeseries()\n\n\n\nFirst, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the “bike_sales_sample” dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains “orderlines” for orders recieved. The order_date column contains timestamps. We can use this column to peform sales aggregations (e.g. total revenue).\n\n\n\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows × 13 columns\n\n\n\n\n\n\nYour company might be interested in sales patterns for various categories of bicycles. 
We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandas’s groupby() method to group the DataFrame on category_1\nNext, use timetk’s summarize_by_time() method to apply the sum function my month start (“MS”) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default).\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\n\nCode\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False\n )\n\n# First 5 rows shown\nsummary_category_1_df.head()\n\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n\n\n1\nMountain\n2011-02-01\n660555\n\n\n2\nMountain\n2011-03-01\n358855\n\n\n3\nMountain\n2011-04-01\n1075975\n\n\n4\nMountain\n2011-05-01\n450440\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNow available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. This is useful in time series where we often deal with 100s of time series groups.\n\n\nCode\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price',\n smooth_frac = 0.8\n )" }, { - "objectID": "tutorials/01_sales_crm.html", - "href": "tutorials/01_sales_crm.html", - "title": "Sales Analysis", + "objectID": "getting-started/02_quick_start.html#import-libraries-data", + "href": "getting-started/02_quick_start.html#import-libraries-data", + "title": "Quick Start", "section": "", - "text": "In this tutorial, we will use pytimetk and its powerful functions to perform a time series analysis on a dataset representing bike sales. Our goal is to understand the patterns in the data and forecast future sales. You will:" + "text": "First, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the “bike_sales_sample” dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains “orderlines” for orders recieved. The order_date column contains timestamps. We can use this column to peform sales aggregations (e.g. 
total revenue).\n\n\n\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows × 13 columns" }, { - "objectID": "tutorials/01_sales_crm.html#load-packages.", - "href": "tutorials/01_sales_crm.html#load-packages.", - "title": "Sales Analysis", - "section": "1.1 Load Packages.", - "text": "1.1 Load Packages.\nIf you do not have pytimetk installed, you can install by using\npip install pytimetk\nor for the latest features and functionality, you can install the development version.\npip install git+https://github.com/business-science/pytimetk.git\n\n\nCode\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split" + "objectID": "getting-started/02_quick_start.html#using-summarize_by_time-for-a-sales-analysis", + "href": "getting-started/02_quick_start.html#using-summarize_by_time-for-a-sales-analysis", + "title": "Quick Start", + "section": "", + "text": "Your company might be interested in sales patterns for various categories of bicycles. 
We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandas’s groupby() method to group the DataFrame on category_1\nNext, use timetk’s summarize_by_time() method to apply the sum function my month start (“MS”) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default).\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\n\nCode\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False\n )\n\n# First 5 rows shown\nsummary_category_1_df.head()\n\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n\n\n1\nMountain\n2011-02-01\n660555\n\n\n2\nMountain\n2011-03-01\n358855\n\n\n3\nMountain\n2011-04-01\n1075975\n\n\n4\nMountain\n2011-05-01\n450440" }, { - "objectID": "tutorials/01_sales_crm.html#load-inspect-dataset", - "href": "tutorials/01_sales_crm.html#load-inspect-dataset", - "title": "Sales Analysis", - "section": "1.2 Load & inspect dataset", - "text": "1.2 Load & inspect dataset\nTo kick off our analysis, we’ll begin by importing essential libraries and accessing the ‘bike_sales’ dataset available within pytimetk’s suite of built-in datasets.\nThe Bike Sales dataset exemplifies what one might find in a CRM (Customer Relationship Management) system. CRM systems are pivotal for businesses, offering vital insights by tracking sales throughout the entire sales funnel. Such datasets are rich with transaction-level data, encompassing elements like order numbers, individual order lines, customer details, product information, and specific transaction data.\nTransactional data, such as this, inherently holds the essential components for time series analysis:\n\nTime Stamps\nAssociated Values\nDistinct Groups or Categories\n\nGiven these attributes, the Bike Sales dataset emerges as an ideal candidate for analysis using pytimetk." + "objectID": "getting-started/02_quick_start.html#visualizing-sales-patterns", + "href": "getting-started/02_quick_start.html#visualizing-sales-patterns", + "title": "Quick Start", + "section": "", + "text": "Now available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. This is useful in time series where we often deal with 100s of time series groups.\n\n\nCode\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price',\n smooth_frac = 0.8\n )" }, { - "objectID": "tutorials/01_sales_crm.html#initial-inspection-with-tk.glimpse", - "href": "tutorials/01_sales_crm.html#initial-inspection-with-tk.glimpse", - "title": "Sales Analysis", - "section": "2.1 Initial Inspection with tk.glimpse", - "text": "2.1 Initial Inspection with tk.glimpse\nTo get a preliminary understanding of our data, let’s utilize the tk.glimpse() function from pytimetk. 
This will provide us with a snapshot of the available fields, their respective data types, and a sneak peek into the data entries.\n\n\nCode\ndf = tk.datasets.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 2466 rows of 13 columns\norder_id: int64 [1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, ...\norder_line: int64 [1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, ...\norder_date: datetime64[ns] [Timestamp('2011-01-07 00:00:00'), Ti ...\nquantity: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, ...\nprice: int64 [6070, 5970, 2770, 5970, 10660, 3200, ...\ntotal_price: int64 [6070, 5970, 2770, 5970, 10660, 3200, ...\nmodel: object ['Jekyll Carbon 2', 'Trigger Carbon 2 ...\ncategory_1: object ['Mountain', 'Mountain', 'Mountain', ...\ncategory_2: object ['Over Mountain', 'Over Mountain', 'T ...\nframe_material: object ['Carbon', 'Carbon', 'Aluminum', 'Car ...\nbikeshop_name: object ['Ithaca Mountain Climbers', 'Ithaca ...\ncity: object ['Ithaca', 'Ithaca', 'Kansas City', ' ...\nstate: object ['NY', 'NY', 'KS', 'KS', 'KY', 'KY', ..." + "objectID": "index.html", + "href": "index.html", + "title": "PyTimeTK ", + "section": "", + "text": "Time series easier, faster, more fun. Pytimetk.\n\nPyTimetk’s Mission: To make time series analysis easier, faster, and more enjoyable in Python.\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\n\n1 Introducing pytimetk: Simplifying Time Series Analysis for Everyone\nTime series analysis is fundamental in many fields, from business forecasting to scientific research. While the Python ecosystem offers tools like pandas, they sometimes can be verbose and not optimized for all operations, especially for complex time-based aggregations and visualizations.\nEnter pytimetk. Crafted with a blend of ease-of-use and computational efficiency, pytimetk significantly simplifies the process of time series manipulation and visualization. By leveraging the polars backend, you can experience speed improvements ranging from 3X to a whopping 3500X. Let’s dive into a comparative analysis.\n\n\n\n\n\n\n\n\nFeatures/Properties\npytimetk\npandas (+matplotlib)\n\n\n\n\nSpeed\n🚀 3X to 500X Faster\n🐢 Standard\n\n\nCode Simplicity\n🎉 Concise, readable syntax\n📜 Often verbose\n\n\nplot_timeseries()\n🎨 2 lines, no customization\n🎨 16 lines, customization needed\n\n\nsummarize_by_time()\n🕐 2 lines, 13.4X faster\n🕐 6 lines, 2 for-loops\n\n\npad_by_time()\n⛳ 2 lines, fills gaps in timeseries\n❌ No equivalent\n\n\nanomalize()\n📈 2 lines, detects and corrects anomalies\n❌ No equivalent\n\n\naugment_timeseries_signature()\n📅 1 line, all calendar features\n🕐 30 lines of dt extractors\n\n\naugment_rolling()\n🏎️ 10X to 3500X faster\n🐢 Slow Rolling Operations\n\n\n\nAs evident from the table, pytimetk is not just about speed; it also simplifies your codebase. For example, summarize_by_time(), converts a 6-line, double for-loop routine in pandas into a concise 2-line operation. And with the polars engine, get results 13.4X faster than pandas!\nSimilarly, plot_timeseries() dramatically streamlines the plotting process, encapsulating what would typically require 16 lines of matplotlib code into a mere 2-line command in pytimetk, without sacrificing customization or quality. 
And with plotly and plotnine engines, you can create interactive plots and beautiful static visualizations with just a few lines of code.\nFor calendar features, pytimetk offers augment_timeseries_signature() which cuts down on over 30 lines of pandas dt extractions. For rolling features, pytimetk offers augment_rolling(), which is 10X to 3500X faster than pandas. It also offers pad_by_time() to fill gaps in your time series data, and anomalize() to detect and correct anomalies in your time series data.\nJoin the revolution in time series analysis. Reduce your code complexity, increase your productivity, and harness the speed that pytimetk brings to your workflows.\nExplore more at our pytimetk homepage.\n\n\n2 🚀 Installation\nInstall the Latest Stable Version:\npip install pytimetk\nAlternatively, install the Development GitHub Version:\npip install git+https://github.com/business-science/pytimetk.git\n\n\n3 🏁 Quick Start: A Monthly Sales Analysis\nThis is a simple exercise to showcase the power of summarize_by_time():\n\nImport Libraries & Data\nFirst, import pytimetk as tk. This gets you access to the most important functions. Use tk.load_dataset() to load the “bike_sales_sample” dataset.\n\n\n\n\n\n\nAbout the Bike Sales Sample Dataset\n\n\n\n\n\nThis dataset contains “orderlines” for orders recieved. The order_date column contains timestamps. We can use this column to peform sales aggregations (e.g. total revenue).\n\n\n\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample')\ndf['order_date'] = pd.to_datetime(df['order_date'])\n\ndf \n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows × 13 columns\n\n\n\n\n\nUsing summarize_by_time() for a Sales Analysis\nYour company might be interested in sales patterns for various categories of bicycles. 
We can obtain a grouped monthly sales aggregation by category_1 in two lines of code:\n\nFirst use pandas’s groupby() method to group the DataFrame on category_1\nNext, use timetk’s summarize_by_time() method to apply the sum function my month start (“MS”) and use wide_format = 'False' to return the dataframe in a long format (Note long format is the default). The default engine is \"pandas\". Selecting engine = \"polars\" allows us to improve the speed of the function.\n\nThe result is the total revenue for Mountain and Road bikes by month.\n\nsummary_category_1_df = df \\\n .groupby(\"category_1\") \\\n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = 'sum',\n wide_format = False,\n engine = \"polars\"\n )\n\n# Quickly examine each column\nsummary_category_1_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 24 rows of 3 columns\ncategory_1: object ['Mountain', 'Mountain', 'Mountain', ...\norder_date: datetime64[ns] [Timestamp('2011-01-01 00:00:00'), T ...\ntotal_price_sum: int64 [221490, 660555, 358855, 1075975, 45 ...\n\n\n\n\nVisualizing Sales Patterns\n\n\n\n\n\n\nNow available: plot_timeseries().\n\n\n\n\n\nPlot time series is a quick and easy way to visualize time series and make professional time series plots.\n\n\n\nWith the data summarized by time, we can visualize with plot_timeseries(). pytimetk functions are groupby() aware meaning they understand if your data is grouped to do things by group. This is useful in time series where we often deal with 100s of time series groups.\nThe default engine in “plotnine” for static plotting. Setting the engine = \"plotly\" returns an interactive plot.\n\nsummary_category_1_df \\\n .groupby('category_1') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n smooth_frac = 0.8,\n engine = \"plotly\"\n )\n\n\n \n\n\n\n\n\n4 📚 Documentation\nNext step? Learn more with the pytimetk documentation\n\n📈 Overview\n🏁 Getting Started\n🗺️ Beginner Guides\n📘Applied Data Science Tutorials with PyTimeTK\n🏎️Speed Comparisons\n📄 API Reference\n\n\n\n5 🍻 Contributing\nInterested in helping us make this the best Python package for time series analysis? We’d love your help.\nFollow these instructions to Contribute.\n\n\n6 🏆 More Coming Soon…\nWe are in the early stages of development. But it’s obvious the potential for pytimetk now in Python. 🐍\n\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." }, { - "objectID": "tutorials/01_sales_crm.html#data-exploration-with-tk.summarize_by_time", - "href": "tutorials/01_sales_crm.html#data-exploration-with-tk.summarize_by_time", - "title": "Sales Analysis", - "section": "2.2 Data Exploration with tk.summarize_by_time", - "text": "2.2 Data Exploration with tk.summarize_by_time\nCRM data is often bustling with activity, reflecting the myriad of transactions happening daily. Due to this high volume, the data can sometimes seem overwhelming or noisy. To derive meaningful insights, it’s essential to aggregate this data over specific time intervals. This is where tk.summarize_by_time() comes into play.\nThe tk.summarize_by_time() function offers a streamlined approach to time-based data aggregation. By defining a desired frequency and an aggregation method, this function seamlessly organizes your data. 
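For instance, changing the summary is usually a one-argument edit. A minimal sketch using the same df as above (the frequency string 'MS' is month start, and the aggregators are pandas-style built-ins):

Code
df.summarize_by_time(
    date_column  = 'order_date',
    value_column = 'total_price',
    freq         = 'MS',                      # month-start frequency
    agg_func     = ['sum', 'mean', 'median']  # several built-in aggregations at once
)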
The beauty of it is its versatility; from a broad array of built-in aggregation methods and frequencies to the flexibility of integrating a custom function, it caters to a range of requirements.\n\n\n\n\n\n\nGetting to know tk.summarize_by_time()\n\n\n\n\n\nCurious about the various options it provides?\n\nClick here to see our Data Wrangling Guide\nUse help(tk.summarize_by_time) to review additional helpful documentation. And explore the plethora of possibilities!\n\n\n\n\n\nGetting Weekly Totals\nWe can quickly get totals by week with summarize_byt_time.\n\n\nCode\nweekly_totals = df.summarize_by_time(\n date_column = 'order_date',\n value_column = 'total_price',\n agg_func = ['sum'],\n freq = 'W'\n)\n\nweekly_totals.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\n\n\n\n\n0\n2011-01-09\n12040\n\n\n1\n2011-01-16\n151460\n\n\n2\n2011-01-23\n143850\n\n\n3\n2011-01-30\n175665\n\n\n4\n2011-02-06\n105210\n\n\n5\n2011-02-13\n250390\n\n\n6\n2011-02-20\n410595\n\n\n7\n2011-02-27\n254045\n\n\n8\n2011-03-06\n308420\n\n\n9\n2011-03-13\n45450\n\n\n\n\n\n\n\n\n\nGet Weekly Totals by Group (Category 2)\nTo better understand your data, you might want to add groups to this summary. We can include a groupby before the summarize_by_time and then aggregate our data.\n\n\nCode\n sales_by_week = df \\\n .groupby('category_2') \\\n .summarize_by_time(\n date_column = 'order_date',\n value_column = 'total_price',\n agg_func = ['sum'],\n freq = 'W'\n )\n\nsales_by_week.head(10)\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\n\n\n\n\n0\nCross Country Race\n2011-01-16\n61750\n\n\n1\nCross Country Race\n2011-01-23\n25050\n\n\n2\nCross Country Race\n2011-01-30\n56860\n\n\n3\nCross Country Race\n2011-02-06\n8740\n\n\n4\nCross Country Race\n2011-02-13\n78070\n\n\n5\nCross Country Race\n2011-02-20\n115010\n\n\n6\nCross Country Race\n2011-02-27\n64290\n\n\n7\nCross Country Race\n2011-03-06\n95070\n\n\n8\nCross Country Race\n2011-03-13\n3200\n\n\n9\nCross Country Race\n2011-03-20\n21170\n\n\n\n\n\n\n\n\n\nLong vs Wide Format\nThis long format can make it a little hard to compare the different group values visually, so instead of long-format you might want to pivot wide to view the data.\n\n\nCode\nsales_by_week_wide = df \\\n .groupby('category_2') \\\n .summarize_by_time(\n date_column = 'order_date',\n value_column = 'total_price',\n agg_func = ['sum'],\n freq = 'W',\n wide_format = True\n )\n\nsales_by_week_wide.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum_Cross Country Race\ntotal_price_sum_Cyclocross\ntotal_price_sum_Elite Road\ntotal_price_sum_Endurance Road\ntotal_price_sum_Fat Bike\ntotal_price_sum_Over 
Mountain\ntotal_price_sum_Sport\ntotal_price_sum_Trail\ntotal_price_sum_Triathalon\n\n\n\n\n0\n2011-01-09\n0.0\n0.0\n0.0\n0.0\n0.0\n12040.0\n0.0\n0.0\n0.0\n\n\n1\n2011-01-16\n61750.0\n1960.0\n49540.0\n11110.0\n0.0\n9170.0\n4030.0\n7450.0\n6450.0\n\n\n2\n2011-01-23\n25050.0\n3500.0\n51330.0\n47930.0\n0.0\n3840.0\n0.0\n0.0\n12200.0\n\n\n3\n2011-01-30\n56860.0\n2450.0\n43895.0\n24160.0\n0.0\n10880.0\n3720.0\n26700.0\n7000.0\n\n\n4\n2011-02-06\n8740.0\n7000.0\n35640.0\n22680.0\n3730.0\n14270.0\n980.0\n10220.0\n1950.0\n\n\n5\n2011-02-13\n78070.0\n0.0\n83780.0\n24820.0\n2130.0\n17160.0\n6810.0\n17120.0\n20500.0\n\n\n6\n2011-02-20\n115010.0\n7910.0\n79770.0\n27650.0\n26100.0\n37830.0\n10925.0\n96250.0\n9150.0\n\n\n7\n2011-02-27\n64290.0\n6650.0\n86900.0\n31900.0\n5860.0\n22070.0\n6165.0\n16410.0\n13800.0\n\n\n8\n2011-03-06\n95070.0\n2450.0\n31990.0\n47660.0\n5860.0\n82060.0\n9340.0\n26790.0\n7200.0\n\n\n9\n2011-03-13\n3200.0\n4200.0\n23110.0\n7260.0\n0.0\n5970.0\n1710.0\n0.0\n0.0\n\n\n\n\n\n\n\nYou can now observe the total sales for each product side by side. This streamlined view facilitates easy comparison between product sales." + "objectID": "contributing.html", + "href": "contributing.html", + "title": "Contributing (Developer Setup)", + "section": "", + "text": "Interested in contributing?\n\n\n\n\n\nMake sure to Fork the GitHub Repo. Clone your fork. Then use poetry to install the pytimetk package.\n\n\n\n\n1 GitHub\nTo contribute, you’ll need to have a GitHub account. Then:\n\n1. Fork our pytimetk repository\nHead to our GitHub Repo and select “fork”. This makes a copied version of pytimetk for your personal use.\n\n\n2. Clone your forked version\nCloning will put your own personal version of pytimetk on your local machine. Make sure to replace [your_user_name] with your user name.\ngit clone https://github.com/[your_user_name]/pytimetk\n\n\n\n2 Poetry Environment Setup\nTo install pytimetk using Poetry, follow these steps:\n\n1. Prerequisites\nMake sure you have Python 3.9 or later installed on your system.\n\n\n2. Install Poetry\nTo install Poetry, you can use the official installer provided by Poetry. Do not use pip.\n\n\n3. Install Dependencies\nUse Poetry to install the package and its dependencies:\npoetry install\nor you can create a virtualenv with poetry and install the dependencies\npoetry shell\npoetry install\n\n\n\n3 Submit a Pull Request\n\n1. Make changes on a Branch\nMake changes in your local version on a branch where my-feature-branch is a branch you’d like to create that contains modifications.\ngit checkout -b my-feature-branch\n\n\n2. Push to your forked version of pytimetk\ngit push origin my-feature-branch\n\n\n3. Create a Pull Request\n\nGo to your forked repository on GitHub and switch to your branch.\nClick on “New pull request” and compare the changes you made with the original repository.\nFill out the pull request template with the necessary information, explaining your changes, the reason for them, and any other relevant information.\n\n\n\n4. Submit the Pull Request\n\nReview your changes and submit the pull request.\n\n\n\n\n4 Next Steps 🍻\nWe will review your PR. If all goes well, we’ll merge! And then you’ve just helped the community. 
🍻" }, { - "objectID": "tutorials/01_sales_crm.html#visualize-your-time-series-data-with-tk.plot_timeseries", - "href": "tutorials/01_sales_crm.html#visualize-your-time-series-data-with-tk.plot_timeseries", - "title": "Sales Analysis", - "section": "2.3 Visualize your time series data with tk.plot_timeseries", - "text": "2.3 Visualize your time series data with tk.plot_timeseries\nYou can now visualize the summarized data to gain a clearer insight into the prevailing trends.\n\nPlotlyPlotnine\n\n\n\n\nCode\nsales_by_week \\\n .groupby('category_2') \\\n .plot_timeseries(\n date_column = 'order_date', \n value_column = 'total_price_sum',\n title = 'Bike Sales by Category',\n facet_ncol = 2,\n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1000,\n height = 800,\n y_lab = 'Total Sales', \n engine = 'plotly'\n )\n\n\n\n \n\n\n\n\n\n\nCode\nsales_by_week \\\n .groupby('category_2') \\\n .plot_timeseries(\n date_column = 'order_date', \n value_column = 'total_price_sum',\n title = 'Bike Sales by Category',\n facet_ncol = 2,\n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1000,\n height = 800,\n y_lab = 'Total Sales', \n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1000 x 800)>\n\n\n\n\n\nThe graph showcases a pronounced uptick in sales for most of the different bike products during the summer. It’s a natural trend, aligning with our understanding that people gravitate towards biking during the balmy summer days. Conversely, as the chill of winter sets in at the year’s start and end, we observe a corresponding dip in sales.\nIt’s worth highlighting the elegance of the plot_timeseries function. Beyond just plotting raw data, it introduces a smoother, accentuating underlying trends and making them more discernible. This enhancement ensures we can effortlessly capture and comprehend the cyclical nature of bike sales throughout the year." + "objectID": "getting-started/01_installation.html", + "href": "getting-started/01_installation.html", + "title": "Install", + "section": "", + "text": "1 Quick Install\nLet’s get you up and running with pytimetk fast with the latest stable release.\npip install pytimetk\nYou can install from GitHub with this code.\npip install git+https://github.com/business-science/pytimetk.git\n\n\n2 Next steps\nCheck out the Quick Start Guide Next.\n\n\n3 More Coming Soon…\nWe are in the early stages of development. But it’s obvious the potential for pytimetk now in Python. 🐍\n\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." }, { - "objectID": "tutorials/01_sales_crm.html#making-irregular-data-regular-with-tk.pad_by_time", - "href": "tutorials/01_sales_crm.html#making-irregular-data-regular-with-tk.pad_by_time", - "title": "Sales Analysis", - "section": "3.1 Making irregular data regular with tk.pad_by_time", - "text": "3.1 Making irregular data regular with tk.pad_by_time\nKicking off our journey, we’ll utilize pytimetk’s tk.pad_by_time() function. For this, grouping by the ‘category_1’ variable is recommended. Moreover, it’s prudent to establish a definitive end date. This ensures that all groups are equipped with training data up to the most recent date, accommodating scenarios where certain categories might have seen no sales in the final training week. 
By doing so, we create a representative observation for every group, capturing the nuances of each category’s sales pattern.\n\n\nCode\nsales_padded = sales_by_week \\\n .groupby('category_2') \\\n .pad_by_time(\n date_column = 'order_date',\n freq = 'W',\n end_date = sales_by_week.order_date.max()\n )\nsales_padded\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\n\n\n\n\n0\nCross Country Race\n2011-01-09\nNaN\n\n\n1\nCross Country Race\n2011-01-16\n61750.0\n\n\n2\nCross Country Race\n2011-01-23\n25050.0\n\n\n3\nCross Country Race\n2011-01-30\n56860.0\n\n\n4\nCross Country Race\n2011-02-06\n8740.0\n\n\n...\n...\n...\n...\n\n\n463\nTriathalon\n2011-12-04\n3200.0\n\n\n464\nTriathalon\n2011-12-11\n28350.0\n\n\n465\nTriathalon\n2011-12-18\n2700.0\n\n\n466\nTriathalon\n2011-12-25\n3900.0\n\n\n467\nTriathalon\n2012-01-01\nNaN\n\n\n\n\n468 rows × 3 columns" + "objectID": "guides/03_pandas_frequency.html", + "href": "guides/03_pandas_frequency.html", + "title": "Pandas Frequencies", + "section": "", + "text": "How this guide benefits you\n\n\n\n\n\nThis guide covers how to use the pandas frequency strings within pytimetk. Once you understand key frequencies, you can apply them to manipulate time series data like a pro.\n\n\n\n\n1 Pandas Frequencies\nPandas offers a variety of frequency strings, also known as offset aliases, to define the frequency of a time series. Here are some common frequency strings used in pandas:\n\n‘B’: Business Day\n‘D’: Calendar day\n‘W’: Weekly\n‘M’: Month end\n‘BM’: Business month end\n‘MS’: Month start\n‘BMS’: Business month start\n‘Q’: Quarter end\n‘BQ’: Business quarter end\n‘QS’: Quarter start\n‘BQS’: Business quarter start\n‘A’ or ‘Y’: Year end\n‘BA’ or ‘BY’: Business year end\n‘AS’ or ‘YS’: Year start\n‘BAS’ or ‘BYS’: Business year start\n‘H’: Hourly\n‘T’ or ‘min’: Minutely\n‘S’: Secondly\n‘L’ or ‘ms’: Milliseconds\n‘U’: Microseconds\n‘N’: Nanoseconds\n\n\nCustom Frequencies:\n\nYou can also create custom frequencies by combining base frequencies, like:\n\n‘2D’: Every 2 days\n‘3W’: Every 3 weeks\n‘4H’: Every 4 hours\n‘1H30T’: Every 1 hour and 30 minutes\n\n\n\n\nCompound Frequencies:\n\nYou can combine multiple frequencies by adding them together.\n\n‘1D1H’: 1 day and 1 hour\n‘1H30T’: 1 hour and 30 minutes\n\n\n\n\nExample:\n\n\nCode\nimport pandas as pd\n\n# Creating a date range with daily frequency\ndate_range_daily = pd.date_range(start='2023-01-01', end='2023-01-10', freq='D')\n\ndate_range_daily\n\n\nDatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',\n '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',\n '2023-01-09', '2023-01-10'],\n dtype='datetime64[ns]', freq='D')\n\n\n\n\nCode\n# Creating a date range with 2 days frequency\ndate_range_two_days = pd.date_range(start='2023-01-01', end='2023-01-10', freq='2D')\n\ndate_range_two_days\n\n\nDatetimeIndex(['2023-01-01', '2023-01-03', '2023-01-05', '2023-01-07',\n '2023-01-09'],\n dtype='datetime64[ns]', freq='2D')\n\n\nThese frequency strings help in resampling, creating date ranges, and handling time-series data efficiently in pandas.\n\n\n\n2 Timetk Incorporates Pandas Frequencies\nNow that you’ve seen pandas frequencies, you’ll see them pop up in many of the pytimetk functions.\n\nExample: Padding Dates\nThis example shows how to use Pandas frequencies inside of pytimetk functions.\nWe’ll use pad_by_time to show how to use freq to fill in missing dates.\n\n\nCode\n# DataFrame with missing dates\nimport pandas as pd\n\ndata = {\n # '2023-09-05' is missing\n 'datetime': 
['2023-09-01', '2023-09-02', '2023-09-03', '2023-09-04', '2023-09-06'], \n 'value': [10, 30, 40, 50, 60]\n}\n\ndf = pd.DataFrame(data)\ndf['datetime'] = pd.to_datetime(df['datetime'])\ndf\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01\n10\n\n\n1\n2023-09-02\n30\n\n\n2\n2023-09-03\n40\n\n\n3\n2023-09-04\n50\n\n\n4\n2023-09-06\n60\n\n\n\n\n\n\n\nWe can resample to fill in the missing day using pad_by_time with freq = 'D'.\n\n\nCode\nimport pytimetk as tk\n\ndf.pad_by_time('datetime', freq = 'D')\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01\n10.0\n\n\n1\n2023-09-02\n30.0\n\n\n2\n2023-09-03\n40.0\n\n\n3\n2023-09-04\n50.0\n\n\n4\n2023-09-05\nNaN\n\n\n5\n2023-09-06\n60.0\n\n\n\n\n\n\n\nWhat about resampling every 12 hours? Just set `freq = ‘12H’.\n\n\nCode\nimport pytimetk as tk\n\ndf.pad_by_time('datetime', freq = '12H')\n\n\n\n\n\n\n\n\n\ndatetime\nvalue\n\n\n\n\n0\n2023-09-01 00:00:00\n10.0\n\n\n1\n2023-09-01 12:00:00\nNaN\n\n\n2\n2023-09-02 00:00:00\n30.0\n\n\n3\n2023-09-02 12:00:00\nNaN\n\n\n4\n2023-09-03 00:00:00\n40.0\n\n\n5\n2023-09-03 12:00:00\nNaN\n\n\n6\n2023-09-04 00:00:00\n50.0\n\n\n7\n2023-09-04 12:00:00\nNaN\n\n\n8\n2023-09-05 00:00:00\nNaN\n\n\n9\n2023-09-05 12:00:00\nNaN\n\n\n10\n2023-09-06 00:00:00\n60.0\n\n\n\n\n\n\n\nYou’ll see these pandas frequencies come up as the parameter freq in many pytimetk functions.\n\n\n\n3 Next Steps\nCheck out the Data Wrangling Guide next.\n\n\n4 More Coming Soon…\nWe are in the early stages of development. But it’s obvious the potential for pytimetk now in Python. 🐍\n\nPlease ⭐ us on GitHub (it takes 2-seconds and means a lot).\nTo make requests, please see our Project Roadmap GH Issue #2. You can make requests there.\nWant to contribute? See our contributing guide here." }, { - "objectID": "tutorials/01_sales_crm.html#making-future-dates-easier-with-tk.future_frame", - "href": "tutorials/01_sales_crm.html#making-future-dates-easier-with-tk.future_frame", - "title": "Sales Analysis", - "section": "3.2 Making Future Dates Easier with tk.future_frame", - "text": "3.2 Making Future Dates Easier with tk.future_frame\nMoving on, let’s set up the future frame, which will serve as our test dataset. To achieve this, employ the tk.future_frame() method. This function allows for the specification of a grouping column and a forecast horizon.\nUpon invoking tk.future_frame(), you’ll observe that placeholders (null values) are added for each group, extending 12 weeks into the future.\n\n\nCode\ndf_with_futureframe = sales_padded \\\n .groupby('category_2') \\\n .future_frame(\n date_column = 'order_date',\n length_out = 12\n )\ndf_with_futureframe\n\n\n\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\n\n\n\n\n0\nCross Country Race\n2011-01-09\nNaN\n\n\n1\nCross Country Race\n2011-01-16\n61750.0\n\n\n2\nCross Country Race\n2011-01-23\n25050.0\n\n\n3\nCross Country Race\n2011-01-30\n56860.0\n\n\n4\nCross Country Race\n2011-02-06\n8740.0\n\n\n...\n...\n...\n...\n\n\n571\nTriathalon\n2012-02-26\nNaN\n\n\n572\nTriathalon\n2012-03-04\nNaN\n\n\n573\nTriathalon\n2012-03-11\nNaN\n\n\n574\nTriathalon\n2012-03-18\nNaN\n\n\n575\nTriathalon\n2012-03-25\nNaN\n\n\n\n\n576 rows × 3 columns" + "objectID": "guides/06_anomalize.html", + "href": "guides/06_anomalize.html", + "title": "Anomaly Detection", + "section": "", + "text": "Anomaly detection in time series analysis is a crucial process for identifying unusual patterns that deviate from expected behavior. 
These anomalies can signify critical, often unforeseen events in time series data. Effective anomaly detection helps in maintaining the quality and reliability of data, ensuring accurate forecasting and decision-making. The challenge lies in distinguishing between true anomalies and natural fluctuations, which demands sophisticated analytical techniques and a deep understanding of the underlying time series patterns. As a result, anomaly detection is an essential component of time series analysis, driving the proactive management of risks and opportunities in dynamic environments.\nPytimetk uses the following methods to determine anomalies in time series data;\nThere are 2 common techniques for seasonal decomposition; STL and Twitter;" }, { - "objectID": "tutorials/01_sales_crm.html#lag-values-with-tk.augment_lags", - "href": "tutorials/01_sales_crm.html#lag-values-with-tk.augment_lags", - "title": "Sales Analysis", - "section": "3.3 Lag Values with tk.augment_lags", - "text": "3.3 Lag Values with tk.augment_lags\nCrafting features from time series data can be intricate, but thanks to the suite of feature engineering tools in pytimetk, the process is streamlined and intuitive.\nIn this guide, we’ll focus on the basics: introducing a few lag variables and incorporating some date-related features.\nFirstly, let’s dive into creating lag features.\nGiven our forecasting objective of a 12-week horizon, to ensure we have lag data available for every future point, we should utilize a lag of 12 or more. The beauty of the toolkit is that it supports the addition of multiple lags simultaneously.\nLag features play a pivotal role in machine learning for time series. Often, recent data offers valuable insights into future trends. To capture this recency effect, it’s crucial to integrate lag values. 
For this purpose, tk.augment_lags() comes in handy.\n\n\nCode\ndf_with_lags = df_with_futureframe \\\n .groupby('category_2') \\\n .augment_lags(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n lags = [12,24]\n\n )\ndf_with_lags.head(25)\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\n\n\n\n\n0\nCross Country Race\n2011-01-09\nNaN\nNaN\nNaN\n\n\n1\nCross Country Race\n2011-01-16\n61750.0\nNaN\nNaN\n\n\n2\nCross Country Race\n2011-01-23\n25050.0\nNaN\nNaN\n\n\n3\nCross Country Race\n2011-01-30\n56860.0\nNaN\nNaN\n\n\n4\nCross Country Race\n2011-02-06\n8740.0\nNaN\nNaN\n\n\n5\nCross Country Race\n2011-02-13\n78070.0\nNaN\nNaN\n\n\n6\nCross Country Race\n2011-02-20\n115010.0\nNaN\nNaN\n\n\n7\nCross Country Race\n2011-02-27\n64290.0\nNaN\nNaN\n\n\n8\nCross Country Race\n2011-03-06\n95070.0\nNaN\nNaN\n\n\n9\nCross Country Race\n2011-03-13\n3200.0\nNaN\nNaN\n\n\n10\nCross Country Race\n2011-03-20\n21170.0\nNaN\nNaN\n\n\n11\nCross Country Race\n2011-03-27\n28990.0\nNaN\nNaN\n\n\n12\nCross Country Race\n2011-04-03\n51860.0\nNaN\nNaN\n\n\n13\nCross Country Race\n2011-04-10\n85910.0\n61750.0\nNaN\n\n\n14\nCross Country Race\n2011-04-17\n138230.0\n25050.0\nNaN\n\n\n15\nCross Country Race\n2011-04-24\n138350.0\n56860.0\nNaN\n\n\n16\nCross Country Race\n2011-05-01\n136090.0\n8740.0\nNaN\n\n\n17\nCross Country Race\n2011-05-08\n32110.0\n78070.0\nNaN\n\n\n18\nCross Country Race\n2011-05-15\n139010.0\n115010.0\nNaN\n\n\n19\nCross Country Race\n2011-05-22\n2060.0\n64290.0\nNaN\n\n\n20\nCross Country Race\n2011-05-29\n26130.0\n95070.0\nNaN\n\n\n21\nCross Country Race\n2011-06-05\n30360.0\n3200.0\nNaN\n\n\n22\nCross Country Race\n2011-06-12\n88280.0\n21170.0\nNaN\n\n\n23\nCross Country Race\n2011-06-19\n109470.0\n28990.0\nNaN\n\n\n24\nCross Country Race\n2011-06-26\n107280.0\n51860.0\nNaN\n\n\n\n\n\n\n\nObserve that lag values of 12 and 24 introduce missing entries at the dataset’s outset. This occurs because there isn’t available data from 12 or 24 weeks prior. To address these gaps, you can adopt one of two strategies:\n\nDiscard the Affected Rows: This is a recommended approach if your dataset is sufficiently large. Removing a few initial rows might not significantly impact the training process.\nBackfill Missing Values: In situations with limited data, you might consider backfilling these nulls using the first available values from lag 12 and 24. However, the appropriateness of this technique hinges on your specific context and objectives.\n\nFor the scope of this tutorial, we’ll opt to remove these rows. 
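If you preferred the backfill strategy instead, a minimal sketch of a grouped backfill (an alternative this tutorial does not take) could look like this:

Code
# Backfill alternative (not used below): fill the leading lag NaNs within each category
lag_columns = [col for col in df_with_lags.columns if 'lag' in col]

df_backfilled = df_with_lags.copy()
df_backfilled[lag_columns] = (
    df_backfilled
        .groupby('category_2')[lag_columns]
        .bfill()
)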
However, it’s worth pointing out that our dataset is quite small with limited historical data, so this might impact our model.\n\n\nCode\nlag_columns = [col for col in df_with_lags.columns if 'lag' in col]\ndf_no_nas = df_with_lags \\\n .dropna(subset=lag_columns, inplace=False)\n\ndf_no_nas.head()\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\n\n\n\n\n25\nCross Country Race\n2011-07-03\n56430.0\n85910.0\n61750.0\n\n\n26\nCross Country Race\n2011-07-10\n62320.0\n138230.0\n25050.0\n\n\n27\nCross Country Race\n2011-07-17\n141620.0\n138350.0\n56860.0\n\n\n28\nCross Country Race\n2011-07-24\n75720.0\n136090.0\n8740.0\n\n\n29\nCross Country Race\n2011-07-31\n21240.0\n32110.0\n78070.0" + "objectID": "guides/06_anomalize.html#setup", + "href": "guides/06_anomalize.html#setup", + "title": "Anomaly Detection", + "section": "1.1 Setup", + "text": "1.1 Setup\nTo setup, import the necessary packages and the m4_daily_df dataset;\n\n# libraries\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Import Data\nm4_daily_df = tk.load_dataset('m4_daily', parse_dates = ['date'])\n\nLet’s first demonstrate with a single time series. We’ll filter m4_daily_df for id = D10 and date within the year 2015.\n\n# Data filtering\ndf = (\n m4_daily_df\n .query(\"id == 'D10'\")\n .query(\"date.dt.year == 2015\")\n)\n\nWe can plot this data to see the trend\n\n# Plot data\ntk.plot_timeseries(\n data = df,\n date_column = 'date',\n value_column = 'value'\n)" }, { - "objectID": "tutorials/01_sales_crm.html#date-features-with-tk.augment_timeseries_signature", - "href": "tutorials/01_sales_crm.html#date-features-with-tk.augment_timeseries_signature", - "title": "Sales Analysis", - "section": "3.4 Date Features with tk.augment_timeseries_signature", - "text": "3.4 Date Features with tk.augment_timeseries_signature\nNow, let’s enrich our dataset with date-related features.\nWith the function tk.augment_timeseries_signature(), you can effortlessly append 29 date attributes to a timestamp. Given that our dataset captures weekly intervals, certain attributes like ‘hour’ may not be pertinent. 
Thus, it’s prudent to refine our columns, retaining only those that truly matter to our analysis.\n\n\nCode\ndf_with_datefeatures = df_no_nas \\\n .augment_timeseries_signature(date_column='order_date')\n\ndf_with_datefeatures.head(10)\n\n\n\n\n\n\n\n\n\ncategory_2\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_index_num\norder_date_year\norder_date_year_iso\norder_date_yearstart\norder_date_yearend\n...\norder_date_mday\norder_date_qday\norder_date_yday\norder_date_weekend\norder_date_hour\norder_date_minute\norder_date_second\norder_date_msecond\norder_date_nsecond\norder_date_am_pm\n\n\n\n\n25\nCross Country Race\n2011-07-03\n56430.0\n85910.0\n61750.0\n1309651200\n2011\n2011\n0\n0\n...\n3\n3\n184\n1\n0\n0\n0\n0\n0\nam\n\n\n26\nCross Country Race\n2011-07-10\n62320.0\n138230.0\n25050.0\n1310256000\n2011\n2011\n0\n0\n...\n10\n10\n191\n1\n0\n0\n0\n0\n0\nam\n\n\n27\nCross Country Race\n2011-07-17\n141620.0\n138350.0\n56860.0\n1310860800\n2011\n2011\n0\n0\n...\n17\n17\n198\n1\n0\n0\n0\n0\n0\nam\n\n\n28\nCross Country Race\n2011-07-24\n75720.0\n136090.0\n8740.0\n1311465600\n2011\n2011\n0\n0\n...\n24\n24\n205\n1\n0\n0\n0\n0\n0\nam\n\n\n29\nCross Country Race\n2011-07-31\n21240.0\n32110.0\n78070.0\n1312070400\n2011\n2011\n0\n0\n...\n31\n31\n212\n1\n0\n0\n0\n0\n0\nam\n\n\n30\nCross Country Race\n2011-08-07\n11620.0\n139010.0\n115010.0\n1312675200\n2011\n2011\n0\n0\n...\n7\n38\n219\n1\n0\n0\n0\n0\n0\nam\n\n\n31\nCross Country Race\n2011-08-14\n9730.0\n2060.0\n64290.0\n1313280000\n2011\n2011\n0\n0\n...\n14\n45\n226\n1\n0\n0\n0\n0\n0\nam\n\n\n32\nCross Country Race\n2011-08-21\n22780.0\n26130.0\n95070.0\n1313884800\n2011\n2011\n0\n0\n...\n21\n52\n233\n1\n0\n0\n0\n0\n0\nam\n\n\n33\nCross Country Race\n2011-08-28\n53680.0\n30360.0\n3200.0\n1314489600\n2011\n2011\n0\n0\n...\n28\n59\n240\n1\n0\n0\n0\n0\n0\nam\n\n\n34\nCross Country Race\n2011-09-04\n38360.0\n88280.0\n21170.0\n1315094400\n2011\n2011\n0\n0\n...\n4\n66\n247\n1\n0\n0\n0\n0\n0\nam\n\n\n\n\n10 rows × 34 columns\n\n\n\nWe can quickly get a sense of what features were just created using tk.glimpse.\n\n\nCode\ndf_with_datefeatures.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 341 rows of 34 columns\ncategory_2: object ['Cross Country Race', 'Cros ...\norder_date: datetime64[ns] [Timestamp('2011-07-03 00:00 ...\ntotal_price_sum: float64 [56430.0, 62320.0, 141620.0, ...\ntotal_price_sum_lag_12: float64 [85910.0, 138230.0, 138350.0 ...\ntotal_price_sum_lag_24: float64 [61750.0, 25050.0, 56860.0, ...\norder_date_index_num: int64 [1309651200, 1310256000, 131 ...\norder_date_year: int64 [2011, 2011, 2011, 2011, 201 ...\norder_date_year_iso: UInt32 [2011, 2011, 2011, 2011, 201 ...\norder_date_yearstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_yearend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_leapyear: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_half: int64 [2, 2, 2, 2, 2, 2, 2, 2, 2, ...\norder_date_quarter: int64 [3, 3, 3, 3, 3, 3, 3, 3, 3, ...\norder_date_quarteryear: object ['2011Q3', '2011Q3', '2011Q3 ...\norder_date_quarterstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_quarterend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_month: int64 [7, 7, 7, 7, 7, 8, 8, 8, 8, ...\norder_date_month_lbl: object ['July', 'July', 'July', 'Ju ...\norder_date_monthstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_monthend: uint8 [0, 0, 0, 0, 1, 0, 0, 0, 0, ...\norder_date_yweek: UInt32 [26, 27, 28, 29, 30, 31, 32, ...\norder_date_mweek: int64 [1, 2, 3, 4, 5, 1, 2, 3, 
4, ...\norder_date_wday: int64 [7, 7, 7, 7, 7, 7, 7, 7, 7, ...\norder_date_wday_lbl: object ['Sunday', 'Sunday', 'Sunday ...\norder_date_mday: int64 [3, 10, 17, 24, 31, 7, 14, 2 ...\norder_date_qday: int64 [3, 10, 17, 24, 31, 38, 45, ...\norder_date_yday: int64 [184, 191, 198, 205, 212, 21 ...\norder_date_weekend: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, ...\norder_date_hour: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_minute: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_second: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_msecond: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_nsecond: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\norder_date_am_pm: object ['am', 'am', 'am', 'am', 'am ...\n\n\nLet’s subset to just a few of the relevant date features. Let’s use tk.glimpse again.\n\n\nCode\ndf_with_datefeatures_narrom = df_with_datefeatures[[\n 'order_date', \n 'category_2', \n 'total_price_sum',\n 'total_price_sum_lag_12',\n 'total_price_sum_lag_24',\n 'order_date_year', \n 'order_date_half', \n 'order_date_quarter', \n 'order_date_month',\n 'order_date_yweek'\n]]\n\ndf_with_datefeatures_narrom.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 341 rows of 10 columns\norder_date: datetime64[ns] [Timestamp('2011-07-03 00:00: ...\ncategory_2: object ['Cross Country Race', 'Cross ...\ntotal_price_sum: float64 [56430.0, 62320.0, 141620.0, ...\ntotal_price_sum_lag_12: float64 [85910.0, 138230.0, 138350.0, ...\ntotal_price_sum_lag_24: float64 [61750.0, 25050.0, 56860.0, 8 ...\norder_date_year: int64 [2011, 2011, 2011, 2011, 2011 ...\norder_date_half: int64 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2 ...\norder_date_quarter: int64 [3, 3, 3, 3, 3, 3, 3, 3, 3, 3 ...\norder_date_month: int64 [7, 7, 7, 7, 7, 8, 8, 8, 8, 9 ...\norder_date_yweek: UInt32 [26, 27, 28, 29, 30, 31, 32, ...\n\n\n\nOne-Hot Encoding\nThe final phase in our feature engineering journey is one-hot encoding our categorical variables. While certain machine learning models like CatBoost can natively handle categorical data, many cannot. 
Enter one-hot encoding, a technique that transforms each category within a column into its separate column, marking its presence with a ‘1’ or absence with a ‘0’.\nFor this transformation, the handy pd.get_dummies() function from pandas comes to the rescue.\n\n\nCode\ndf_encoded = pd.get_dummies(df_with_datefeatures_narrom, columns=['category_2'])\n\ndf_encoded.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 341 rows of 18 columns\norder_date: datetime64[ns] [Timestamp('2011-07-03 ...\ntotal_price_sum: float64 [56430.0, 62320.0, 141 ...\ntotal_price_sum_lag_12: float64 [85910.0, 138230.0, 13 ...\ntotal_price_sum_lag_24: float64 [61750.0, 25050.0, 568 ...\norder_date_year: int64 [2011, 2011, 2011, 201 ...\norder_date_half: int64 [2, 2, 2, 2, 2, 2, 2, ...\norder_date_quarter: int64 [3, 3, 3, 3, 3, 3, 3, ...\norder_date_month: int64 [7, 7, 7, 7, 7, 8, 8, ...\norder_date_yweek: UInt32 [26, 27, 28, 29, 30, 3 ...\ncategory_2_Cross Country Race: uint8 [1, 1, 1, 1, 1, 1, 1, ...\ncategory_2_Cyclocross: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Elite Road: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Endurance Road: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Fat Bike: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Over Mountain: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Sport: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Trail: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Triathalon: uint8 [0, 0, 0, 0, 0, 0, 0, ...\n\n\n\n\nTraining and Future Feature Sets\nPytimetk offers an extensive array of feature engineering tools and augmentation functions, giving you a broad spectrum of possibilities. However, for the purposes of this tutorial, let’s shift our focus to modeling.\nLet’s proceed by segmenting our dataframe into training and future sets.\n\n\nCode\nfuture = df_encoded[df_encoded.total_price_sum.isnull()]\ntrain = df_encoded[df_encoded.total_price_sum.notnull()]\n\n\nLet’s focus on the columns essential for training. You’ll observe that we’ve excluded the ‘order_date’ column. This is because numerous machine learning models struggle with date data types. This is precisely why we utilized the tk.augment_timeseries_signature earlier—to transform date features into a format that’s compatible with ML models.\nWe can quickly see what features we have available with tk.glimpse().\n\n\nCode\ntrain.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 233 rows of 18 columns\norder_date: datetime64[ns] [Timestamp('2011-07-03 ...\ntotal_price_sum: float64 [56430.0, 62320.0, 141 ...\ntotal_price_sum_lag_12: float64 [85910.0, 138230.0, 13 ...\ntotal_price_sum_lag_24: float64 [61750.0, 25050.0, 568 ...\norder_date_year: int64 [2011, 2011, 2011, 201 ...\norder_date_half: int64 [2, 2, 2, 2, 2, 2, 2, ...\norder_date_quarter: int64 [3, 3, 3, 3, 3, 3, 3, ...\norder_date_month: int64 [7, 7, 7, 7, 7, 8, 8, ...\norder_date_yweek: UInt32 [26, 27, 28, 29, 30, 3 ...\ncategory_2_Cross Country Race: uint8 [1, 1, 1, 1, 1, 1, 1, ...\ncategory_2_Cyclocross: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Elite Road: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Endurance Road: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Fat Bike: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Over Mountain: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Sport: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Trail: uint8 [0, 0, 0, 0, 0, 0, 0, ...\ncategory_2_Triathalon: uint8 [0, 0, 0, 0, 0, 0, 0, ..." 
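As a small aside, the feature list used for modeling in the next section can also be built programmatically instead of typed out by hand (a sketch; it simply drops the timestamp and target columns):

Code
# Everything except the timestamp and the target is a candidate feature
train_columns = [
    col for col in train.columns
    if col not in ('order_date', 'total_price_sum')
]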
+ "objectID": "guides/06_anomalize.html#seasonal-decomposition-remainder", + "href": "guides/06_anomalize.html#seasonal-decomposition-remainder", + "title": "Anomaly Detection", + "section": "1.2 Seasonal Decomposition & Remainder", + "text": "1.2 Seasonal Decomposition & Remainder\nFirst we perform seasonal decomposition and on the data and generate remainders using anomalize().\n\n\n\n\n\n\nHelp Doc Info: anomalize()\n\n\n\n\n\nUse help(tk.anomalize) to review additional helpful documentation.\n\n\n\n\n# Anomalize\nanomalize_df = tk.anomalize(\n data = df,\n date_column = 'date',\n value_column = 'value',\n period = 7,\n iqr_alpha = 0.05, # using the default\n clean_alpha = 0.75, # using the default\n clean = \"min_max\"\n)\n\nanomalize_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 365 rows of 12 columns\ndate: datetime64[ns] [Timestamp('2015-01-01 00:00:00'), ...\nobserved: float64 [2351.0, 2302.7, 2300.7, 2341.2, 2 ...\nseasonal: float64 [14.163009085035995, -17.341946034 ...\nseasadj: float64 [2336.836990914964, 2320.041946034 ...\ntrend: float64 [2323.900317851228, 2322.996460334 ...\nremainder: float64 [12.93667306373618, -2.95451429904 ...\nanomaly: object ['No', 'No', 'No', 'No', 'No', 'No ...\nanomaly_score: float64 [19.42215274680143, 35.31334010958 ...\nanomaly_direction: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nrecomposed_l1: float64 [2179.860403909094, 2147.451591271 ...\nrecomposed_l2: float64 [2560.9839015845087, 2528.57508894 ...\nobserved_clean: float64 [2351.0, 2302.7, 2300.7, 2341.2, 2 ..." }, { - "objectID": "tutorials/01_sales_crm.html#scikit-learn-model", - "href": "tutorials/01_sales_crm.html#scikit-learn-model", - "title": "Sales Analysis", - "section": "3.5 Scikit Learn Model", - "text": "3.5 Scikit Learn Model\nNow for some machine learning.\n\nFitting a Random Forest Regressor\nLet’s create a RandomForestRegressor to predict future sales patterns.\n\ntrain_columns = [ 'total_price_sum_lag_12',\n 'total_price_sum_lag_24', 'order_date_year', 'order_date_half',\n 'order_date_quarter', 'order_date_month', 'order_date_yweek','category_2_Cross Country Race', 'category_2_Cyclocross',\n 'category_2_Elite Road', 'category_2_Endurance Road',\n 'category_2_Fat Bike', 'category_2_Over Mountain', 'category_2_Sport',\n 'category_2_Trail', 'category_2_Triathalon']\nX = train[train_columns]\ny = train[['total_price_sum']]\n\nmodel = RandomForestRegressor(random_state=123)\nmodel = model.fit(X, y)\n\n\n\nPrediction\nWe now have a fitted model, and can use this to predict sales from our future frame.\n\n\nCode\npredicted_values = model.predict(future[train_columns])\nfuture['y_pred'] = predicted_values\n\nfuture.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_year\norder_date_half\norder_date_quarter\norder_date_month\norder_date_yweek\ncategory_2_Cross Country Race\ncategory_2_Cyclocross\ncategory_2_Elite Road\ncategory_2_Endurance Road\ncategory_2_Fat Bike\ncategory_2_Over 
Mountain\ncategory_2_Sport\ncategory_2_Trail\ncategory_2_Triathalon\ny_pred\n\n\n\n\n468\n2012-01-08\nNaN\n51820.0\n75720.0\n2012\n1\n1\n1\n1\n1\n0\n0\n0\n0\n0\n0\n0\n0\n59462.00\n\n\n469\n2012-01-15\nNaN\n62940.0\n21240.0\n2012\n1\n1\n1\n2\n1\n0\n0\n0\n0\n0\n0\n0\n0\n59149.45\n\n\n470\n2012-01-22\nNaN\n9060.0\n11620.0\n2012\n1\n1\n1\n3\n1\n0\n0\n0\n0\n0\n0\n0\n0\n20458.40\n\n\n471\n2012-01-29\nNaN\n15980.0\n9730.0\n2012\n1\n1\n1\n4\n1\n0\n0\n0\n0\n0\n0\n0\n0\n31914.00\n\n\n472\n2012-02-05\nNaN\n59180.0\n22780.0\n2012\n1\n1\n2\n5\n1\n0\n0\n0\n0\n0\n0\n0\n0\n59128.95\n\n\n473\n2012-02-12\nNaN\n132550.0\n53680.0\n2012\n1\n1\n2\n6\n1\n0\n0\n0\n0\n0\n0\n0\n0\n76397.50\n\n\n474\n2012-02-19\nNaN\n68430.0\n38360.0\n2012\n1\n1\n2\n7\n1\n0\n0\n0\n0\n0\n0\n0\n0\n63497.80\n\n\n475\n2012-02-26\nNaN\n29470.0\n90290.0\n2012\n1\n1\n2\n8\n1\n0\n0\n0\n0\n0\n0\n0\n0\n57332.00\n\n\n476\n2012-03-04\nNaN\n71080.0\n7380.0\n2012\n1\n1\n3\n9\n1\n0\n0\n0\n0\n0\n0\n0\n0\n60981.30\n\n\n477\n2012-03-11\nNaN\n9800.0\n0.0\n2012\n1\n1\n3\n10\n1\n0\n0\n0\n0\n0\n0\n0\n0\n18738.15\n\n\n\n\n\n\n\n\n\nCleaning Up\nNow let us do a little cleanup. For ease in plotting later, let’s add a column to track the actuals vs. the predicted values.\n\n\nCode\ntrain['type'] = 'actuals'\nfuture['type'] = 'prediction'\n\nfull_df = pd.concat([train, future])\n\nfull_df.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_year\norder_date_half\norder_date_quarter\norder_date_month\norder_date_yweek\ncategory_2_Cross Country Race\ncategory_2_Cyclocross\ncategory_2_Elite Road\ncategory_2_Endurance Road\ncategory_2_Fat Bike\ncategory_2_Over Mountain\ncategory_2_Sport\ncategory_2_Trail\ncategory_2_Triathalon\ntype\ny_pred\n\n\n\n\n25\n2011-07-03\n56430.0\n85910.0\n61750.0\n2011\n2\n3\n7\n26\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n26\n2011-07-10\n62320.0\n138230.0\n25050.0\n2011\n2\n3\n7\n27\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n27\n2011-07-17\n141620.0\n138350.0\n56860.0\n2011\n2\n3\n7\n28\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n28\n2011-07-24\n75720.0\n136090.0\n8740.0\n2011\n2\n3\n7\n29\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n29\n2011-07-31\n21240.0\n32110.0\n78070.0\n2011\n2\n3\n7\n30\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n30\n2011-08-07\n11620.0\n139010.0\n115010.0\n2011\n2\n3\n8\n31\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n31\n2011-08-14\n9730.0\n2060.0\n64290.0\n2011\n2\n3\n8\n32\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n32\n2011-08-21\n22780.0\n26130.0\n95070.0\n2011\n2\n3\n8\n33\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n33\n2011-08-28\n53680.0\n30360.0\n3200.0\n2011\n2\n3\n8\n34\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n34\n2011-09-04\n38360.0\n88280.0\n21170.0\n2011\n2\n3\n9\n35\n1\n0\n0\n0\n0\n0\n0\n0\n0\nactuals\nNaN\n\n\n\n\n\n\n\nYou can get the grouping category back from the one-hot encoding for easier plotting. 
For simplicity, we will search for any column with ‘category’ in its name.\n\n\nCode\n# Extract dummy columns\ndummy_cols = [col for col in full_df.columns if 'category' in col.lower() ]\nfull_df_reverted = full_df.copy()\n\n# Convert dummy columns back to categorical column\nfull_df_reverted['category'] = full_df_reverted[dummy_cols].idxmax(axis=1).str.replace(\"A_\", \"\")\n\n# Drop dummy columns\nfull_df_reverted = full_df_reverted.drop(columns=dummy_cols)\n\nfull_df_reverted.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_year\norder_date_half\norder_date_quarter\norder_date_month\norder_date_yweek\ntype\ny_pred\ncategory\n\n\n\n\n25\n2011-07-03\n56430.0\n85910.0\n61750.0\n2011\n2\n3\n7\n26\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n26\n2011-07-10\n62320.0\n138230.0\n25050.0\n2011\n2\n3\n7\n27\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n27\n2011-07-17\n141620.0\n138350.0\n56860.0\n2011\n2\n3\n7\n28\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n28\n2011-07-24\n75720.0\n136090.0\n8740.0\n2011\n2\n3\n7\n29\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n29\n2011-07-31\n21240.0\n32110.0\n78070.0\n2011\n2\n3\n7\n30\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n30\n2011-08-07\n11620.0\n139010.0\n115010.0\n2011\n2\n3\n8\n31\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n31\n2011-08-14\n9730.0\n2060.0\n64290.0\n2011\n2\n3\n8\n32\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n32\n2011-08-21\n22780.0\n26130.0\n95070.0\n2011\n2\n3\n8\n33\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n33\n2011-08-28\n53680.0\n30360.0\n3200.0\n2011\n2\n3\n8\n34\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n34\n2011-09-04\n38360.0\n88280.0\n21170.0\n2011\n2\n3\n9\n35\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n\n\n\n\n\n\n\nPre-Visualization Wrangling\nBefore we proceed to visualization, let’s streamline our dataset by aligning our predicted values with the actuals. This approach will simplify the plotting process. 
Given that our DataFrame columns are already labeled as ‘actuals’ and ‘predictions’, a brief conditional check will allow us to consolidate the necessary values.\n\n\nCode\nfull_df_reverted['total_price_sum'] = np.where(full_df_reverted.type =='actuals', full_df_reverted.total_price_sum, full_df_reverted.y_pred)\n\nfull_df_reverted.head(10)\n\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum\ntotal_price_sum_lag_12\ntotal_price_sum_lag_24\norder_date_year\norder_date_half\norder_date_quarter\norder_date_month\norder_date_yweek\ntype\ny_pred\ncategory\n\n\n\n\n25\n2011-07-03\n56430.0\n85910.0\n61750.0\n2011\n2\n3\n7\n26\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n26\n2011-07-10\n62320.0\n138230.0\n25050.0\n2011\n2\n3\n7\n27\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n27\n2011-07-17\n141620.0\n138350.0\n56860.0\n2011\n2\n3\n7\n28\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n28\n2011-07-24\n75720.0\n136090.0\n8740.0\n2011\n2\n3\n7\n29\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n29\n2011-07-31\n21240.0\n32110.0\n78070.0\n2011\n2\n3\n7\n30\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n30\n2011-08-07\n11620.0\n139010.0\n115010.0\n2011\n2\n3\n8\n31\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n31\n2011-08-14\n9730.0\n2060.0\n64290.0\n2011\n2\n3\n8\n32\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n32\n2011-08-21\n22780.0\n26130.0\n95070.0\n2011\n2\n3\n8\n33\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n33\n2011-08-28\n53680.0\n30360.0\n3200.0\n2011\n2\n3\n8\n34\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n34\n2011-09-04\n38360.0\n88280.0\n21170.0\n2011\n2\n3\n9\n35\nactuals\nNaN\ncategory_2_Cross Country Race\n\n\n\n\n\n\n\n\n\nVisualize the Forecast\nLet’s again use tk.plot_timeseries() to visually inspect the forecasts.\n\nPlotlyPlotnine\n\n\n\n\nCode\nfull_df_reverted \\\n .groupby('category') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n color_column = 'type',\n smooth = False,\n smooth_alpha = 0,\n facet_ncol = 2,\n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n engine = 'plotly'\n )\n\n\n\n \n\n\n\n\n\n\nCode\nfull_df_reverted \\\n .groupby('category') \\\n .plot_timeseries(\n date_column = 'order_date',\n value_column = 'total_price_sum',\n color_column = 'type',\n smooth = False,\n smooth_alpha = 0,\n facet_ncol = 2, \n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1000,\n height = 800,\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (1000 x 800)>\n\n\n\n\n\nUpon examining the graph, our models look alright given the length of time for training. Important points:\n\nFor effective time series forecasting, having multiple years of data is pivotal. This provides the model ample opportunities to recognize and adapt to seasonal variations.\nGiven our dataset spanned less than a year, the model lacked the depth of historical context to discern such patterns.\nAlthough our feature engineering was kept basic to introduce various pytimetk capabilities, there’s room for enhancement.\nFor a more refined analysis, consider experimenting with different machine learning models and diving deeper into feature engineering.\nPytimetk’s tk.augment_fourier() might assist in discerning seasonal trends, but with the dataset’s limited historical scope, capturing intricate patterns could remain a challenge." 
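As a rough, hypothetical sketch of that last idea (the argument names below follow the pattern of the other augment_* functions and are assumptions; check help(tk.augment_fourier) for the exact signature before using it):

Code
# Hypothetical sketch only: add Fourier terms to help a model pick up seasonality
# (verify parameter names with help(tk.augment_fourier) before running)
df_with_fourier = df_no_nas.augment_fourier(
    date_column = 'order_date',
    periods     = 52,   # assumed: weekly data, so roughly 52 periods per year
    max_order   = 2     # assumed: keep the first two Fourier orders
)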
+ "objectID": "guides/06_anomalize.html#plot-seasonal-decomposition", + "href": "guides/06_anomalize.html#plot-seasonal-decomposition", + "title": "Anomaly Detection", + "section": "1.3 Plot Seasonal Decomposition", + "text": "1.3 Plot Seasonal Decomposition\nWe plot the seaonal decomposition to get a visual representation;\n\n\n\n\n\n\nHelp Doc Info: plot_anomalies_decomp()\n\n\n\n\n\nUse help(tk.plot_anomalies_decomp) to review additional helpful documentation.\n\n\n\n\n# Plot seasonal decomposition\ntk.plot_anomalies_decomp(\n data = anomalize_df,\n date_column = 'date',\n engine = 'plotly',\n title = 'Seasonal Decomposition'\n)" }, { - "objectID": "performance/01_speed_comparisons.html", - "href": "performance/01_speed_comparisons.html", - "title": "Speed Comparisons", + "objectID": "guides/06_anomalize.html#plot-anomalies", + "href": "guides/06_anomalize.html#plot-anomalies", + "title": "Anomaly Detection", + "section": "1.4 Plot Anomalies", + "text": "1.4 Plot Anomalies\nNext we can plot the anomalies using tk.plot_anomalies();\n\n\n\n\n\n\nHelp Doc Info: plot_anomalies()\n\n\n\n\n\nUse help(tk.plot_anomalies) to review additional helpful documentation.\n\n\n\n\n# Plot anomalies\ntk.plot_anomalies(\n data = anomalize_df,\n date_column = 'date',\n engine = 'plotly',\n title = 'Plot Anomaly Bands'\n)" + }, + { + "objectID": "guides/06_anomalize.html#plot-cleaned-anomalies", + "href": "guides/06_anomalize.html#plot-cleaned-anomalies", + "title": "Anomaly Detection", + "section": "1.5 Plot Cleaned Anomalies", + "text": "1.5 Plot Cleaned Anomalies\nFinally we can also see a plot of the data with cleaned anomalies using plot_anomalies_cleaned();\n\n\n\n\n\n\nHelp Doc Info: plot_anomalies_cleaned()\n\n\n\n\n\nUse help(tk.plot_anomalies_cleaned) to review additional helpful documentation.\n\n\n\n\n# Plot cleaned anomalies\ntk.plot_anomalies_cleaned(\n data = anomalize_df,\n date_column = 'date'\n)" + }, + { + "objectID": "guides/06_anomalize.html#changing-parameters", + "href": "guides/06_anomalize.html#changing-parameters", + "title": "Anomaly Detection", + "section": "1.6 Changing Parameters", + "text": "1.6 Changing Parameters\nSome important parameters to hightlight in the anomalize() function include iqr_alpha.\n\n\n\n\n\n\nImportant\n\n\n\n\n\niqr_alpha controls the threshold for detecting outliers. It is the significance level used in the interquartile range (IQR) method for outlier detection. The default value is 0.05, which corresponds to a 5% significance level. A lower significance level will result in a higher threshold, which means fewer outliers will be detected. 
A higher significance level will result in a lower threshold, which means more outliers will be detected.\n\n\n\nLets visualize the effect of changing the iqr_alpha parameter;\n\nChanging iqr_alpha\nFirst, lets get a dataframe with multiple values for iqr_alpha;\n\n# Anomalized data with multiple iqr_alpha values\n\n# - Alpha values\niqr_alpha_values = [0.05, 0.10, 0.15, 0.20]\n\n# - Empty dataframes list\ndfs = []\n\nfor alpha in iqr_alpha_values:\n\n # - Run anomalize function\n anomalize_df = tk.anomalize(\n data = df,\n date_column = 'date',\n value_column = 'value',\n period = 7,\n iqr_alpha = alpha\n )\n\n # - Add the iqr_alpha column\n anomalize_df['iqr_alpha'] = f'iqr_alpha value of {alpha}'\n\n # - Append to the list\n dfs.append(anomalize_df)\n\n# - Concatenate all dataframes\nfinal_df = pd.concat(dfs)\n\nNow we can visualize the anomalies:\n\n\nVisualizing Grouped Anomalies (Facets)\n\n# Visualize\n(\n final_df\n .groupby('iqr_alpha')\n .plot_anomalies(\n date_column = 'date',\n engine = 'plotly',\n facet_ncol = 2\n )\n)\n\n\n \n\n\n\n\nVisualizing Grouped Anomalies (Plotly Dropdown)\n\n# Visualize\n(\n final_df\n .groupby('iqr_alpha')\n .plot_anomalies(\n date_column = 'date',\n engine = 'plotly',\n plotly_dropdown = True,\n plotly_dropdown_x = 1,\n plotly_dropdown_y = 0.60\n )\n)" + }, + { + "objectID": "guides/05_augmenting.html", + "href": "guides/05_augmenting.html", + "title": "Adding Features (Augmenting)", "section": "", - "text": "How this guide benefits you\n\n\n\n\n\nThis guide covers speed and performance comparisons using the polars backend.\nBeginning in version 0.2.0 of pytimetk, we introduced new polars engines to many of our functions. This is aimed at leveraging the speed benefits of polars without requiring you (the user) to learn a new data manipulation framework." + "text": "This section will cover the augment set of functions, use to add many additional time series features to a dataset. We’ll cover how to use the following set of functions" }, { - "objectID": "performance/01_speed_comparisons.html#key-benefits", - "href": "performance/01_speed_comparisons.html#key-benefits", - "title": "Speed Comparisons", - "section": "2.1 Key benefits:", - "text": "2.1 Key benefits:\n\nYou can get between 2X and 500X speed boost on many common time series operations\nYou don’t need to know how to use polars to gain massive speed boosts\nSimply turn engine = 'polars' to get the speed boost." 
+ "objectID": "guides/05_augmenting.html#basic-examples", + "href": "guides/05_augmenting.html#basic-examples", + "title": "Adding Features (Augmenting)", + "section": "1.1 Basic Examples", + "text": "1.1 Basic Examples\nAdd 1 or more lags / leads to a dataset:\n\n\nCode\n# import libraries\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\nimport random\n\n# create sample data\ndates = pd.date_range(start = '2023-09-18', end = '2023-09-24')\nvalues = [random.randint(10, 50) for _ in range(7)]\n\ndf = pd.DataFrame({\n 'date': dates,\n 'value': values\n})\n\ndf\n\n\n\n\n\n\n\n\n\ndate\nvalue\n\n\n\n\n0\n2023-09-18\n25\n\n\n1\n2023-09-19\n50\n\n\n2\n2023-09-20\n49\n\n\n3\n2023-09-21\n45\n\n\n4\n2023-09-22\n48\n\n\n5\n2023-09-23\n18\n\n\n6\n2023-09-24\n18\n\n\n\n\n\n\n\nCreate lag / lead of 3 days:\n\nLagLead\n\n\n\n\nCode\n# augment lag\ndf \\\n .augment_lags(\n date_column = 'date',\n value_column = 'value',\n lags = 3\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_lag_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\n\n\n1\n2023-09-19\n50\nNaN\n\n\n2\n2023-09-20\n49\nNaN\n\n\n3\n2023-09-21\n45\n25.0\n\n\n4\n2023-09-22\n48\n50.0\n\n\n5\n2023-09-23\n18\n49.0\n\n\n6\n2023-09-24\n18\n45.0\n\n\n\n\n\n\n\n\n\n\n\nCode\n# augment leads\ndf \\\n .augment_leads(\n date_column = 'date',\n value_column = 'value',\n leads = 3\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_lead_3\n\n\n\n\n0\n2023-09-18\n25\n45.0\n\n\n1\n2023-09-19\n50\n48.0\n\n\n2\n2023-09-20\n49\n18.0\n\n\n3\n2023-09-21\n45\n18.0\n\n\n4\n2023-09-22\n48\nNaN\n\n\n5\n2023-09-23\n18\nNaN\n\n\n6\n2023-09-24\n18\nNaN\n\n\n\n\n\n\n\n\n\n\nWe can create multiple lag / lead values for a single time series:\n\nLagLead\n\n\n\n\nCode\n# multiple lagged values for a single time series\ndf \\\n .augment_lags(\n date_column = 'date',\n value_column = 'value',\n lags = (1, 3)\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_lag_1\nvalue_lag_2\nvalue_lag_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\nNaN\nNaN\n\n\n1\n2023-09-19\n50\n25.0\nNaN\nNaN\n\n\n2\n2023-09-20\n49\n50.0\n25.0\nNaN\n\n\n3\n2023-09-21\n45\n49.0\n50.0\n25.0\n\n\n4\n2023-09-22\n48\n45.0\n49.0\n50.0\n\n\n5\n2023-09-23\n18\n48.0\n45.0\n49.0\n\n\n6\n2023-09-24\n18\n18.0\n48.0\n45.0\n\n\n\n\n\n\n\n\n\n\n\nCode\n# multiple leads values for a single time series\ndf \\\n .augment_leads(\n date_column = 'date',\n value_column = 'value',\n leads = (1, 3)\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_lead_1\nvalue_lead_2\nvalue_lead_3\n\n\n\n\n0\n2023-09-18\n25\n50.0\n49.0\n45.0\n\n\n1\n2023-09-19\n50\n49.0\n45.0\n48.0\n\n\n2\n2023-09-20\n49\n45.0\n48.0\n18.0\n\n\n3\n2023-09-21\n45\n48.0\n18.0\n18.0\n\n\n4\n2023-09-22\n48\n18.0\n18.0\nNaN\n\n\n5\n2023-09-23\n18\n18.0\nNaN\nNaN\n\n\n6\n2023-09-24\n18\nNaN\nNaN\nNaN" }, { - "objectID": "performance/01_speed_comparisons.html#what-affects-speed", - "href": "performance/01_speed_comparisons.html#what-affects-speed", - "title": "Speed Comparisons", - "section": "2.2 What affects speed?", - "text": "2.2 What affects speed?\nMany factors can affect speed. Things that are known to slow performance down:\n\nUsing non-optimized “lambda” functions. Lambda Functions are created at runtime. This process is flexible but extremely inefficient. Where possible use “built-in” or “configurable” functions instead.\nNot using polars. Polars is built on top of Rust, which is a low-level language known for performance and optimized for speed. Using polars usually speeds up computation versus Pandas." 
+ "objectID": "guides/05_augmenting.html#augment-lags-leads-for-grouped-time-series", + "href": "guides/05_augmenting.html#augment-lags-leads-for-grouped-time-series", + "title": "Adding Features (Augmenting)", + "section": "1.2 Augment Lags / Leads For Grouped Time Series", + "text": "1.2 Augment Lags / Leads For Grouped Time Series\naugment_lags() and augment_leads() also works for grouped time series data. Lets use the m4_daily_df dataset to showcase examples:\n\n\nCode\n# load m4_daily_df\nm4_daily_df = tk.load_dataset('m4_daily', parse_dates = ['date'])\n\n\n\nLagLead\n\n\n\n\nCode\n# agument lags for grouped time series\nm4_daily_df \\\n .groupby(\"id\") \\\n .augment_lags(\n date_column = 'date',\n value_column = 'value',\n lags = (1, 7)\n )\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_lag_1\nvalue_lag_2\nvalue_lag_3\nvalue_lag_4\nvalue_lag_5\nvalue_lag_6\nvalue_lag_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2076.2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n2\nD10\n2014-07-05\n2048.7\n2073.4\n2076.2\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.7\n2073.4\n2076.2\nNaN\nNaN\nNaN\nNaN\n\n\n4\nD10\n2014-07-07\n2006.4\n2048.9\n2048.7\n2073.4\n2076.2\nNaN\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9431.9\n9437.7\n9474.6\n9359.2\n9286.9\n9265.4\n9091.4\n\n\n9739\nD500\n2012-09-20\n9365.7\n9418.8\n9431.9\n9437.7\n9474.6\n9359.2\n9286.9\n9265.4\n\n\n9740\nD500\n2012-09-21\n9445.9\n9365.7\n9418.8\n9431.9\n9437.7\n9474.6\n9359.2\n9286.9\n\n\n9741\nD500\n2012-09-22\n9497.9\n9445.9\n9365.7\n9418.8\n9431.9\n9437.7\n9474.6\n9359.2\n\n\n9742\nD500\n2012-09-23\n9545.3\n9497.9\n9445.9\n9365.7\n9418.8\n9431.9\n9437.7\n9474.6\n\n\n\n\n9743 rows × 10 columns\n\n\n\n\n\n\n\nCode\n# augment leads for grouped time series\nm4_daily_df \\\n .groupby(\"id\") \\\n .augment_leads(\n date_column = 'date',\n value_column = 'value',\n leads = (1, 7)\n )\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_lead_1\nvalue_lead_2\nvalue_lead_3\nvalue_lead_4\nvalue_lead_5\nvalue_lead_6\nvalue_lead_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2073.4\n2048.7\n2048.9\n2006.4\n2017.6\n2019.1\n2007.4\n\n\n1\nD10\n2014-07-04\n2073.4\n2048.7\n2048.9\n2006.4\n2017.6\n2019.1\n2007.4\n2010.0\n\n\n2\nD10\n2014-07-05\n2048.7\n2048.9\n2006.4\n2017.6\n2019.1\n2007.4\n2010.0\n2001.5\n\n\n3\nD10\n2014-07-06\n2048.9\n2006.4\n2017.6\n2019.1\n2007.4\n2010.0\n2001.5\n1978.8\n\n\n4\nD10\n2014-07-07\n2006.4\n2017.6\n2019.1\n2007.4\n2010.0\n2001.5\n1978.8\n1988.3\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9365.7\n9445.9\n9497.9\n9545.3\nNaN\nNaN\nNaN\n\n\n9739\nD500\n2012-09-20\n9365.7\n9445.9\n9497.9\n9545.3\nNaN\nNaN\nNaN\nNaN\n\n\n9740\nD500\n2012-09-21\n9445.9\n9497.9\n9545.3\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n9741\nD500\n2012-09-22\n9497.9\n9545.3\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n9742\nD500\n2012-09-23\n9545.3\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n\n\n9743 rows × 10 columns" }, { - "objectID": "performance/01_speed_comparisons.html#summarize-by-time-summarize_by_time", - "href": "performance/01_speed_comparisons.html#summarize-by-time-summarize_by_time", - "title": "Speed Comparisons", - "section": "4.1 Summarize By Time summarize_by_time()", - "text": "4.1 Summarize By Time summarize_by_time()\n\nPolars is 13.1X faster than Pandas\n\n\n\n\n\n\n\nPolarsPandas\n\n\n\n\nCode\n%%timeit -n 10\n\ndf_pytimetk = expedia_df[['site_name', 'date_time', 'cnt', 'is_booking']] \\\n 
.groupby('site_name') \\\n .summarize_by_time(\n date_column = 'date_time',\n value_column = ['cnt', 'is_booking'],\n freq = 'W',\n agg_func = ['sum', 'count'],\n engine = 'polars'\n )\n\n# 50.8 ms ± 2.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n\n\n\n\n\n\nCode\n%%timeit -n 10\n\ndf_pytimetk = expedia_df[['site_name', 'date_time', 'cnt', 'is_booking']] \\\n .groupby('site_name') \\\n .summarize_by_time(\n date_column = 'date_time',\n value_column = ['cnt', 'is_booking'],\n freq = 'W',\n agg_func = ['sum', 'count'],\n engine = 'pandas'\n )\n\n# 668 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)" + "objectID": "guides/05_augmenting.html#basic-examples-1", + "href": "guides/05_augmenting.html#basic-examples-1", + "title": "Adding Features (Augmenting)", + "section": "2.1 Basic Examples", + "text": "2.1 Basic Examples\nWe’ll continue with the use of our sample df created earlier:\n\n\nCode\n# window = 3 days, window function = mean\ndf \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = 3,\n window_func = 'mean'\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_rolling_mean_win_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\n\n\n1\n2023-09-19\n50\nNaN\n\n\n2\n2023-09-20\n49\n41.333333\n\n\n3\n2023-09-21\n45\n48.000000\n\n\n4\n2023-09-22\n48\n47.333333\n\n\n5\n2023-09-23\n18\n37.000000\n\n\n6\n2023-09-24\n18\n28.000000\n\n\n\n\n\n\n\nIt is important to understand how the center parameter in augment_rolling() works.\n\n\n\n\n\n\ncenter\n\n\n\n\n\nWhen set to True (default) the value of the rolling window will be centered, meaning that the value at the center of the window will be used as the result. When set to False (default) the rolling window will not be centered, meaning that the value at the end of the window will be used as the result.\n\n\n\nLets see an example:\n\nAugment Rolling: Center = TrueAugment Rolling: Center = False\n\n\n\n\nCode\n# agument rolling: center = true\ndf \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = 3,\n window_func = 'mean',\n center = True\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_rolling_mean_win_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\n\n\n1\n2023-09-19\n50\n41.333333\n\n\n2\n2023-09-20\n49\n48.000000\n\n\n3\n2023-09-21\n45\n47.333333\n\n\n4\n2023-09-22\n48\n37.000000\n\n\n5\n2023-09-23\n18\n28.000000\n\n\n6\n2023-09-24\n18\nNaN\n\n\n\n\n\n\n\nNote that we are using a 3 day rolling window and applying a mean to value. In simplier terms, value_rolling_mean_win_3 is a 3 day rolling average of value with center set to True. Thus the function starts computing the mean from 2023-09-19\n\n\n\n\nCode\n# agument rolling: center = false\ndf \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = 3,\n window_func = 'mean',\n center = False\n )\n\n\n\n\n\n\n\n\n\ndate\nvalue\nvalue_rolling_mean_win_3\n\n\n\n\n0\n2023-09-18\n25\nNaN\n\n\n1\n2023-09-19\n50\nNaN\n\n\n2\n2023-09-20\n49\n41.333333\n\n\n3\n2023-09-21\n45\n48.000000\n\n\n4\n2023-09-22\n48\n47.333333\n\n\n5\n2023-09-23\n18\n37.000000\n\n\n6\n2023-09-24\n18\n28.000000\n\n\n\n\n\n\n\nNote that we are using a 3 day rolling window and applying a mean to value. In simplier terms, value_rolling_mean_win_3 is a 3 day rolling average of value with center set to False. Thus the function starts computing the mean from 2023-09-20. The same value for 2023-19-18 and 2023-09-19 are returned as value_rolling_mean_win_3 since it did not detected the third to apply the 3 day rolling average." 
}, { - "objectID": "performance/01_speed_comparisons.html#rolling-calculations-augment_rolling", - "href": "performance/01_speed_comparisons.html#rolling-calculations-augment_rolling", - "title": "Speed Comparisons", - "section": "4.2 Rolling Calculations augment_rolling()", - "text": "4.2 Rolling Calculations augment_rolling()\n\nPolars is 10.8X faster than Pandas\nPolars is 3,517X faster than Pandas with Lambdas\n\n\n\n\n\n\n\nPolarsPandasPandas (Lambda)\n\n\nUses pl_quantile() configurable function.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,10),\n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in std function\n ('quantile_75', pl_quantile(quantile=0.75)), # Configurable with all parameters found in polars.Expr.rolling_quantile\n ],\n min_periods = 1,\n engine = 'polars',\n )\n)\n# 9.81 ms ± 116 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n\n\n\n\nUses pd_quantile() configurable function.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,10),\n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n # ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # Custom quantile function\n ('quantile_75', pd_quantile(q=0.75))\n ],\n min_periods = 1,\n engine = 'pandas', # Utilize pandas for the underlying computations\n show_progress = False,\n )\n)\n\n# 106 ms ± 2.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n\n\n\n\nUses lambda x: pd.Series(x).quantile(0.75). Lambda functions are extremely inefficient.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,10),\n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # lambda slows things down\n ],\n min_periods = 1,\n engine = 'pandas', \n show_progress = False,\n )\n)\n\n# 34.5 s ± 236 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)" + "objectID": "guides/05_augmenting.html#augment-rolling-with-multiple-windows-and-window-functions", + "href": "guides/05_augmenting.html#augment-rolling-with-multiple-windows-and-window-functions", + "title": "Adding Features (Augmenting)", + "section": "2.2 Augment Rolling with Multiple Windows and Window Functions", + "text": "2.2 Augment Rolling with Multiple Windows and Window Functions\nMultiple window functions can be passed to the window and window_func parameters:\n\n\nCode\n# augment rolling: window of 2 & 7 days, window_func of mean and standard deviation\nm4_daily_df \\\n .query('id == \"D10\"') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = [2,7],\n window_func = ['mean', ('std', lambda x: x.std())]\n )\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_7\nvalue_rolling_std_win_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\nNaN\nNaN\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.80\n1.40\n2074.800000\n1.400000\n\n\n2\nD10\n2014-07-05\n2048.7\n2061.05\n12.35\n2066.100000\n12.356645\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.80\n0.10\n2061.800000\n13.037830\n\n\n4\nD10\n2014-07-07\n2006.4\n2027.65\n21.25\n2050.720000\n25.041038\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n669\nD10\n2016-05-02\n2630.7\n2615.85\n14.85\n2579.471429\n28.868159\n\n\n670\nD10\n2016-05-03\n2649.3\n2640.00\n9.30\n2594.800000\n33.081631\n\n\n671\nD10\n2016-05-04\n2631.8\n2640.55\n8.75\n2601.371429\n35.145563\n\n\n672\nD10\n2016-05-05\n2622.5\n2627.15\n4.65\n2607.457143\n34.584508\n\n\n673\nD10\n2016-05-06\n2620.1\n2621.30\n1.20\n2618.328571\n22.923270\n\n\n\n\n674 rows × 7 columns" }, { - "objectID": "performance/01_speed_comparisons.html#augment-expanding-augment_expanding", - "href": "performance/01_speed_comparisons.html#augment-expanding-augment_expanding", - "title": "Speed Comparisons", - "section": "4.3 Augment Expanding augment_expanding()", - "text": "4.3 Augment Expanding augment_expanding()\n\nPolars is 3X faster than Pandas with built-in and configurable functions\nPolars is 515X faster than Pandas with lambda functions\n\n\n\n\n\n\n\nPolarsPandasPandas (Lambda)\n\n\nUses pl_quantile() configurable function.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in std function\n ('quantile_75', pl_quantile(quantile=0.75)), # Configurable with all parameters found in polars.Expr.rolling_quantile\n ],\n min_periods = 1,\n engine = 'polars',\n )\n)\n# 6.95 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n\n\n\n\nUses pd_quantile() configurable function.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n # ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # Custom quantile function\n ('quantile_75', pd_quantile(q=0.75))\n ],\n min_periods = 1,\n engine = 'pandas', # Utilize pandas for the underlying computations\n )\n)\n\n# 20.8 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n\n\n\n\nUses lambda x: pd.Series(x).quantile(0.75). 
Lambda functions are extremely inefficient.\n\n\nCode\n%%timeit\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # lambda slows things down\n ],\n min_periods = 1,\n engine = 'pandas', \n )\n)\n\n# 3.58 s ± 110 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)" + "objectID": "guides/05_augmenting.html#augment-rolling-with-grouped-time-series", + "href": "guides/05_augmenting.html#augment-rolling-with-grouped-time-series", + "title": "Adding Features (Augmenting)", + "section": "2.3 Augment Rolling with Grouped Time Series", + "text": "2.3 Augment Rolling with Grouped Time Series\nagument_rolling can be used on grouped time series data:\n\n\nCode\n## augment rolling on grouped time series: window of 2 & 7 days, window_func of mean and standard deviation\nm4_daily_df \\\n .groupby('id') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'value',\n window = [2,7],\n window_func = ['mean', ('std', lambda x: x.std())]\n )\n\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_7\nvalue_rolling_std_win_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\nNaN\nNaN\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.80\n1.40\n2074.800000\n1.400000\n\n\n2\nD10\n2014-07-05\n2048.7\n2061.05\n12.35\n2066.100000\n12.356645\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.80\n0.10\n2061.800000\n13.037830\n\n\n4\nD10\n2014-07-07\n2006.4\n2027.65\n21.25\n2050.720000\n25.041038\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9425.35\n6.55\n9382.071429\n74.335988\n\n\n9739\nD500\n2012-09-20\n9365.7\n9392.25\n26.55\n9396.400000\n58.431303\n\n\n9740\nD500\n2012-09-21\n9445.9\n9405.80\n40.10\n9419.114286\n39.184451\n\n\n9741\nD500\n2012-09-22\n9497.9\n9471.90\n26.00\n9438.928571\n38.945336\n\n\n9742\nD500\n2012-09-23\n9545.3\n9521.60\n23.70\n9449.028571\n53.379416\n\n\n\n\n9743 rows × 7 columns" }, { - "objectID": "performance/01_speed_comparisons.html#augment-lags-augment_lags", - "href": "performance/01_speed_comparisons.html#augment-lags-augment_lags", - "title": "Speed Comparisons", - "section": "4.4 Augment Lags augment_lags()", - "text": "4.4 Augment Lags augment_lags()\n\nPolars is 1.9X faster than Pandas\nSpeed improvement of Polars (vs Pandas) increases with number of lags\n\n\n\n\n\n\n\nPolarsPandas\n\n\n\n\nCode\n%%timeit -n 25\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_lags(\n date_column = 'date', \n value_column = 'value', \n lags = (2,30),\n engine = 'polars', \n )\n)\n\n# 37.7 ms ± 1.57 ms per loop (mean ± std. dev. of 7 runs, 25 loops each)\n\n\n\n\n\n\nCode\n%%timeit -n 25\n\nexpanded_df = (\n m4_daily_df\n .groupby('id')\n .augment_lags(\n date_column = 'date', \n value_column = 'value', \n lags = (2,30),\n engine = 'pandas', \n )\n)\n\n# 73.3 ms ± 3.29 ms per loop (mean ± std. dev. 
of 7 runs, 25 loops each)" + "objectID": "guides/05_augmenting.html#basic-example", + "href": "guides/05_augmenting.html#basic-example", + "title": "Adding Features (Augmenting)", + "section": "3.1 Basic Example", + "text": "3.1 Basic Example\nWe’ll showcase an example using the m4_daily_df dataset by generating 29 additional features from the date column:\n\n\nCode\n# augment time series signature\nm4_daily_df \\\n .query('id == \"D10\"') \\\n .augment_timeseries_signature(\n date_column = 'date'\n ) \\\n .head()\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\ndate_index_num\ndate_year\ndate_year_iso\ndate_yearstart\ndate_yearend\ndate_leapyear\ndate_half\n...\ndate_mday\ndate_qday\ndate_yday\ndate_weekend\ndate_hour\ndate_minute\ndate_second\ndate_msecond\ndate_nsecond\ndate_am_pm\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n1404345600\n2014\n2014\n0\n0\n0\n2\n...\n3\n3\n184\n0\n0\n0\n0\n0\n0\nam\n\n\n1\nD10\n2014-07-04\n2073.4\n1404432000\n2014\n2014\n0\n0\n0\n2\n...\n4\n4\n185\n0\n0\n0\n0\n0\n0\nam\n\n\n2\nD10\n2014-07-05\n2048.7\n1404518400\n2014\n2014\n0\n0\n0\n2\n...\n5\n5\n186\n0\n0\n0\n0\n0\n0\nam\n\n\n3\nD10\n2014-07-06\n2048.9\n1404604800\n2014\n2014\n0\n0\n0\n2\n...\n6\n6\n187\n1\n0\n0\n0\n0\n0\nam\n\n\n4\nD10\n2014-07-07\n2006.4\n1404691200\n2014\n2014\n0\n0\n0\n2\n...\n7\n7\n188\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n5 rows × 32 columns" + }, + { + "objectID": "guides/05_augmenting.html#basic-example-1", + "href": "guides/05_augmenting.html#basic-example-1", + "title": "Adding Features (Augmenting)", + "section": "4.1 Basic Example", + "text": "4.1 Basic Example\nWe’ll showcase an example using some sample data:\n\n\nCode\n# create sample data\ndates = pd.date_range(start = '2022-12-25', end = '2023-01-05')\n\ndf = pd.DataFrame({'date': dates})\n\n# augment time series signature: USA\ndf \\\n .augment_holiday_signature(\n date_column = 'date',\n country_name = 'UnitedStates'\n )\n\n\n\n\n\n\n\n\n\ndate\nis_holiday\nbefore_holiday\nafter_holiday\nholiday_name\n\n\n\n\n0\n2022-12-25\n1\n1\n0\nChristmas Day\n\n\n1\n2022-12-26\n1\n0\n1\nChristmas Day (Observed)\n\n\n2\n2022-12-27\n0\n0\n1\nNaN\n\n\n3\n2022-12-28\n0\n0\n0\nNaN\n\n\n4\n2022-12-29\n0\n0\n0\nNaN\n\n\n5\n2022-12-30\n0\n0\n0\nNaN\n\n\n6\n2022-12-31\n0\n1\n0\nNaN\n\n\n7\n2023-01-01\n1\n1\n0\nNew Year's Day\n\n\n8\n2023-01-02\n1\n0\n1\nNew Year's Day (Observed)\n\n\n9\n2023-01-03\n0\n0\n1\nNaN\n\n\n10\n2023-01-04\n0\n0\n0\nNaN\n\n\n11\n2023-01-05\n0\n0\n0\nNaN" + }, + { + "objectID": "guides/05_augmenting.html#basic-example-2", + "href": "guides/05_augmenting.html#basic-example-2", + "title": "Adding Features (Augmenting)", + "section": "5.1 Basic Example", + "text": "5.1 Basic Example\n\n\nCode\n# augment fourier with 7 periods and max order of 1\n#m4_daily_df \\\n# .query('id == \"D10\"') \\\n# .augment_fourier(\n# date_column = 'date',\n# value_column = 'value',\n# num_periods = 7,\n# max_order = 1\n# ) \\\n# .head(20)\n\n\nNotice the additional value_fourier_1_1 to value_fourier_1_7 colums that have been added to the data." 
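Note that the 5.1 code block above is entirely commented out, so the value_fourier_1_1 to value_fourier_1_7 columns it refers to are never actually created. Below is an uncommented sketch of the same call, reusing the exact parameters from the grouped example in 5.2; treat it as illustrative, since argument names may differ across pytimetk versions.

Code
import pytimetk as tk

m4_daily_df = tk.load_dataset('m4_daily', parse_dates = ['date'])

# augment fourier with 7 periods and max order of 1 (single series, id D10)
m4_daily_df \
    .query('id == "D10"') \
    .augment_fourier(
        date_column = 'date',
        value_column = 'value',
        num_periods = 7,
        max_order = 1
    ) \
    .head(20)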
+ }, + { + "objectID": "guides/05_augmenting.html#augment-fourier-with-grouped-time-series", + "href": "guides/05_augmenting.html#augment-fourier-with-grouped-time-series", + "title": "Adding Features (Augmenting)", + "section": "5.2 Augment Fourier with Grouped Time Series", + "text": "5.2 Augment Fourier with Grouped Time Series\naugment_fourier also works with grouped time series:\n\n\nCode\n# augment fourier with grouped time series\nm4_daily_df \\\n .groupby('id') \\\n .augment_fourier(\n date_column = 'date',\n value_column = 'value',\n num_periods = 7,\n max_order = 1\n ) \\\n .head(20)\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_fourier_1_1\nvalue_fourier_1_2\nvalue_fourier_1_3\nvalue_fourier_1_4\nvalue_fourier_1_5\nvalue_fourier_1_6\nvalue_fourier_1_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n0.394510\n-0.725024\n0.937927\n-0.998682\n0.897435\n-0.650609\n0.298243\n\n\n1\nD10\n2014-07-04\n2073.4\n-0.980653\n0.383931\n0.830342\n-0.709015\n-0.552759\n0.925423\n0.190450\n\n\n2\nD10\n2014-07-05\n2048.7\n0.011484\n0.022967\n0.034446\n0.045921\n0.057390\n0.068852\n0.080304\n\n\n3\nD10\n2014-07-06\n2048.9\n0.975899\n-0.425928\n-0.790004\n0.770723\n0.453624\n-0.968706\n-0.030835\n\n\n4\nD10\n2014-07-07\n2006.4\n-0.415510\n0.755886\n-0.959581\n0.989762\n-0.840972\n0.540115\n-0.141593\n\n\n5\nD10\n2014-07-08\n2017.6\n-0.803876\n-0.956286\n-0.333715\n0.559301\n0.999055\n0.629169\n-0.250600\n\n\n6\nD10\n2014-07-09\n2019.1\n0.748318\n0.992779\n0.568784\n-0.238184\n-0.884778\n-0.935635\n-0.356511\n\n\n7\nD10\n2014-07-10\n2007.4\n0.494070\n-0.859111\n0.999790\n-0.879368\n0.529294\n-0.040992\n-0.458015\n\n\n8\nD10\n2014-07-11\n2010.0\n-0.952864\n0.578192\n0.602021\n-0.943494\n-0.029515\n0.961404\n-0.553858\n\n\n9\nD10\n2014-07-12\n2001.5\n-0.099581\n-0.198171\n-0.294792\n-0.388482\n-0.478310\n-0.563384\n-0.642856\n\n\n10\nD10\n2014-07-13\n1978.8\n0.994091\n-0.215816\n-0.947238\n0.421459\n0.855740\n-0.607239\n-0.723909\n\n\n11\nD10\n2014-07-14\n1988.3\n-0.311977\n0.592812\n-0.814472\n0.954831\n-0.999879\n0.945118\n-0.796015\n\n\n12\nD10\n2014-07-15\n2000.7\n-0.864932\n-0.868201\n-0.006551\n0.861625\n0.871433\n0.013101\n-0.858282\n\n\n13\nD10\n2014-07-16\n2010.5\n0.670062\n0.994781\n0.806801\n0.203005\n-0.505418\n-0.953354\n-0.909941\n\n\n14\nD10\n2014-07-17\n2014.5\n0.587524\n-0.950856\n0.951356\n-0.588831\n0.001617\n0.586214\n-0.950354\n\n\n15\nD10\n2014-07-18\n1962.6\n-0.913299\n0.743956\n0.307286\n-0.994265\n0.502625\n0.584837\n-0.979022\n\n\n16\nD10\n2014-07-19\n1948.0\n-0.209415\n-0.409542\n-0.591509\n-0.747244\n-0.869842\n-0.953865\n-0.995589\n\n\n17\nD10\n2014-07-20\n1943.0\n0.999997\n0.004934\n-0.999973\n-0.009867\n0.999924\n0.014800\n-0.999851\n\n\n18\nD10\n2014-07-21\n1933.3\n-0.204588\n0.400521\n-0.579511\n0.733985\n-0.857409\n0.944561\n-0.991756\n\n\n19\nD10\n2014-07-22\n1891.0\n-0.915297\n-0.737326\n0.321336\n0.996182\n0.481148\n-0.608588\n-0.971403" + }, + { + "objectID": "tutorials/06_correlationfunnel.html", + "href": "tutorials/06_correlationfunnel.html", + "title": "Correlation Funnel", + "section": "", + "text": "We will demonstrate how Correlation Funnel to analyze Expedia Hotel Bookings and which features correlate to a customer making a booking through their website:\n\n\n\nCorrelation Funnel" + }, + { + "objectID": "tutorials/06_correlationfunnel.html#setup", + "href": "tutorials/06_correlationfunnel.html#setup", + "title": "Correlation Funnel", + "section": "3.1 Setup", + "text": "3.1 Setup\nTo set up, import the following packages and the expedia_df dataset, Expedia Hotel Time 
Series Dataset.\n\n# Libraries\nimport pandas as pd \nimport pytimetk as tk\n\n# Data\nexpedia_df = tk.load_dataset(\"expedia\", parse_dates = ['date_time'])\nexpedia_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 100000 rows of 24 columns\ndate_time: datetime64[ns] [Timestamp('2013-07-25 17: ...\nsite_name: int64 [2, 2, 2, 2, 2, 37, 2, 2, ...\nposa_continent: int64 [3, 3, 3, 3, 3, 1, 3, 3, 3 ...\nuser_location_country: int64 [66, 66, 66, 66, 66, 69, 6 ...\nuser_location_region: int64 [174, 174, 174, 220, 351, ...\nuser_location_city: int64 [35675, 31320, 16292, 1760 ...\norig_destination_distance: float64 [0.1203, 108.2251, 763.142 ...\nuser_id: int64 [44735, 794319, 761732, 69 ...\nis_mobile: int64 [0, 0, 1, 0, 0, 0, 0, 0, 0 ...\nis_package: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0 ...\nchannel: int64 [9, 3, 1, 9, 1, 9, 9, 9, 9 ...\nsrch_ci: object ['2013-07-26', '2014-11-27 ...\nsrch_co: object ['2013-07-27', '2014-11-29 ...\nsrch_adults_cnt: int64 [1, 2, 2, 2, 2, 2, 2, 2, 2 ...\nsrch_children_cnt: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0 ...\nsrch_rm_cnt: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1 ...\nsrch_destination_id: int64 [5465, 11620, 23808, 40658 ...\nsrch_destination_type_id: int64 [3, 1, 6, 5, 1, 6, 1, 5, 6 ...\nis_booking: int64 [1, 0, 0, 0, 0, 0, 0, 0, 0 ...\ncnt: int64 [1, 2, 3, 1, 2, 7, 1, 1, 1 ...\nhotel_continent: int64 [2, 2, 2, 2, 2, 6, 4, 2, 4 ...\nhotel_country: int64 [50, 50, 50, 50, 50, 204, ...\nhotel_market: int64 [1230, 369, 1144, 930, 637 ...\nhotel_cluster: int64 [47, 83, 93, 48, 33, 15, 9 ..." + }, + { + "objectID": "tutorials/06_correlationfunnel.html#data-preparation", + "href": "tutorials/06_correlationfunnel.html#data-preparation", + "title": "Correlation Funnel", + "section": "3.2 Data Preparation", + "text": "3.2 Data Preparation\nTo prepare the dataset, we will first perform data preparation:\n\nAdd time series features based on the date_time timestamp column.\nWe will drop any zero variance features\nDrop additional columns that are not an acceptable data type (i.e. 
not numeric, categorical, or string) or contain missing values\nConvert numeric columns that start with “hotel_” that are actually categorical “ID” columns to string\n\n\nexpedia_ts_features_df = expedia_df \\\n .augment_timeseries_signature('date_time') \\\n .drop_zero_variance() \\\n .drop(columns=['date_time', 'orig_destination_distance', 'srch_ci', 'srch_co']) \\\n .transform_columns(\n columns = [r\"hotel_.*\"],\n transform_func = lambda x: x.astype(str)\n )\n \nexpedia_ts_features_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 100000 rows of 46 columns\nsite_name: int64 [2, 2, 2, 2, 2, 37, 2, 2, 2 ...\nposa_continent: int64 [3, 3, 3, 3, 3, 1, 3, 3, 3, ...\nuser_location_country: int64 [66, 66, 66, 66, 66, 69, 66 ...\nuser_location_region: int64 [174, 174, 174, 220, 351, 7 ...\nuser_location_city: int64 [35675, 31320, 16292, 17605 ...\nuser_id: int64 [44735, 794319, 761732, 696 ...\nis_mobile: int64 [0, 0, 1, 0, 0, 0, 0, 0, 0, ...\nis_package: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nchannel: int64 [9, 3, 1, 9, 1, 9, 9, 9, 9, ...\nsrch_adults_cnt: int64 [1, 2, 2, 2, 2, 2, 2, 2, 2, ...\nsrch_children_cnt: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nsrch_rm_cnt: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nsrch_destination_id: int64 [5465, 11620, 23808, 40658, ...\nsrch_destination_type_id: int64 [3, 1, 6, 5, 1, 6, 1, 5, 6, ...\nis_booking: int64 [1, 0, 0, 0, 0, 0, 0, 0, 0, ...\ncnt: int64 [1, 2, 3, 1, 2, 7, 1, 1, 1, ...\nhotel_continent: object ['2', '2', '2', '2', '2', ' ...\nhotel_country: object ['50', '50', '50', '50', '5 ...\nhotel_market: object ['1230', '369', '1144', '93 ...\nhotel_cluster: object ['47', '83', '93', '48', '3 ...\ndate_time_index_num: int64 [1374773055, 1414939784, 14 ...\ndate_time_year: int64 [2013, 2014, 2014, 2014, 20 ...\ndate_time_year_iso: UInt32 [2013, 2014, 2014, 2014, 20 ...\ndate_time_yearstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_yearend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_half: int64 [2, 2, 1, 1, 2, 2, 1, 2, 1, ...\ndate_time_quarter: int64 [3, 4, 2, 1, 3, 4, 1, 3, 2, ...\ndate_time_quarteryear: object ['2013Q3', '2014Q4', '2014Q ...\ndate_time_quarterstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_quarterend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_month: int64 [7, 11, 5, 2, 8, 12, 3, 9, ...\ndate_time_month_lbl: object ['July', 'November', 'May', ...\ndate_time_monthstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_monthend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_yweek: UInt32 [30, 44, 21, 9, 33, 50, 12, ...\ndate_time_mweek: int64 [4, 1, 4, 4, 2, 2, 3, 3, 2, ...\ndate_time_wday: int64 [4, 7, 4, 3, 3, 2, 2, 1, 4, ...\ndate_time_wday_lbl: object ['Thursday', 'Sunday', 'Thu ...\ndate_time_mday: int64 [25, 2, 22, 26, 13, 9, 18, ...\ndate_time_qday: int64 [25, 33, 52, 57, 44, 70, 77 ...\ndate_time_yday: int64 [206, 306, 142, 57, 225, 34 ...\ndate_time_weekend: int64 [0, 1, 0, 0, 0, 0, 0, 0, 0, ...\ndate_time_hour: int64 [17, 14, 12, 14, 11, 7, 21, ...\ndate_time_minute: int64 [24, 49, 50, 1, 15, 21, 40, ...\ndate_time_second: int64 [15, 44, 53, 2, 40, 31, 29, ...\ndate_time_am_pm: object ['pm', 'pm', 'am', 'pm', 'a ..." 
+ }, + { + "objectID": "tutorials/06_correlationfunnel.html#step-correlation-funnel-workflow", + "href": "tutorials/06_correlationfunnel.html#step-correlation-funnel-workflow", + "title": "Correlation Funnel", + "section": "3.3 3-Step Correlation Funnel Workflow", + "text": "3.3 3-Step Correlation Funnel Workflow\nNext, we will perform the Correlation Funnel workflow to explore the Expedia Hotel Time Series dataset. There are 3 steps:\n\nBinarize: Convert the data to binary 0/1\nCorrelate: Detect relationships between the binary features and one of the columns (called the target)\nVisualize the Correlation Funnel: Plotting allows us to assess the top features and their relationship to the target.\n\n\nStep 1: Binarize\nUse binarize() to convert the raw data to binary 0/1. Binarization happens as follows:\n\nNumeric Data: Numeric data is Quantile Binned using the pd.qcut() function. The default is 4 bins, which bins numeric data into a maximum of 4 discrete bins. Fewer bins can be returned if there is insufficient data for 4 bins. The number of bins is controlled with the n_bins parameter.\nCategorical / String Data: Categorical data is first processed to determine the most frequent categories. Categories that are sparse are lumped into an “OTHER” category. The lumping can be controlled with the thresh_infreq.\n\n\nexpedia_ts_binarized_df = expedia_ts_features_df.binarize(thresh_infreq = 0.05)\n\nexpedia_ts_binarized_df.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 100000 rows of 155 columns\nsite_name__2.0_15.0: uint8 [1, 1 ...\nsite_name__15.0_53.0: uint8 [0, 0 ...\nuser_location_country__0.0_66.0: uint8 [1, 1 ...\nuser_location_country__66.0_71.0: uint8 [0, 0 ...\nuser_location_country__71.0_239.0: uint8 [0, 0 ...\nuser_location_region__0.0_174.0: uint8 [1, 1 ...\nuser_location_region__174.0_314.0: uint8 [0, 0 ...\nuser_location_region__314.0_385.0: uint8 [0, 0 ...\nuser_location_region__385.0_1021.0: uint8 [0, 0 ...\nuser_location_city__0.0_13087.0: uint8 [0, 0 ...\nuser_location_city__13087.0_27655.0: uint8 [0, 0 ...\nuser_location_city__27655.0_42563.0: uint8 [1, 1 ...\nuser_location_city__42563.0_56507.0: uint8 [0, 0 ...\nuser_id__13.0_299759.8: uint8 [1, 0 ...\nuser_id__299759.8_605161.5: uint8 [0, 0 ...\nuser_id__605161.5_911811.5: uint8 [0, 1 ...\nuser_id__911811.5_1198780.0: uint8 [0, 0 ...\nchannel__0.0_2.0: uint8 [0, 0 ...\nchannel__2.0_9.0: uint8 [1, 1 ...\nchannel__9.0_10.0: uint8 [0, 0 ...\nsrch_adults_cnt__0.0_2.0: uint8 [1, 1 ...\nsrch_adults_cnt__2.0_9.0: uint8 [0, 0 ...\nsrch_children_cnt__0.0_9.0: uint8 [1, 1 ...\nsrch_rm_cnt__0.0_1.0: uint8 [1, 1 ...\nsrch_rm_cnt__1.0_8.0: uint8 [0, 0 ...\nsrch_destination_id__1.0_8267.0: uint8 [1, 0 ...\nsrch_destination_id__8267.0_9147.0: uint8 [0, 0 ...\nsrch_destination_id__9147.0_18998.0: uint8 [0, 1 ...\nsrch_destination_id__18998.0_65104.0: uint8 [0, 0 ...\nsrch_destination_type_id__1.0_5.0: uint8 [1, 1 ...\nsrch_destination_type_id__5.0_9.0: uint8 [0, 0 ...\ncnt__1.0_2.0: uint8 [1, 1 ...\ncnt__2.0_72.0: uint8 [0, 0 ...\ndate_time_index_num__1357516842.0_1382867237.5: uint8 [1, 0 ...\ndate_time_index_num__1382867237.5_1401387689.0: uint8 [0, 0 ...\ndate_time_index_num__1401387689.0_1410981206.0: uint8 [0, 0 ...\ndate_time_index_num__1410981206.0_1420070302.0: uint8 [0, 1 ...\ndate_time_month__1.0_5.0: uint8 [0, 0 ...\ndate_time_month__5.0_7.0: uint8 [1, 0 ...\ndate_time_month__7.0_10.0: uint8 [0, 0 ...\ndate_time_month__10.0_12.0: uint8 [0, 1 ...\ndate_time_yweek__1.0_17.0: uint8 [0, 0 ...\ndate_time_yweek__17.0_30.0: 
uint8 [1, 0 ...\ndate_time_yweek__30.0_41.0: uint8 [0, 0 ...\ndate_time_yweek__41.0_52.0: uint8 [0, 1 ...\ndate_time_mday__1.0_8.0: uint8 [0, 1 ...\ndate_time_mday__8.0_16.0: uint8 [0, 0 ...\ndate_time_mday__16.0_23.0: uint8 [0, 0 ...\ndate_time_mday__23.0_31.0: uint8 [1, 0 ...\ndate_time_qday__1.0_24.0: uint8 [0, 0 ...\ndate_time_qday__24.0_48.0: uint8 [1, 1 ...\ndate_time_qday__48.0_70.0: uint8 [0, 0 ...\ndate_time_qday__70.0_92.0: uint8 [0, 0 ...\ndate_time_yday__1.0_121.0: uint8 [0, 0 ...\ndate_time_yday__121.0_209.0: uint8 [1, 0 ...\ndate_time_yday__209.0_286.0: uint8 [0, 0 ...\ndate_time_yday__286.0_365.0: uint8 [0, 1 ...\ndate_time_hour__0.0_10.0: uint8 [0, 0 ...\ndate_time_hour__10.0_14.0: uint8 [0, 1 ...\ndate_time_hour__14.0_18.0: uint8 [1, 0 ...\ndate_time_hour__18.0_23.0: uint8 [0, 0 ...\ndate_time_minute__0.0_15.0: uint8 [0, 0 ...\ndate_time_minute__15.0_30.0: uint8 [1, 0 ...\ndate_time_minute__30.0_45.0: uint8 [0, 0 ...\ndate_time_minute__45.0_59.0: uint8 [0, 1 ...\ndate_time_second__0.0_15.0: uint8 [1, 0 ...\ndate_time_second__15.0_30.0: uint8 [0, 0 ...\ndate_time_second__30.0_45.0: uint8 [0, 1 ...\ndate_time_second__45.0_59.0: uint8 [0, 0 ...\nposa_continent__1: uint8 [0, 0 ...\nposa_continent__2: uint8 [0, 0 ...\nposa_continent__3: uint8 [1, 1 ...\nposa_continent__-OTHER: uint8 [0, 0 ...\nis_mobile__0: uint8 [1, 1 ...\nis_mobile__1: uint8 [0, 0 ...\nis_package__0: uint8 [1, 1 ...\nis_package__1: uint8 [0, 0 ...\nis_booking__0: uint8 [0, 1 ...\nis_booking__1: uint8 [1, 0 ...\nhotel_continent__-OTHER: uint8 [0, 0 ...\nhotel_continent__2: uint8 [1, 1 ...\nhotel_continent__3: uint8 [0, 0 ...\nhotel_continent__4: uint8 [0, 0 ...\nhotel_continent__6: uint8 [0, 0 ...\nhotel_country__-OTHER: uint8 [0, 0 ...\nhotel_country__50: uint8 [1, 1 ...\nhotel_country__8: uint8 [0, 0 ...\nhotel_market__-OTHER: uint8 [1, 1 ...\nhotel_cluster__-OTHER: uint8 [1, 1 ...\ndate_time_year__2013: uint8 [1, 0 ...\ndate_time_year__2014: uint8 [0, 1 ...\ndate_time_year_iso__2013: uint8 [1, 0 ...\ndate_time_year_iso__2014: uint8 [0, 1 ...\ndate_time_year_iso__-OTHER: uint8 [0, 0 ...\ndate_time_yearstart__0: uint8 [1, 1 ...\ndate_time_yearstart__-OTHER: uint8 [0, 0 ...\ndate_time_yearend__0: uint8 [1, 1 ...\ndate_time_yearend__-OTHER: uint8 [0, 0 ...\ndate_time_half__1: uint8 [0, 0 ...\ndate_time_half__2: uint8 [1, 1 ...\ndate_time_quarter__1: uint8 [0, 0 ...\ndate_time_quarter__2: uint8 [0, 0 ...\ndate_time_quarter__3: uint8 [1, 0 ...\ndate_time_quarter__4: uint8 [0, 1 ...\ndate_time_quarteryear__2013Q1: uint8 [0, 0 ...\ndate_time_quarteryear__2013Q2: uint8 [0, 0 ...\ndate_time_quarteryear__2013Q3: uint8 [1, 0 ...\ndate_time_quarteryear__2013Q4: uint8 [0, 0 ...\ndate_time_quarteryear__2014Q1: uint8 [0, 0 ...\ndate_time_quarteryear__2014Q2: uint8 [0, 0 ...\ndate_time_quarteryear__2014Q3: uint8 [0, 0 ...\ndate_time_quarteryear__2014Q4: uint8 [0, 1 ...\ndate_time_quarterstart__0: uint8 [1, 1 ...\ndate_time_quarterstart__-OTHER: uint8 [0, 0 ...\ndate_time_quarterend__0: uint8 [1, 1 ...\ndate_time_quarterend__-OTHER: uint8 [0, 0 ...\ndate_time_month_lbl__April: uint8 [0, 0 ...\ndate_time_month_lbl__August: uint8 [0, 0 ...\ndate_time_month_lbl__December: uint8 [0, 0 ...\ndate_time_month_lbl__February: uint8 [0, 0 ...\ndate_time_month_lbl__January: uint8 [0, 0 ...\ndate_time_month_lbl__July: uint8 [1, 0 ...\ndate_time_month_lbl__June: uint8 [0, 0 ...\ndate_time_month_lbl__March: uint8 [0, 0 ...\ndate_time_month_lbl__May: uint8 [0, 0 ...\ndate_time_month_lbl__November: uint8 [0, 1 
...\ndate_time_month_lbl__October: uint8 [0, 0 ...\ndate_time_month_lbl__September: uint8 [0, 0 ...\ndate_time_monthstart__0: uint8 [1, 1 ...\ndate_time_monthstart__-OTHER: uint8 [0, 0 ...\ndate_time_monthend__0: uint8 [1, 1 ...\ndate_time_monthend__-OTHER: uint8 [0, 0 ...\ndate_time_mweek__1: uint8 [0, 1 ...\ndate_time_mweek__2: uint8 [0, 0 ...\ndate_time_mweek__3: uint8 [0, 0 ...\ndate_time_mweek__4: uint8 [1, 0 ...\ndate_time_mweek__5: uint8 [0, 0 ...\ndate_time_wday__1: uint8 [0, 0 ...\ndate_time_wday__2: uint8 [0, 0 ...\ndate_time_wday__3: uint8 [0, 0 ...\ndate_time_wday__4: uint8 [1, 0 ...\ndate_time_wday__5: uint8 [0, 0 ...\ndate_time_wday__6: uint8 [0, 0 ...\ndate_time_wday__7: uint8 [0, 1 ...\ndate_time_wday_lbl__Friday: uint8 [0, 0 ...\ndate_time_wday_lbl__Monday: uint8 [0, 0 ...\ndate_time_wday_lbl__Saturday: uint8 [0, 0 ...\ndate_time_wday_lbl__Sunday: uint8 [0, 1 ...\ndate_time_wday_lbl__Thursday: uint8 [1, 0 ...\ndate_time_wday_lbl__Tuesday: uint8 [0, 0 ...\ndate_time_wday_lbl__Wednesday: uint8 [0, 0 ...\ndate_time_weekend__0: uint8 [1, 0 ...\ndate_time_weekend__1: uint8 [0, 1 ...\ndate_time_am_pm__am: uint8 [0, 0 ...\ndate_time_am_pm__pm: uint8 [1, 1 ...\n\n\n\n\nStep 2: Correlate the data\nNext, we use correlate() to calculate strength of the relationship. The main parameter is target, which should be selected based on the business goal.\nIn this case, we can create a business goal to understand what relates to a website visit count greater than 2. We will select the column: is_booking__1 as the target. This is because we want to know what relates to a hotel room booking via the website search data.\nThis returns a 3 column data frame containing:\n\nfeature: The name of the features\nbin: The bin that corresponds to a bin inside the features\ncorrelation: The strength of the relationship (0 to 1) and the direction of the relationship (+/-)\n\n\nexpedia_ts_correlate_df = expedia_ts_binarized_df.correlate('is_booking__1')\n\nexpedia_ts_correlate_df\n\n\n\n\n\n\n\n\nfeature\nbin\ncorrelation\n\n\n\n\n77\nis_booking\n0\n-1.000000\n\n\n78\nis_booking\n1\n1.000000\n\n\n32\ncnt\n2.0_72.0\n-0.099372\n\n\n31\ncnt\n1.0_2.0\n0.099372\n\n\n75\nis_package\n0\n0.075930\n\n\n...\n...\n...\n...\n\n\n131\ndate_time_monthend\n-OTHER\n0.000182\n\n\n108\ndate_time_quarteryear\n2014Q1\n-0.000041\n\n\n22\nsrch_children_cnt\n0.0_9.0\nNaN\n\n\n87\nhotel_market\n-OTHER\nNaN\n\n\n88\nhotel_cluster\n-OTHER\nNaN\n\n\n\n\n155 rows × 3 columns\n\n\n\n\n\nStep 3: Plot the Correlation funnel\nIt’s in this step where we can visualize review the correlations and determine which features relate to the target, the strength of the relationship (magnitude between 0 and 1), and the direction of the relationship (+/-).\n\nexpedia_ts_correlate_df.plot_correlation_funnel(\n engine = 'plotly',\n height = 800\n)" + }, + { + "objectID": "tutorials/03_demand_forecasting.html", + "href": "tutorials/03_demand_forecasting.html", + "title": "Demand Forecasting", + "section": "", + "text": "Timetk enables you to generate features from the time column of your data very easily. This tutorial showcases how easy it is to perform time series forecasting with pytimetk. 
The specific methods we will be using are:" + }, + { + "objectID": "tutorials/03_demand_forecasting.html#load-packages", + "href": "tutorials/03_demand_forecasting.html#load-packages", + "title": "Demand Forecasting", + "section": "1.1 Load Packages", + "text": "1.1 Load Packages\nLoad the following packages before proceeding with this tutorial.\n\n\nCode\nimport pandas as pd\nimport numpy as np\nimport pytimetk as tk\n\nfrom sklearn.ensemble import RandomForestRegressor\n\n\nThe tutorial is divided into three parts: We will first have a look at the Walmart dataset and perform some preprocessing. Secondly, we will create models based on different features, and see how the time features can be useful. Finally, we will solve the task of time series forecasting, using the features from augment_timeseries_signature, augment_lags, and augment_rolling, to predict future sales." + }, + { + "objectID": "tutorials/03_demand_forecasting.html#load-inspect-dataset", + "href": "tutorials/03_demand_forecasting.html#load-inspect-dataset", + "title": "Demand Forecasting", + "section": "1.2 Load & Inspect dataset", + "text": "1.2 Load & Inspect dataset\nThe first thing we want to do is to load the dataset. It is a subset of the Walmart sales prediction Kaggle competition. You can get more insights about the dataset by following this link: walmart_sales_weekly. The most important thing to know about the dataset is that you are provided with some features like the fuel price or whether the week contains holidays and you are expected to predict the weekly sales column for 7 different departments of a given store. Of course, you also have the date for each week, and that is what we can leverage to create additional features.\nLet us start by loading the dataset and cleaning it. Note that we also removed some columns due to * duplication of data * 0 variance * No future data available in current dataset.\n\n\nCode\n# We start by loading the dataset\n# /walmart_sales_weekly.html\ndset = tk.load_dataset('walmart_sales_weekly', parse_dates = ['Date'])\n\ndset = dset.drop(columns=[\n 'id', # This column can be removed as it is equivalent to 'Dept'\n 'Store', # This column has only one possible value\n 'Type', # This column has only one possible value\n 'Size', # This column has only one possible value\n 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',\n 'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI',\n 'Unemployment'])\n\ndset.head()\n\n\n\n\n\n\n\n\n\nDept\nDate\nWeekly_Sales\n\n\n\n\n0\n1\n2010-02-05\n24924.50\n\n\n1\n1\n2010-02-12\n46039.49\n\n\n2\n1\n2010-02-19\n41595.55\n\n\n3\n1\n2010-02-26\n19403.54\n\n\n4\n1\n2010-03-05\n21827.90\n\n\n\n\n\n\n\nWe can plot the values of each department to get an idea of how the data looks like. 
Using the plot_timeseries method with a groupby allows us to create multiple plots by group.\n\n\n\n\n\n\nGetting More Info: tk.plot_timeseries()\n\n\n\n\n\n\nClick here to see our Data Visualization Guide\nUse help(tk.plot_timeseries) to review additional helpful documentation.\n\n\n\n\n\nPlotlyPlotnine\n\n\n\n\nCode\nsales_df = dset\nfig = sales_df.groupby('Dept').plot_timeseries(\n date_column='Date',\n value_column='Weekly_Sales',\n facet_ncol = 2,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly')\nfig\n\n\n\n \n\n\n\n\n\n\nCode\nfig = sales_df.groupby('Dept').plot_timeseries(\n date_column='Date',\n value_column='Weekly_Sales',\n facet_ncol = 2,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine')\nfig\n\n\n\n\n\n<Figure Size: (700 x 500)>" + }, + { + "objectID": "tutorials/03_demand_forecasting.html#making-future-dates-easier-with-tk.future_frame", + "href": "tutorials/03_demand_forecasting.html#making-future-dates-easier-with-tk.future_frame", + "title": "Demand Forecasting", + "section": "2.1 Making Future Dates Easier with tk.future_frame", + "text": "2.1 Making Future Dates Easier with tk.future_frame\nWhen building machine learning models, we need to setup our dataframe to hold information about the future. This is the dataframe that will get passed to our model.predict() call. This is made easy with tk.future_frame().\n\n\n\n\n\n\nGetting to know tk.future_frame()\n\n\n\n\n\nCurious about the various options it provides?\n\nClick here to see our Data Wrangling Guide\nUse help(tk.future_frame) to review additional helpful documentation. And explore the plethora of possibilities!\n\n\n\n\nNotice this function adds 5 weeks to our dateset for each department and fills in weekly sales with nulls. Previously our max date was 2012-10-26.\n\n\nCode\nprint(sales_df.groupby('Dept').Date.max())\n\n\nDept\n1 2012-10-26\n3 2012-10-26\n8 2012-10-26\n13 2012-10-26\n38 2012-10-26\n93 2012-10-26\n95 2012-10-26\nName: Date, dtype: datetime64[ns]\n\n\nAfter applying our future frame, we can now see values 5 weeks in the future, and our dataframe has been extended to 2012-11-30 for all groups.\n\n\nCode\nsales_df_with_futureframe = sales_df \\\n .groupby('Dept') \\\n .future_frame(\n date_column = 'Date',\n length_out = 5\n )\n\n\n\n\n\n\n\nCode\nsales_df_with_futureframe.groupby('Dept').Date.max()\n\n\nDept\n1 2012-11-30\n3 2012-11-30\n8 2012-11-30\n13 2012-11-30\n38 2012-11-30\n93 2012-11-30\n95 2012-11-30\nName: Date, dtype: datetime64[ns]" + }, + { + "objectID": "tutorials/03_demand_forecasting.html#date-features-with-tk.augment_timeseries_signature", + "href": "tutorials/03_demand_forecasting.html#date-features-with-tk.augment_timeseries_signature", + "title": "Demand Forecasting", + "section": "2.2 Date Features with tk.augment_timeseries_signature", + "text": "2.2 Date Features with tk.augment_timeseries_signature\nMachine Learning models generally cannot process raw date objects directly. Moreover, they lack an inherent understanding of the passage of time. This means that, without specific features, a model can’t differentiate between a January observation and a June one. To bridge this gap, the tk.augment_timeseries_signature function is invaluable. 
It generates 29 distinct date-oriented features suitable for model inputs.\n\n\n\n\n\n\nGetting More Info: tk.augment_timeseries_signature(),tk.augment_lags(), tk.augment_rolling()\n\n\n\n\n\n\nClick here to see our Adding Features (Augmenting)\nUse help(tk.augment_timeseries_signature) help(tk.augment_lags) help(tk.augment_rolling) to review additional helpful documentation.\n\n\n\n\n\nIt’s crucial, however, to align these features with the granularity of your dataset. Given the weekly granularity of the Walmart dataset, any date attributes finer than ‘week’ should be excluded for relevance and efficiency.\n\n\nCode\nsales_df_dates = sales_df_with_futureframe.augment_timeseries_signature(date_column = 'Date')\nsales_df_dates.head(10)\n\n\n\n\n\n\n\n\n\nDept\nDate\nWeekly_Sales\nDate_index_num\nDate_year\nDate_year_iso\nDate_yearstart\nDate_yearend\nDate_leapyear\nDate_half\n...\nDate_mday\nDate_qday\nDate_yday\nDate_weekend\nDate_hour\nDate_minute\nDate_second\nDate_msecond\nDate_nsecond\nDate_am_pm\n\n\n\n\n0\n1\n2010-02-05\n24924.50\n1265328000\n2010\n2010\n0\n0\n0\n1\n...\n5\n36\n36\n0\n0\n0\n0\n0\n0\nam\n\n\n1\n1\n2010-02-12\n46039.49\n1265932800\n2010\n2010\n0\n0\n0\n1\n...\n12\n43\n43\n0\n0\n0\n0\n0\n0\nam\n\n\n2\n1\n2010-02-19\n41595.55\n1266537600\n2010\n2010\n0\n0\n0\n1\n...\n19\n50\n50\n0\n0\n0\n0\n0\n0\nam\n\n\n3\n1\n2010-02-26\n19403.54\n1267142400\n2010\n2010\n0\n0\n0\n1\n...\n26\n57\n57\n0\n0\n0\n0\n0\n0\nam\n\n\n4\n1\n2010-03-05\n21827.90\n1267747200\n2010\n2010\n0\n0\n0\n1\n...\n5\n64\n64\n0\n0\n0\n0\n0\n0\nam\n\n\n5\n1\n2010-03-12\n21043.39\n1268352000\n2010\n2010\n0\n0\n0\n1\n...\n12\n71\n71\n0\n0\n0\n0\n0\n0\nam\n\n\n6\n1\n2010-03-19\n22136.64\n1268956800\n2010\n2010\n0\n0\n0\n1\n...\n19\n78\n78\n0\n0\n0\n0\n0\n0\nam\n\n\n7\n1\n2010-03-26\n26229.21\n1269561600\n2010\n2010\n0\n0\n0\n1\n...\n26\n85\n85\n0\n0\n0\n0\n0\n0\nam\n\n\n8\n1\n2010-04-02\n57258.43\n1270166400\n2010\n2010\n0\n0\n0\n1\n...\n2\n2\n92\n0\n0\n0\n0\n0\n0\nam\n\n\n9\n1\n2010-04-09\n42960.91\n1270771200\n2010\n2010\n0\n0\n0\n1\n...\n9\n9\n99\n0\n0\n0\n0\n0\n0\nam\n\n\n\n\n10 rows × 32 columns\n\n\n\nUpon reviewing the generated features, it’s evident that certain attributes don’t align with the granularity of our dataset. For optimal results, features exhibiting no variance—like “Date_hour” due to the weekly nature of our data—should be omitted. We also spot redundant features, such as “Date_Month” and “Date_month_lbl”; both convey month information, albeit in different formats. To enhance clarity and computational efficiency, we’ll refine our dataset to include only the most relevant columns.\nAdditionally, we’ve eliminated certain categorical columns, which, although compatible with models like LightGBM and Catboost, demand extra processing for many tree-based ML models. While 1-hot encoding is a popular method for managing categorical data, it’s not typically recommended for date attributes. 
Instead, leveraging numeric date features directly, combined with the integration of Fourier features, can effectively capture cyclical patterns.\n\n\nCode\nsales_df_dates.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 1036 rows of 32 columns\nDept: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nDate: datetime64[ns] [Timestamp('2010-02-05 00:00:00'), ...\nWeekly_Sales: float64 [24924.5, 46039.49, 41595.55, 1940 ...\nDate_index_num: int64 [1265328000, 1265932800, 126653760 ...\nDate_year: int64 [2010, 2010, 2010, 2010, 2010, 201 ...\nDate_year_iso: UInt32 [2010, 2010, 2010, 2010, 2010, 201 ...\nDate_yearstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_yearend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_leapyear: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_half: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nDate_quarter: int64 [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...\nDate_quarteryear: object ['2010Q1', '2010Q1', '2010Q1', '20 ...\nDate_quarterstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_quarterend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_month: int64 [2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, ...\nDate_month_lbl: object ['February', 'February', 'February ...\nDate_monthstart: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_monthend: uint8 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_yweek: UInt32 [5, 6, 7, 8, 9, 10, 11, 12, 13, 14 ...\nDate_mweek: int64 [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, ...\nDate_wday: int64 [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...\nDate_wday_lbl: object ['Friday', 'Friday', 'Friday', 'Fr ...\nDate_mday: int64 [5, 12, 19, 26, 5, 12, 19, 26, 2, ...\nDate_qday: int64 [36, 43, 50, 57, 64, 71, 78, 85, 2 ...\nDate_yday: int64 [36, 43, 50, 57, 64, 71, 78, 85, 9 ...\nDate_weekend: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_hour: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_minute: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_second: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_msecond: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_nsecond: int64 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...\nDate_am_pm: object ['am', 'am', 'am', 'am', 'am', 'am ...\n\n\n\n\nCode\nsales_df_dates = sales_df_dates[[\n 'Date'\n ,'Dept'\n , 'Weekly_Sales'\n , 'Date_year'\n , 'Date_month'\n , 'Date_yweek'\n , 'Date_mweek' \n ]]\nsales_df_dates.tail(10)\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\n\n\n\n\n1026\n2012-11-02\n93\nNaN\n2012\n11\n44\n1\n\n\n1027\n2012-11-09\n93\nNaN\n2012\n11\n45\n2\n\n\n1028\n2012-11-16\n93\nNaN\n2012\n11\n46\n3\n\n\n1029\n2012-11-23\n93\nNaN\n2012\n11\n47\n4\n\n\n1030\n2012-11-30\n93\nNaN\n2012\n11\n48\n5\n\n\n1031\n2012-11-02\n95\nNaN\n2012\n11\n44\n1\n\n\n1032\n2012-11-09\n95\nNaN\n2012\n11\n45\n2\n\n\n1033\n2012-11-16\n95\nNaN\n2012\n11\n46\n3\n\n\n1034\n2012-11-23\n95\nNaN\n2012\n11\n47\n4\n\n\n1035\n2012-11-30\n95\nNaN\n2012\n11\n48\n5" + }, + { + "objectID": "tutorials/03_demand_forecasting.html#lag-features-with-tk.augment_lags", + "href": "tutorials/03_demand_forecasting.html#lag-features-with-tk.augment_lags", + "title": "Demand Forecasting", + "section": "2.3 Lag Features with tk.augment_lags", + "text": "2.3 Lag Features with tk.augment_lags\nAs previously noted, it’s important to recognize that machine learning models lack inherent awareness of time, a vital consideration in time series modeling. 
Furthermore, these models operate under the assumption that each row is independent, meaning that the information from last month’s weekly sales is not inherently integrated into the prediction of next month’s sales target. To address this limitation, we incorporate additional features, such as lags, into the models to capture temporal dependencies. You can easily achieve this by employing the tk.augment_lags function.\n\n\nCode\ndf_with_lags = sales_df_dates \\\n .groupby('Dept') \\\n .augment_lags(\n date_column = 'Date',\n value_column = 'Weekly_Sales',\n lags = [5,6,7,8,9]\n )\ndf_with_lags.head(5)\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\n\n\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n2\n2010-02-19\n1\n41595.55\n2010\n2\n7\n3\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n3\n2010-02-26\n1\n19403.54\n2010\n2\n8\n4\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n4\n2010-03-05\n1\n21827.90\n2010\n3\n9\n1\nNaN\nNaN\nNaN\nNaN\nNaN" + }, + { + "objectID": "tutorials/03_demand_forecasting.html#rolling-lag-features-with-tk.augment_rolling", + "href": "tutorials/03_demand_forecasting.html#rolling-lag-features-with-tk.augment_rolling", + "title": "Demand Forecasting", + "section": "2.4 Rolling Lag Features with tk.augment_rolling", + "text": "2.4 Rolling Lag Features with tk.augment_rolling\nAnother pivotal aspect of time series analysis involves the utilization of rolling lags. These operations facilitate computations within a moving time window, enabling the use of functions such as “mean” and “std” on these rolling windows. This can be achieved by invoking the tk.augment_rolling() function on grouped time series data. To execute this, we will initially gather all columns containing ‘lag’ in their names. We then apply this function to the lag values, as opposed to the weekly sales, since we lack future weekly sales data. 
By applying these functions to the lag values, we ensure the prevention of data leakage and maintain the adaptability of our method to unforeseen future data.\n\n\nCode\nlag_columns = [col for col in df_with_lags.columns if 'lag' in col]\n\ndf_with_rolling = df_with_lags \\\n .groupby('Dept') \\\n .augment_rolling(\n date_column = 'Date',\n value_column = lag_columns,\n window = 4,\n window_func = 'mean',\n threads = 1 # Change to -1 to use all available cores\n ) \ndf_with_rolling[df_with_rolling.Dept ==1].head(10)\n\n\n\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\nWeekly_Sales_lag_5_rolling_mean_win_4\nWeekly_Sales_lag_6_rolling_mean_win_4\nWeekly_Sales_lag_7_rolling_mean_win_4\nWeekly_Sales_lag_8_rolling_mean_win_4\nWeekly_Sales_lag_9_rolling_mean_win_4\n\n\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n0\n2010-02-05\n1\n24924.50\n2010\n2\n5\n1\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\n2010-02-12\n1\n46039.49\n2010\n2\n6\n2\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n\n\n\n\n\nNotice when we add lag values to our dataframe, this creates several NA values. This is because when using lags, there will be some data that is not available early in our dataset.Thus as a result, NA values are introduced.\nTo simplify and clean up the process, we will remove these rows entirely since we already extracted some meaningful information from them (ie. 
lags, rolling lags).\n\n\nCode\nall_lag_columns = [col for col in df_with_rolling.columns if 'lag' in col]\n\ndf_no_nas = df_with_rolling \\\n .dropna(subset=all_lag_columns, inplace=False)\n\ndf_no_nas.head()\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\nWeekly_Sales_lag_5_rolling_mean_win_4\nWeekly_Sales_lag_6_rolling_mean_win_4\nWeekly_Sales_lag_7_rolling_mean_win_4\nWeekly_Sales_lag_8_rolling_mean_win_4\nWeekly_Sales_lag_9_rolling_mean_win_4\n\n\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.9\n19403.54\n22809.285\n21102.8675\n25967.595\n32216.62\n32990.77\n\n\n\n\n\n\n\nWe can call tk.glimpse() again to quickly see what features we still have available.\n\n\nCode\ndf_no_nas.glimpse()\n\n\n<class 'pandas.core.frame.DataFrame'>: 4760 rows of 17 columns\nDate: datetime64[ns] [Timestamp('20 ...\nDept: int64 [1, 1, 1, 1, 1 ...\nWeekly_Sales: float64 [16555.11, 165 ...\nDate_year: int64 [2010, 2010, 2 ...\nDate_month: int64 [4, 4, 4, 4, 4 ...\nDate_yweek: UInt32 [17, 17, 17, 1 ...\nDate_mweek: int64 [5, 5, 5, 5, 5 ...\nWeekly_Sales_lag_5: float64 [26229.21, 262 ...\nWeekly_Sales_lag_6: float64 [22136.64, 221 ...\nWeekly_Sales_lag_7: float64 [21043.39, 210 ...\nWeekly_Sales_lag_8: float64 [21827.9, 2182 ...\nWeekly_Sales_lag_9: float64 [19403.54, 194 ...\nWeekly_Sales_lag_5_rolling_mean_win_4: float64 [22809.285, 22 ...\nWeekly_Sales_lag_6_rolling_mean_win_4: float64 [21102.8675, 2 ...\nWeekly_Sales_lag_7_rolling_mean_win_4: float64 [25967.595, 25 ...\nWeekly_Sales_lag_8_rolling_mean_win_4: float64 [32216.6200000 ...\nWeekly_Sales_lag_9_rolling_mean_win_4: float64 [32990.7700000 ..." + }, + { + "objectID": "tutorials/03_demand_forecasting.html#training-and-future-sets", + "href": "tutorials/03_demand_forecasting.html#training-and-future-sets", + "title": "Demand Forecasting", + "section": "2.5 Training and Future Sets", + "text": "2.5 Training and Future Sets\nNow that we have our training set built, we can start to train our regressor. To do so, let’s first do some model cleanup.\nSplit our data in to train and future sets.\n\n\nCode\nfuture = df_no_nas[df_no_nas.Weekly_Sales.isnull()]\ntrain = df_no_nas[df_no_nas.Weekly_Sales.notnull()]" + }, + { + "objectID": "tutorials/03_demand_forecasting.html#model-with-regressor", + "href": "tutorials/03_demand_forecasting.html#model-with-regressor", + "title": "Demand Forecasting", + "section": "2.6 Model with regressor", + "text": "2.6 Model with regressor\nWe still have a datetime object in our training data. We will need to remove that before passing to our regressor. 
Let’s subset our column to just the features we want to use for modeling.\n\n\nCode\ntrain_columns = [ \n 'Dept'\n , 'Date_year'\n , 'Date_month'\n , 'Date_yweek'\n , 'Date_mweek'\n , 'Weekly_Sales_lag_5'\n , 'Weekly_Sales_lag_6'\n , 'Weekly_Sales_lag_7'\n , 'Weekly_Sales_lag_8'\n , 'Weekly_Sales_lag_5_rolling_mean_win_4'\n , 'Weekly_Sales_lag_6_rolling_mean_win_4'\n , 'Weekly_Sales_lag_7_rolling_mean_win_4'\n , 'Weekly_Sales_lag_8_rolling_mean_win_4'\n ]\n\nX = train[train_columns]\ny = train[['Weekly_Sales']]\n\nmodel = RandomForestRegressor(random_state=123)\nmodel = model.fit(X, y)\n\n\nNow that we have a trained model, we can pass in our future frame to predict weekly sales.\n\n\nCode\npredicted_values = model.predict(future[train_columns])\nfuture['y_pred'] = predicted_values\n\nfuture.head(10)\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\nWeekly_Sales_lag_5_rolling_mean_win_4\nWeekly_Sales_lag_6_rolling_mean_win_4\nWeekly_Sales_lag_7_rolling_mean_win_4\nWeekly_Sales_lag_8_rolling_mean_win_4\nWeekly_Sales_lag_9_rolling_mean_win_4\ny_pred\n\n\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1001\n2012-11-02\n1\nNaN\n2012\n11\n44\n1\n18947.81\n19251.50\n19616.22\n18322.37\n16680.24\n19034.475\n18467.5825\n17726.3075\n17154.9275\n16604.3150\n26627.7378\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n1002\n2012-11-09\n1\nNaN\n2012\n11\n45\n2\n21904.47\n18947.81\n19251.50\n19616.22\n18322.37\n19930.000\n19034.4750\n18467.5825\n17726.3075\n17154.9275\n20959.0553\n\n\n\n\n\n\n\nLet’s create a label to split up our actuals from our prediction dataset before recombining.\n\n\nCode\ntrain['type'] = 'actuals'\nfuture['type'] = 'prediction'\n\nfull_df = pd.concat([train, 
future])\n\nfull_df.head(10)\n\n\n\n\n\n\n\n\n\nDate\nDept\nWeekly_Sales\nDate_year\nDate_month\nDate_yweek\nDate_mweek\nWeekly_Sales_lag_5\nWeekly_Sales_lag_6\nWeekly_Sales_lag_7\nWeekly_Sales_lag_8\nWeekly_Sales_lag_9\nWeekly_Sales_lag_5_rolling_mean_win_4\nWeekly_Sales_lag_6_rolling_mean_win_4\nWeekly_Sales_lag_7_rolling_mean_win_4\nWeekly_Sales_lag_8_rolling_mean_win_4\nWeekly_Sales_lag_9_rolling_mean_win_4\ntype\ny_pred\n\n\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n12\n2010-04-30\n1\n16555.11\n2010\n4\n17\n5\n26229.21\n22136.64\n21043.39\n21827.90\n19403.54\n22809.2850\n21102.8675\n25967.5950\n32216.620\n32990.77\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN\n\n\n13\n2010-05-07\n1\n17413.94\n2010\n5\n18\n1\n57258.43\n26229.21\n22136.64\n21043.39\n21827.90\n31666.9175\n22809.2850\n21102.8675\n25967.595\n32216.62\nactuals\nNaN" + }, + { + "objectID": "tutorials/03_demand_forecasting.html#pre-visualization-clean-up", + "href": "tutorials/03_demand_forecasting.html#pre-visualization-clean-up", + "title": "Demand Forecasting", + "section": "2.7 Pre-Visualization Clean-up", + "text": "2.7 Pre-Visualization Clean-up\n\n\nCode\nfull_df['Weekly_Sales'] = np.where(full_df.type =='actuals', full_df.Weekly_Sales, full_df.y_pred)" + }, + { + "objectID": "tutorials/03_demand_forecasting.html#plot-predictions", + "href": "tutorials/03_demand_forecasting.html#plot-predictions", + "title": "Demand Forecasting", + "section": "2.8 Plot Predictions", + "text": "2.8 Plot Predictions\n\nPlotlyPlotnine\n\n\n\n\nCode\nfull_df \\\n .groupby('Dept') \\\n .plot_timeseries(\n date_column = 'Date',\n value_column = 'Weekly_Sales',\n color_column = 'type',\n smooth = False,\n smooth_alpha = 0,\n facet_ncol = 2,\n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n engine = 'plotly'\n )\n\n\n\n \n\n\n\n\n\n\nCode\nfull_df \\\n .groupby('Dept') \\\n .plot_timeseries(\n date_column = 'Date',\n value_column = 'Weekly_Sales',\n color_column = 'type',\n smooth = False,\n smooth_alpha = 0,\n facet_ncol = 2,\n facet_scales = \"free\",\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n engine = 'plotnine'\n )\n\n\n\n\n\n<Figure Size: (800 x 
600)>\n\n\n\n\n\nOur weekly sales forecasts exhibit a noticeable alignment with historical trends, indicating that our models are effectively capturing essential data signals. It’s worth noting that with some additional feature engineering, we have the potential to further enhance the model’s performance.\nHere are some additional techniques that can be explored to elevate its performance:\n\nExperiment with the incorporation of various lags using the versatile tk.augment_lags() function.\nEnhance the model’s capabilities by introducing additional rolling calculations through tk.augment_rolling().\nConsider incorporating cyclic features by utilizing tk.augment_fourier().\nTry different models and build a robust cross-validation strategy for model selection.\n\nThese strategies hold promise for refining the model’s accuracy and predictive power."
  },
  {
    "objectID": "tutorials/02_finance.html",
    "href": "tutorials/02_finance.html",
    "title": "Finance Analysis",
    "section": "",
    "text": "Timetk is designed to work with any time series domain. Arguably the most important is Finance. This tutorial showcases how you can perform Financial Investment and Stock Analysis at scale with pytimetk. This applied tutorial covers financial analysis with:\nLoad the following packages before proceeding with this tutorial.\nCode\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np"
  },
  {
    "objectID": "tutorials/02_finance.html#application-moving-averages-10-day-and-50-day",
    "href": "tutorials/02_finance.html#application-moving-averages-10-day-and-50-day",
    "title": "Finance Analysis",
    "section": "3.1 Application: Moving Averages, 10-Day and 50-Day",
    "text": "3.1 Application: Moving Averages, 10-Day and 50-Day\nThis code template can be used to make and visualize the 10-day and 50-Day moving average of a group of stock symbols. 
Click to expand the code.\n\nPlotlyPlotnine\n\n\n\n\nCode\n# Add 2 moving averages (10-day and 50-Day)\nsma_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'adjusted',\n window = [10, 50],\n window_func = ['mean'],\n center = False,\n threads = 1, # Change to -1 to use all available cores\n )\n\n# Visualize \n(sma_df \n\n # zoom in on dates\n .query('date >= \"2023-01-01\"') \n\n # Convert to long format\n .melt(\n id_vars = ['symbol', 'date'],\n value_vars = [\"adjusted\", \"adjusted_rolling_mean_win_10\", \"adjusted_rolling_mean_win_50\"]\n ) \n\n # Group on symbol and visualize\n .groupby(\"symbol\") \n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n color_column = 'variable',\n smooth = False, \n facet_ncol = 2,\n width = 900,\n height = 700,\n engine = \"plotly\"\n )\n)\n\n\n\n\n\n\n \n\n\n\n\n\n\nCode\n# Add 2 moving averages (10-day and 50-Day)\nsma_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'adjusted',\n window = [10, 50],\n window_func = ['mean'],\n center = False,\n threads = 1, # Change to -1 to use all available cores\n )\n\n# Visualize \n(sma_df \n\n # zoom in on dates\n .query('date >= \"2023-01-01\"') \n\n # Convert to long format\n .melt(\n id_vars = ['symbol', 'date'],\n value_vars = [\"adjusted\", \"adjusted_rolling_mean_win_10\", \"adjusted_rolling_mean_win_50\"]\n ) \n\n # Group on symbol and visualize\n .groupby(\"symbol\") \n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n color_column = 'variable',\n smooth = False, \n facet_ncol = 2,\n width = 900,\n height = 700,\n engine = \"plotnine\"\n )\n)\n\n\n\n\n\n\n\n\n<Figure Size: (900 x 700)>" }, { - "objectID": "performance/01_speed_comparisons.html#chande-momentum-oscillator-cmo-augment_cmo", - "href": "performance/01_speed_comparisons.html#chande-momentum-oscillator-cmo-augment_cmo", - "title": "Speed Comparisons", - "section": "5.1 Chande Momentum Oscillator (CMO) augment_cmo()", - "text": "5.1 Chande Momentum Oscillator (CMO) augment_cmo()\n\nPolars is 3.3X faster than Pandas\nSpeed improvement of Polars (vs Pandas) increases with number of CMO periods\n\n\n\n\n\n\n\nPolarsPandas\n\n\n\n\nCode\n%%timeit -n 25\n\ndf = (\n stocks_daily_df\n .groupby('symbol')\n .augment_cmo(\n date_column = 'date', \n value_column = 'adjusted', \n periods = (5,30),\n engine = 'polars', \n )\n)\n\n# 94.4 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 25 loops each)\n\n\n\n\n\n\nCode\n%%timeit -n 25\n\ndf = (\n stocks_daily_df\n .groupby('symbol')\n .augment_cmo(\n date_column = 'date', \n value_column = 'adjusted', \n periods = (5,30),\n engine = 'pandas', \n )\n)\n\n# 73.3 ms ± 3.29 ms per loop (mean ± std. dev. of 7 runs, 25 loops each)" + "objectID": "tutorials/02_finance.html#application-bollinger-bands", + "href": "tutorials/02_finance.html#application-bollinger-bands", + "title": "Finance Analysis", + "section": "3.2 Application: Bollinger Bands", + "text": "3.2 Application: Bollinger Bands\nBollinger Bands are a volatility indicator commonly used in financial trading. 
They consist of three lines:\n\nThe middle band, which is a simple moving average (usually over 20 periods).\nThe upper band, calculated as the middle band plus k times the standard deviation of the price (typically, k=2).\nThe lower band, calculated as the middle band minus k times the standard deviation of the price.\n\nHere’s how you can calculate and plot Bollinger Bands with pytimetk using this code template (click to expand):\n\nPlotlyPlotnine\n\n\n\n\nCode\n# Bollinger Bands\nbollinger_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'adjusted',\n window = 20,\n window_func = ['mean', 'std'],\n center = False\n ) \\\n .assign(\n upper_band = lambda x: x['adjusted_rolling_mean_win_20'] + 2*x['adjusted_rolling_std_win_20'],\n lower_band = lambda x: x['adjusted_rolling_mean_win_20'] - 2*x['adjusted_rolling_std_win_20']\n )\n\n\n# Visualize\n(bollinger_df\n\n # zoom in on dates\n .query('date >= \"2023-01-01\"') \n\n # Convert to long format\n .melt(\n id_vars = ['symbol', 'date'],\n value_vars = [\"adjusted\", \"adjusted_rolling_mean_win_20\", \"upper_band\", \"lower_band\"]\n ) \n\n # Group on symbol and visualize\n .groupby(\"symbol\") \n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n color_column = 'variable',\n # Adjust colors for Bollinger Bands\n color_palette =[\"#2C3E50\", \"#E31A1C\", '#18BC9C', '#18BC9C'],\n smooth = False, \n facet_ncol = 2,\n width = 900,\n height = 700,\n engine = \"plotly\" \n )\n)\n\n\n\n\n\n\n \n\n\n\n\n\n\nCode\n# Bollinger Bands\nbollinger_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'adjusted',\n window = 20,\n window_func = ['mean', 'std'],\n center = False\n ) \\\n .assign(\n upper_band = lambda x: x['adjusted_rolling_mean_win_20'] + 2*x['adjusted_rolling_std_win_20'],\n lower_band = lambda x: x['adjusted_rolling_mean_win_20'] - 2*x['adjusted_rolling_std_win_20']\n )\n\n\n# Visualize\n(bollinger_df\n\n # zoom in on dates\n .query('date >= \"2023-01-01\"') \n\n # Convert to long format\n .melt(\n id_vars = ['symbol', 'date'],\n value_vars = [\"adjusted\", \"adjusted_rolling_mean_win_20\", \"upper_band\", \"lower_band\"]\n ) \n\n # Group on symbol and visualize\n .groupby(\"symbol\") \n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n color_column = 'variable',\n # Adjust colors for Bollinger Bands\n color_palette =[\"#2C3E50\", \"#E31A1C\", '#18BC9C', '#18BC9C'],\n smooth = False, \n facet_ncol = 2,\n width = 900,\n height = 700,\n engine = \"plotnine\"\n )\n)\n\n\n\n\n\n\n\n\n<Figure Size: (900 x 700)>" }, { - "objectID": "reference/augment_rolling.html", - "href": "reference/augment_rolling.html", - "title": "augment_rolling", - "section": "", - "text": "augment_rolling(data, date_column, value_column, window_func='mean', window=2, min_periods=None, engine='pandas', center=False, threads=1, show_progress=True, reduce_memory=False, **kwargs)\nApply one or more Series-based rolling functions and window sizes to one or more columns of a DataFrame." 
+ "objectID": "tutorials/02_finance.html#returns-analysis-by-time", + "href": "tutorials/02_finance.html#returns-analysis-by-time", + "title": "Finance Analysis", + "section": "4.1 Returns Analysis By Time", + "text": "4.1 Returns Analysis By Time\n\n\n\n\n\n\nReturns are NOT static (so analyze them by time)\n\n\n\n\n\n\nWe can use rolling window calculations with tk.augment_rolling() to compute many rolling features at scale such as rolling mean, std, range (spread).\nWe can expand our tk.augment_rolling_apply() rolling calculations to Rolling Correlation and Rolling Regression (to make comparisons over time)\n\n\n\n\n\nApplication: Descriptive Statistic Analysis\nMany traders compute descriptive statistics like mean, median, mode, skewness, kurtosis, and standard deviation to understand the central tendency, spread, and shape of the return distribution.\n\n\nStep 1: Returns\nUse this code to get the pct_change() in wide format. Click expand to get the code.\n\n\nCode\nreturns_wide_df = stocks_df[['symbol', 'date', 'adjusted']] \\\n .pivot(index = 'date', columns = 'symbol', values = 'adjusted') \\\n .pct_change() \\\n .reset_index() \\\n [1:]\n\nreturns_wide_df\n\n\n\n\n\n\n\n\nsymbol\ndate\nAAPL\nAMZN\nGOOG\nMETA\nNFLX\nNVDA\n\n\n\n\n1\n2013-01-03\n-0.012622\n0.004547\n0.000581\n-0.008214\n0.049777\n0.000786\n\n\n2\n2013-01-04\n-0.027854\n0.002592\n0.019760\n0.035650\n-0.006315\n0.032993\n\n\n3\n2013-01-07\n-0.005883\n0.035925\n-0.004363\n0.022949\n0.033549\n-0.028897\n\n\n4\n2013-01-08\n0.002691\n-0.007748\n-0.001974\n-0.012237\n-0.020565\n-0.021926\n\n\n5\n2013-01-09\n-0.015629\n-0.000113\n0.006573\n0.052650\n-0.012865\n-0.022418\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2694\n2023-09-15\n-0.004154\n-0.029920\n-0.004964\n-0.036603\n-0.008864\n-0.036879\n\n\n2695\n2023-09-18\n0.016913\n-0.002920\n0.004772\n0.007459\n-0.006399\n0.001503\n\n\n2696\n2023-09-19\n0.006181\n-0.016788\n-0.000936\n0.008329\n0.004564\n-0.010144\n\n\n2697\n2023-09-20\n-0.019992\n-0.017002\n-0.030541\n-0.017701\n-0.024987\n-0.029435\n\n\n2698\n2023-09-21\n-0.008889\n-0.044053\n-0.023999\n-0.013148\n-0.005566\n-0.028931\n\n\n\n\n2698 rows × 7 columns\n\n\n\n\n\nStep 2: Descriptive Stats\nUse this code to get standard statistics with the describe() method. Click expand to get the code.\n\n\nCode\nreturns_wide_df.describe()\n\n\n\n\n\n\n\n\nsymbol\nAAPL\nAMZN\nGOOG\nMETA\nNFLX\nNVDA\n\n\n\n\ncount\n2698.000000\n2698.000000\n2698.000000\n2698.000000\n2698.000000\n2698.000000\n\n\nmean\n0.001030\n0.001068\n0.000885\n0.001170\n0.001689\n0.002229\n\n\nstd\n0.018036\n0.020621\n0.017267\n0.024291\n0.029683\n0.028320\n\n\nmin\n-0.128647\n-0.140494\n-0.111008\n-0.263901\n-0.351166\n-0.187559\n\n\n25%\n-0.007410\n-0.008635\n-0.006900\n-0.009610\n-0.012071\n-0.010938\n\n\n50%\n0.000892\n0.001050\n0.000700\n0.001051\n0.000544\n0.001918\n\n\n75%\n0.010324\n0.011363\n0.009053\n0.012580\n0.014678\n0.015202\n\n\nmax\n0.119808\n0.141311\n0.160524\n0.296115\n0.422235\n0.298067\n\n\n\n\n\n\n\n\n\nStep 3: Correlation\nAnd run a correlation with corr(). 
Click expand to get the code.\n\n\nCode\ncorr_table_df = returns_wide_df.drop('date', axis=1).corr()\ncorr_table_df\n\n\n\n\n\n\n\n\nsymbol\nAAPL\nAMZN\nGOOG\nMETA\nNFLX\nNVDA\n\n\nsymbol\n\n\n\n\n\n\n\n\n\n\nAAPL\n1.000000\n0.497906\n0.566452\n0.479787\n0.321694\n0.526508\n\n\nAMZN\n0.497906\n1.000000\n0.628103\n0.544481\n0.475078\n0.490234\n\n\nGOOG\n0.566452\n0.628103\n1.000000\n0.595728\n0.428470\n0.531382\n\n\nMETA\n0.479787\n0.544481\n0.595728\n1.000000\n0.407417\n0.450586\n\n\nNFLX\n0.321694\n0.475078\n0.428470\n0.407417\n1.000000\n0.380153\n\n\nNVDA\n0.526508\n0.490234\n0.531382\n0.450586\n0.380153\n1.000000\n\n\n\n\n\n\n\n\nThe problem is that the stock market is constantly changing. And these descriptive statistics aren’t representative of the most recent fluctuations. This is where pytimetk comes into play with rolling descriptive statistics.\n\n\n\nApplication: 90-Day Rolling Descriptive Statistics Analysis with tk.augment_rolling()\nLet’s compute and visualize the 90-day rolling statistics.\n\n\n\n\n\n\nGetting More Info: tk.augment_rolling()\n\n\n\n\n\n\nClick here to see our Augmenting Guide\nUse help(tk.augment_rolling) to review additional helpful documentation.\n\n\n\n\n\nStep 1: Long Format Pt.1\nUse this code to get the date melt() into long format. Click expand to get the code.\n\n\nCode\nreturns_long_df = returns_wide_df \\\n .melt(id_vars='date', value_name='returns') \n\nreturns_long_df\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\n\n\n\n\n0\n2013-01-03\nAAPL\n-0.012622\n\n\n1\n2013-01-04\nAAPL\n-0.027854\n\n\n2\n2013-01-07\nAAPL\n-0.005883\n\n\n3\n2013-01-08\nAAPL\n0.002691\n\n\n4\n2013-01-09\nAAPL\n-0.015629\n\n\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n\n\n\n\n16188 rows × 3 columns\n\n\n\n\n\nStep 2: Augment Rolling Statistic\nLet’s add multiple columns of rolling statistics. 
Click to expand the code.\n\n\nCode\nrolling_stats_df = returns_long_df \\\n .groupby('symbol') \\\n .augment_rolling(\n date_column = 'date',\n value_column = 'returns',\n window = [90],\n window_func = [\n 'mean', \n 'std', \n 'min',\n ('q25', lambda x: np.quantile(x, 0.25)),\n 'median',\n ('q75', lambda x: np.quantile(x, 0.75)),\n 'max'\n ],\n threads = 1 # Change to -1 to use all threads\n ) \\\n .dropna()\n\nrolling_stats_df\n\n\n\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\nreturns_rolling_mean_win_90\nreturns_rolling_std_win_90\nreturns_rolling_min_win_90\nreturns_rolling_q25_win_90\nreturns_rolling_median_win_90\nreturns_rolling_q75_win_90\nreturns_rolling_max_win_90\n\n\n\n\n89\n2013-05-13\nAAPL\n0.003908\n-0.001702\n0.022233\n-0.123558\n-0.010533\n-0.001776\n0.012187\n0.041509\n\n\n90\n2013-05-14\nAAPL\n-0.023926\n-0.001827\n0.022327\n-0.123558\n-0.010533\n-0.001776\n0.012187\n0.041509\n\n\n91\n2013-05-15\nAAPL\n-0.033817\n-0.001894\n0.022414\n-0.123558\n-0.010533\n-0.001776\n0.012187\n0.041509\n\n\n92\n2013-05-16\nAAPL\n0.013361\n-0.001680\n0.022467\n-0.123558\n-0.010533\n-0.001360\n0.013120\n0.041509\n\n\n93\n2013-05-17\nAAPL\n-0.003037\n-0.001743\n0.022462\n-0.123558\n-0.010533\n-0.001776\n0.013120\n0.041509\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n0.005159\n0.036070\n-0.056767\n-0.012587\n-0.000457\n0.018480\n0.243696\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n0.005396\n0.035974\n-0.056767\n-0.011117\n0.000177\n0.018480\n0.243696\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n0.005162\n0.036006\n-0.056767\n-0.011117\n-0.000457\n0.018480\n0.243696\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n0.004953\n0.036153\n-0.056767\n-0.012587\n-0.000457\n0.018480\n0.243696\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n0.004724\n0.036303\n-0.056767\n-0.013166\n-0.000457\n0.018480\n0.243696\n\n\n\n\n15654 rows × 10 columns\n\n\n\n\n\nStep 3: Long Format Pt.2\nFinally, we can .melt() each of the rolling statistics for a Long Format Analysis. 
Click to expand the code.\n\n\nCode\nrolling_stats_long_df = rolling_stats_df \\\n .melt(\n id_vars = [\"symbol\", \"date\"],\n var_name = \"statistic_type\"\n )\n\nrolling_stats_long_df\n\n\n\n\n\n\n\n\n\nsymbol\ndate\nstatistic_type\nvalue\n\n\n\n\n0\nAAPL\n2013-05-13\nreturns\n0.003908\n\n\n1\nAAPL\n2013-05-14\nreturns\n-0.023926\n\n\n2\nAAPL\n2013-05-15\nreturns\n-0.033817\n\n\n3\nAAPL\n2013-05-16\nreturns\n0.013361\n\n\n4\nAAPL\n2013-05-17\nreturns\n-0.003037\n\n\n...\n...\n...\n...\n...\n\n\n125227\nNVDA\n2023-09-15\nreturns_rolling_max_win_90\n0.243696\n\n\n125228\nNVDA\n2023-09-18\nreturns_rolling_max_win_90\n0.243696\n\n\n125229\nNVDA\n2023-09-19\nreturns_rolling_max_win_90\n0.243696\n\n\n125230\nNVDA\n2023-09-20\nreturns_rolling_max_win_90\n0.243696\n\n\n125231\nNVDA\n2023-09-21\nreturns_rolling_max_win_90\n0.243696\n\n\n\n\n125232 rows × 4 columns\n\n\n\nWith the data formatted properly we can evaluate the 90-Day Rolling Statistics using .plot_timeseries().\n\nPlotlyPlotnine\n\n\n\n\nCode\nrolling_stats_long_df \\\n .groupby(['symbol', 'statistic_type']) \\\n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n facet_ncol = 6,\n width = 1500,\n height = 1000,\n title = \"90-Day Rolling Statistics\"\n )\n\n\n\n \n\n\n\n\n\n\nCode\nrolling_stats_long_df \\\n .groupby(['symbol', 'statistic_type']) \\\n .plot_timeseries(\n date_column = 'date',\n value_column = 'value',\n facet_ncol = 6,\n facet_dir = 'v',\n width = 1500,\n height = 1000,\n title = \"90-Day Rolling Statistics\",\n engine = \"plotnine\"\n )\n\n\n\n\n\n<Figure Size: (1500 x 1000)>" }, { - "objectID": "reference/augment_rolling.html#parameters", - "href": "reference/augment_rolling.html#parameters", - "title": "augment_rolling", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nInput data to be processed. Can be a Pandas DataFrame or a GroupBy object.\nrequired\n\n\ndate_column\nstr\nName of the datetime column. Data is sorted by this column within each group.\nrequired\n\n\nvalue_column\nUnion[str, list]\nColumn(s) to which the rolling window functions should be applied. Can be a single column name or a list.\nrequired\n\n\nwindow_func\nUnion[str, list, Tuple[str, Callable]]\nThe window_func parameter in the augment_rolling function specifies the function(s) to be applied to the rolling windows of the value column(s). 1. It can be either: - A string representing the name of a standard function (e.g., ‘mean’, ‘sum’). 2. For custom functions: - Provide a list of tuples. Each tuple should contain a custom name for the function and the function itself. - Each custom function should accept a Pandas Series as its input and operate on that series. Example: (“range”, lambda x: x.max() - x.min()) (See more Examples below.) Note: If your function needs to operate on multiple columns (i.e., it requires access to a DataFrame rather than just a Series), consider using the augment_rolling_apply function in this library.\n'mean'\n\n\nwindow\nUnion[int, tuple, list]\nSpecifies the size of the rolling windows. - An integer applies the same window size to all columns in value_column. - A tuple generates windows from the first to the second value (inclusive). - A list of integers designates multiple window sizes for each respective column.\n2\n\n\nmin_periods\nint\nMinimum observations in the window to have a value. Defaults to the window size. 
If set, a value will be produced even if fewer observations are present than the window size.\nNone\n\n\ncenter\nbool\nIf True, the rolling window will be centered on the current value. For even-sized windows, the window will be left-biased. Otherwise, it uses a trailing window.\nFalse\n\n\nthreads\nint\nNumber of threads to use for parallel processing. If threads is set to 1, parallel processing will be disabled. Set to -1 to use all available CPU cores.\n1\n\n\nshow_progress\nbool\nIf True, a progress bar will be displayed during parallel processing.\nTrue\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is False.\nFalse\n\n\nengine\nstr\nSpecifies the backend computation library for augmenting expanding window functions. The options are: - “pandas” (default): Uses the pandas library. - “polars”: Uses the polars library, which may offer performance benefits for larger datasets.\n'pandas'" + "objectID": "tutorials/02_finance.html#about-rolling-correlation", + "href": "tutorials/02_finance.html#about-rolling-correlation", + "title": "Finance Analysis", + "section": "5.1 About: Rolling Correlation", + "text": "5.1 About: Rolling Correlation\nRolling correlation calculates the correlation between two time series over a rolling window of a specified size, moving one period at a time. In stock analysis, this is often used to assess:\n\nDiversification: Helps in identifying how different stocks move in relation to each other, aiding in the creation of a diversified portfolio.\nMarket Dependency: Measures how a particular stock or sector is correlated with a broader market index.\nRisk Management: Helps in identifying changes in correlation structures over time which is crucial for risk assessment and management.\n\nFor example, if the rolling correlation between two stocks starts increasing, it might suggest that they are being influenced by similar factors or market conditions." }, { - "objectID": "reference/augment_rolling.html#returns", - "href": "reference/augment_rolling.html#returns", - "title": "augment_rolling", - "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe augment_rolling function returns a DataFrame with new columns for each applied function, window size, and value column." + "objectID": "tutorials/02_finance.html#application-rolling-correlation", + "href": "tutorials/02_finance.html#application-rolling-correlation", + "title": "Finance Analysis", + "section": "5.2 Application: Rolling Correlation", + "text": "5.2 Application: Rolling Correlation\nLet’s revisit the returns wide and long format. 
We can combine these two using the merge() method.\n\nStep 1: Create the return_combinations_long_df\nPerform data wrangling to get the pairwise combinations in long format:\n\nWe first .merge() to join the long returns with the wide returns by date.\nWe then .melt() to get the wide data into long format.\n\n\n\nCode\nreturn_combinations_long_df = returns_long_df \\\n .merge(returns_wide_df, how='left', on = 'date') \\\n .melt(\n id_vars = ['date', 'symbol', 'returns'],\n var_name = \"comp\",\n value_name = \"returns_comp\"\n )\nreturn_combinations_long_df\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\ncomp\nreturns_comp\n\n\n\n\n0\n2013-01-03\nAAPL\n-0.012622\nAAPL\n-0.012622\n\n\n1\n2013-01-04\nAAPL\n-0.027854\nAAPL\n-0.027854\n\n\n2\n2013-01-07\nAAPL\n-0.005883\nAAPL\n-0.005883\n\n\n3\n2013-01-08\nAAPL\n0.002691\nAAPL\n0.002691\n\n\n4\n2013-01-09\nAAPL\n-0.015629\nAAPL\n-0.015629\n\n\n...\n...\n...\n...\n...\n...\n\n\n97123\n2023-09-15\nNVDA\n-0.036879\nNVDA\n-0.036879\n\n\n97124\n2023-09-18\nNVDA\n0.001503\nNVDA\n0.001503\n\n\n97125\n2023-09-19\nNVDA\n-0.010144\nNVDA\n-0.010144\n\n\n97126\n2023-09-20\nNVDA\n-0.029435\nNVDA\n-0.029435\n\n\n97127\n2023-09-21\nNVDA\n-0.028931\nNVDA\n-0.028931\n\n\n\n\n97128 rows × 5 columns\n\n\n\n\n\nStep 2: Add Rolling Correlations with tk.augment_rolling_apply()\nNext, let’s add rolling correlations.\n\nWe first .groupby() on the combination of our target assets “symbol” and our comparison asset “comp”.\nThen we use a different function, tk.augment_rolling_apply().\n\n\n\n\n\n\n\ntk.augment_rolling() vs tk.augment_rolling_apply()\n\n\n\n\n\n\nFor the vast majority of operations, tk.augment_rolling() will suffice. It’s used on a single column where there is a simple rolling transformation applied to only the value_column.\nFor more complex cases where other columns beyond a value_column are needed (e.g. rolling correlations, rolling regressions), the tk.augment_rolling_apply() comes to the rescue.\ntk.augment_rolling_apply() exposes the group’s columns as a DataFrame to window function, thus allowing for multi-column analysis.\n\n\n\n\n\n\n\n\n\n\ntk.augment_rolling_apply() has no value_column\n\n\n\n\n\nThis is because the rolling apply passes a DataFrame containing all columns to the custom function. The custom function is then responsible for handling the columns internally. 
This is how you can select multiple columns to work with.\n\n\n\n\n\nCode\nreturn_corr_df = return_combinations_long_df \\\n .groupby([\"symbol\", \"comp\"]) \\\n .augment_rolling_apply(\n date_column = \"date\",\n window = 90,\n window_func=[('corr', lambda x: x['returns'].corr(x['returns_comp']))],\n threads = 1, # Change to -1 to use all available cores\n )\n\nreturn_corr_df\n\n\n\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\ncomp\nreturns_comp\nrolling_corr_win_90\n\n\n\n\n0\n2013-01-03\nAAPL\n-0.012622\nAAPL\n-0.012622\nNaN\n\n\n1\n2013-01-04\nAAPL\n-0.027854\nAAPL\n-0.027854\nNaN\n\n\n2\n2013-01-07\nAAPL\n-0.005883\nAAPL\n-0.005883\nNaN\n\n\n3\n2013-01-08\nAAPL\n0.002691\nAAPL\n0.002691\nNaN\n\n\n4\n2013-01-09\nAAPL\n-0.015629\nAAPL\n-0.015629\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n\n\n97123\n2023-09-15\nNVDA\n-0.036879\nNVDA\n-0.036879\n1.0\n\n\n97124\n2023-09-18\nNVDA\n0.001503\nNVDA\n0.001503\n1.0\n\n\n97125\n2023-09-19\nNVDA\n-0.010144\nNVDA\n-0.010144\n1.0\n\n\n97126\n2023-09-20\nNVDA\n-0.029435\nNVDA\n-0.029435\n1.0\n\n\n97127\n2023-09-21\nNVDA\n-0.028931\nNVDA\n-0.028931\n1.0\n\n\n\n\n97128 rows × 6 columns\n\n\n\n\n\nStep 3: Visualize the Rolling Correlation\nWe can use tk.plot_timeseries() to visualize the 90-day rolling correlation. It’s interesting to see that stock combinations such as AAPL | AMZN returns have a high positive correlation of 0.80, but this relationship was much lower 0.25 before 2015.\n\nThe blue smoother can help us detect trends\nThe y_intercept is useful in this case to draw lines at -1, 0, and 1\n\n\nPlotlyPlotnine\n\n\n\n\nCode\nreturn_corr_df \\\n .dropna() \\\n .groupby(['symbol', 'comp']) \\\n .plot_timeseries(\n date_column = \"date\",\n value_column = \"rolling_corr_win_90\",\n facet_ncol = 6,\n y_intercept = [-1,0,1],\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1500,\n height = 1000,\n title = \"90-Day Rolling Correlation\",\n engine = \"plotly\"\n )\n\n\n\n \n\n\n\n\n\n\nCode\nreturn_corr_df \\\n .dropna() \\\n .groupby(['symbol', 'comp']) \\\n .plot_timeseries(\n date_column = \"date\",\n value_column = \"rolling_corr_win_90\",\n facet_ncol = 6,\n y_intercept = [-1,0,1],\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 1500,\n height = 1000,\n title = \"90-Day Rolling Correlation\",\n engine = \"plotnine\"\n )\n\n\n\n\n\n<Figure Size: (1500 x 1000)>\n\n\n\n\n\nFor comparison, we can examine the corr_table_df from the Descriptive Statistics Analysis:\n\nNotice that the values tend not to match the most recent trends\nFor example APPL | AMZN is correlated at 0.49 over the entire time period. 
But more recently this correlation has dropped to 0.17 in the 90-Day Rolling Correlation chart.\n\n\n\nCode\ncorr_table_df\n\n\n\n\n\n\n\n\nsymbol\nAAPL\nAMZN\nGOOG\nMETA\nNFLX\nNVDA\n\n\nsymbol\n\n\n\n\n\n\n\n\n\n\nAAPL\n1.000000\n0.497906\n0.566452\n0.479787\n0.321694\n0.526508\n\n\nAMZN\n0.497906\n1.000000\n0.628103\n0.544481\n0.475078\n0.490234\n\n\nGOOG\n0.566452\n0.628103\n1.000000\n0.595728\n0.428470\n0.531382\n\n\nMETA\n0.479787\n0.544481\n0.595728\n1.000000\n0.407417\n0.450586\n\n\nNFLX\n0.321694\n0.475078\n0.428470\n0.407417\n1.000000\n0.380153\n\n\nNVDA\n0.526508\n0.490234\n0.531382\n0.450586\n0.380153\n1.000000" }, { - "objectID": "reference/augment_rolling.html#notes", - "href": "reference/augment_rolling.html#notes", - "title": "augment_rolling", - "section": "Notes", - "text": "Notes" + "objectID": "tutorials/02_finance.html#about-rolling-regression", + "href": "tutorials/02_finance.html#about-rolling-regression", + "title": "Finance Analysis", + "section": "5.3 About: Rolling Regression", + "text": "5.3 About: Rolling Regression\nRolling regression involves running regression analyses over rolling windows of data points to assess the relationship between a dependent and one or more independent variables. In the context of stock analysis, it can be used to:\n\nBeta Estimation: It can be used to estimate the beta of a stock (a measure of market risk) against a market index over different time periods. A higher beta indicates higher market-related risk.\nMarket Timing: It can be useful in identifying changing relationships between stocks and market indicators, helping traders to adjust their positions accordingly.\nHedge Ratio Determination: It helps in determining the appropriate hedge ratios for pairs trading or other hedging strategies." }, { - "objectID": "reference/augment_rolling.html#performance", - "href": "reference/augment_rolling.html#performance", - "title": "augment_rolling", - "section": "Performance", - "text": "Performance\nThis function uses parallel processing to speed up computation for large datasets with many time series groups:\nParallel processing has overhead and may not be faster on small datasets.\nTo use parallel processing, set threads = -1 to use all available processors." + "objectID": "tutorials/02_finance.html#application-90-day-rolling-regression", + "href": "tutorials/02_finance.html#application-90-day-rolling-regression", + "title": "Finance Analysis", + "section": "5.4 Application: 90-Day Rolling Regression", + "text": "5.4 Application: 90-Day Rolling Regression\n\n\n\n\n\n\nThis Application Requires Scikit Learn\n\n\n\n\n\nWe need to make a regression function that returns the Slope and Intercept. Scikit Learn has an easy-to-use modeling interface. 
You may need to pip install scikit-learn to use this applied tutorial.\n\n\n\n\nStep 1: Get Market Returns\nFor our purposes, we assume the market is the average returns of the 6 technology stocks.\n\nWe calculate an equal-weight portfolio as the “market returns”.\nThen we merge the market returns into the returns long data.\n\n\n\nCode\n# Assume Market Returns = Equal Weight Portfolio\nmarket_returns_df = returns_wide_df \\\n .set_index(\"date\") \\\n .assign(returns_market = lambda df: df.sum(axis = 1) * (1 / df.shape[1])) \\\n .reset_index() \\\n [['date', 'returns_market']]\n\n# Merge with returns long\nreturns_long_market_df = returns_long_df \\\n .merge(market_returns_df, how='left', on='date')\n\nreturns_long_market_df\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\nreturns_market\n\n\n\n\n0\n2013-01-03\nAAPL\n-0.012622\n0.005809\n\n\n1\n2013-01-04\nAAPL\n-0.027854\n0.009471\n\n\n2\n2013-01-07\nAAPL\n-0.005883\n0.008880\n\n\n3\n2013-01-08\nAAPL\n0.002691\n-0.010293\n\n\n4\n2013-01-09\nAAPL\n-0.015629\n0.001366\n\n\n...\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n-0.020231\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n0.003555\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n-0.001466\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n-0.023276\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n-0.020764\n\n\n\n\n16188 rows × 4 columns\n\n\n\n\n\nStep 2: Run a Rolling Regression\nNext, run the following code to perform a rolling regression:\n\nUse a custom regression function that will return the slope and intercept as a pandas series.\nRun the rolling regression with tk.augment_rolling_apply().\n\n\n\nCode\ndef regression(df):\n \n # External functions must \n from sklearn.linear_model import LinearRegression\n\n model = LinearRegression()\n X = df[['returns_market']] # Extract X values (independent variables)\n y = df['returns'] # Extract y values (dependent variable)\n model.fit(X, y)\n ret = pd.Series([model.intercept_, model.coef_[0]], index=['Intercept', 'Slope'])\n \n return ret # Return intercept and slope as a Series\n\nreturn_regression_df = returns_long_market_df \\\n .groupby('symbol') \\\n .augment_rolling_apply(\n date_column = \"date\",\n window = 90,\n window_func = [('regression', regression)],\n threads = 1, # Change to -1 to use all available cores \n ) \\\n .dropna()\n\nreturn_regression_df\n\n\n\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\nreturns_market\nrolling_regression_win_90\n\n\n\n\n89\n2013-05-13\nAAPL\n0.003908\n0.007082\nIntercept -0.001844 Slope 0.061629 dt...\n\n\n90\n2013-05-14\nAAPL\n-0.023926\n0.007583\nIntercept -0.001959 Slope 0.056540 dt...\n\n\n91\n2013-05-15\nAAPL\n-0.033817\n0.005381\nIntercept -0.002036 Slope 0.062330 dt...\n\n\n92\n2013-05-16\nAAPL\n0.013361\n-0.009586\nIntercept -0.001789 Slope 0.052348 dt...\n\n\n93\n2013-05-17\nAAPL\n-0.003037\n0.009005\nIntercept -0.001871 Slope 0.055661 dt...\n\n\n...\n...\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n-0.020231\nIntercept 0.000100 Slope 1.805479 dt...\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n0.003555\nIntercept 0.000207 Slope 1.800813 dt...\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n-0.001466\nIntercept 0.000301 Slope 1.817878 dt...\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n-0.023276\nIntercept 0.000845 Slope 1.825818 dt...\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n-0.020764\nIntercept 0.000901 Slope 1.818710 dt...\n\n\n\n\n15654 rows × 5 columns\n\n\n\n\n\nStep 3: Extract the Slope Coefficient (Beta)\nThis is more of a hack than anything to extract the beta (slope) of the rolling 
regression.\n\n\nCode\nintercept_slope_df = pd.concat(return_regression_df['rolling_regression_win_90'].to_list(), axis=1).T \n\nintercept_slope_df.index = return_regression_df.index\n\nreturn_beta_df = pd.concat([return_regression_df, intercept_slope_df], axis=1)\n\nreturn_beta_df\n\n\n\n\n\n\n\n\n\ndate\nsymbol\nreturns\nreturns_market\nrolling_regression_win_90\nIntercept\nSlope\n\n\n\n\n89\n2013-05-13\nAAPL\n0.003908\n0.007082\nIntercept -0.001844 Slope 0.061629 dt...\n-0.001844\n0.061629\n\n\n90\n2013-05-14\nAAPL\n-0.023926\n0.007583\nIntercept -0.001959 Slope 0.056540 dt...\n-0.001959\n0.056540\n\n\n91\n2013-05-15\nAAPL\n-0.033817\n0.005381\nIntercept -0.002036 Slope 0.062330 dt...\n-0.002036\n0.062330\n\n\n92\n2013-05-16\nAAPL\n0.013361\n-0.009586\nIntercept -0.001789 Slope 0.052348 dt...\n-0.001789\n0.052348\n\n\n93\n2013-05-17\nAAPL\n-0.003037\n0.009005\nIntercept -0.001871 Slope 0.055661 dt...\n-0.001871\n0.055661\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16183\n2023-09-15\nNVDA\n-0.036879\n-0.020231\nIntercept 0.000100 Slope 1.805479 dt...\n0.000100\n1.805479\n\n\n16184\n2023-09-18\nNVDA\n0.001503\n0.003555\nIntercept 0.000207 Slope 1.800813 dt...\n0.000207\n1.800813\n\n\n16185\n2023-09-19\nNVDA\n-0.010144\n-0.001466\nIntercept 0.000301 Slope 1.817878 dt...\n0.000301\n1.817878\n\n\n16186\n2023-09-20\nNVDA\n-0.029435\n-0.023276\nIntercept 0.000845 Slope 1.825818 dt...\n0.000845\n1.825818\n\n\n16187\n2023-09-21\nNVDA\n-0.028931\n-0.020764\nIntercept 0.000901 Slope 1.818710 dt...\n0.000901\n1.818710\n\n\n\n\n15654 rows × 7 columns\n\n\n\n\n\nStep 4: Visualize the Rolling Beta\n\nPlotlyPlotnine\n\n\n\n\nCode\nreturn_beta_df \\\n .groupby('symbol') \\\n .plot_timeseries(\n date_column = \"date\",\n value_column = \"Slope\",\n facet_ncol = 2,\n facet_scales = \"free_x\",\n y_intercept = [0, 3],\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n title = \"90-Day Rolling Regression\",\n engine = \"plotly\",\n )\n\n\n\n \n\n\n\n\n\n\nCode\nreturn_beta_df \\\n .groupby('symbol') \\\n .plot_timeseries(\n date_column = \"date\",\n value_column = \"Slope\",\n facet_ncol = 2,\n facet_scales = \"free_x\",\n y_intercept = [0, 3],\n y_intercept_color = tk.palette_timetk()['steel_blue'],\n width = 800,\n height = 600,\n title = \"90-Day Rolling Regression\",\n engine = \"plotnine\",\n )\n\n\n\n\n\n<Figure Size: (800 x 600)>" }, { - "objectID": "reference/augment_rolling.html#examples", - "href": "reference/augment_rolling.html#examples", - "title": "augment_rolling", - "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\ndf = tk.load_dataset(\"m4_daily\", parse_dates = ['date'])\n\n\n# Example 1 - Using a single window size and a single function name, pandas engine\n# This example demonstrates the use of both string-named functions and lambda \n# functions on a rolling window. We specify a list of window sizes: [2,7]. 
\n# As a result, the output will have computations for both window sizes 2 and 7.\n# Note - It's preferred to use built-in or configurable functions instead of \n# lambda functions for performance reasons.\n\nrolled_df = (\n df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = [2,7], # Specifying multiple window sizes\n window_func = [\n 'mean', # Built-in mean function\n ('std', lambda x: x.std()) # Lambda function to compute standard deviation\n ],\n threads = 1, # Disabling parallel processing\n engine = 'pandas' # Using pandas engine\n )\n)\ndisplay(rolled_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_7\nvalue_rolling_std_win_7\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\nNaN\nNaN\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.80\n1.40\n2074.800000\n1.400000\n\n\n2\nD10\n2014-07-05\n2048.7\n2061.05\n12.35\n2066.100000\n12.356645\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.80\n0.10\n2061.800000\n13.037830\n\n\n4\nD10\n2014-07-07\n2006.4\n2027.65\n21.25\n2050.720000\n25.041038\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9425.35\n6.55\n9382.071429\n74.335988\n\n\n9739\nD500\n2012-09-20\n9365.7\n9392.25\n26.55\n9396.400000\n58.431303\n\n\n9740\nD500\n2012-09-21\n9445.9\n9405.80\n40.10\n9419.114286\n39.184451\n\n\n9741\nD500\n2012-09-22\n9497.9\n9471.90\n26.00\n9438.928571\n38.945336\n\n\n9742\nD500\n2012-09-23\n9545.3\n9521.60\n23.70\n9449.028571\n53.379416\n\n\n\n\n9743 rows × 7 columns\n\n\n\n\n# Example 2 - Multiple groups, pandas engine\n# Example showcasing the use of string function names and lambda functions \n# applied on rolling windows. The `window` tuple (1,3) will generate window \n# sizes of 1, 2, and 3.\n# Note - It's preferred to use built-in or configurable functions instead of \n# lambda functions for performance reasons.\n\nrolled_df = (\n df\n .groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,3), # Specifying a range of window sizes\n window_func = [\n 'mean', # Using built-in mean function\n ('std', lambda x: x.std()) # Lambda function for standard deviation\n ],\n threads = 1, # Disabling parallel processing\n engine = 'pandas' # Using pandas engine\n )\n)\ndisplay(rolled_df) \n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_1\nvalue_rolling_std_win_1\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_3\nvalue_rolling_std_win_3\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2076.2\n0.0\n2076.20\n0.00\n2076.200000\n0.000000\n\n\n1\nD10\n2014-07-04\n2073.4\n2073.4\n0.0\n2074.80\n1.40\n2074.800000\n1.400000\n\n\n2\nD10\n2014-07-05\n2048.7\n2048.7\n0.0\n2061.05\n12.35\n2066.100000\n12.356645\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.9\n0.0\n2048.80\n0.10\n2057.000000\n11.596839\n\n\n4\nD10\n2014-07-07\n2006.4\n2006.4\n0.0\n2027.65\n21.25\n2034.666667\n19.987718\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9418.8\n0.0\n9425.35\n6.55\n9429.466667\n7.905413\n\n\n9739\nD500\n2012-09-20\n9365.7\n9365.7\n0.0\n9392.25\n26.55\n9405.466667\n28.623339\n\n\n9740\nD500\n2012-09-21\n9445.9\n9445.9\n0.0\n9405.80\n40.10\n9410.133333\n33.310092\n\n\n9741\nD500\n2012-09-22\n9497.9\n9497.9\n0.0\n9471.90\n26.00\n9436.500000\n54.378182\n\n\n9742\nD500\n2012-09-23\n9545.3\n9545.3\n0.0\n9521.60\n23.70\n9496.366667\n40.594362\n\n\n\n\n9743 rows × 9 columns\n\n\n\n\n# Example 3 - Multiple groups, polars engine\n\nrolled_df = (\n df\n 
.groupby('id')\n .augment_rolling(\n date_column = 'date', \n value_column = 'value', \n window = (1,3), # Specifying a range of window sizes\n window_func = [\n 'mean', # Using built-in mean function\n 'std', # Using built-in standard deviation function\n ],\n engine = 'polars' # Using polars engine\n )\n)\ndisplay(rolled_df) \n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_rolling_mean_win_1\nvalue_rolling_std_win_1\nvalue_rolling_mean_win_2\nvalue_rolling_std_win_2\nvalue_rolling_mean_win_3\nvalue_rolling_std_win_3\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2076.2\nNaN\n2076.20\nNaN\n2076.200000\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\n2073.4\nNaN\n2074.80\n1.979899\n2074.800000\n1.979899\n\n\n2\nD10\n2014-07-05\n2048.7\n2048.7\nNaN\n2061.05\n17.465537\n2066.100000\n15.133737\n\n\n3\nD10\n2014-07-06\n2048.9\n2048.9\nNaN\n2048.80\n0.141421\n2057.000000\n14.203169\n\n\n4\nD10\n2014-07-07\n2006.4\n2006.4\nNaN\n2027.65\n30.052038\n2034.666667\n24.479856\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n9418.8\nNaN\n9425.35\n9.263099\n9429.466667\n9.682114\n\n\n9739\nD500\n2012-09-20\n9365.7\n9365.7\nNaN\n9392.25\n37.547370\n9405.466667\n35.056288\n\n\n9740\nD500\n2012-09-21\n9445.9\n9445.9\nNaN\n9405.80\n56.709964\n9410.133333\n40.796364\n\n\n9741\nD500\n2012-09-22\n9497.9\n9497.9\nNaN\n9471.90\n36.769553\n9436.500000\n66.599399\n\n\n9742\nD500\n2012-09-23\n9545.3\n9545.3\nNaN\n9521.60\n33.516861\n9496.366667\n49.717737\n\n\n\n\n9743 rows × 9 columns" + "objectID": "changelog-news.html", + "href": "changelog-news.html", + "title": "Changelog for Pytimetk", + "section": "", + "text": "Integration with timebasedcv #291. New Classes:\n\nTimeSeriesCV(): An enhanced version of TimeBasedSplit() that defaults to mode = \"backwards\", allows for maximum splits using split_limit, and adds enhanced diagnostics like glimpse() and plot()\n\n\n\n\nA plotly dropdown automates the group-wise analysis. Instead of facets, which are only powerful for <=9 plots at a time, a dropdown can easily visualize more plots.\n\nplot_timeseries(): Gets new parameters plotly_dropdown, plotly_dropdown_x, plotly_dropdown_y #301\nplot_anomalies(): Gets new parameters plotly_dropdown, plotly_dropdown_x, plotly_dropdown_y #301\n\n\n\n\n\nplot_timeseries(value_column = list(), color_column=list()): Now supports multiple columns in wide format for grouped time series data visualization. #136" }, { - "objectID": "reference/augment_rolling_apply.html", - "href": "reference/augment_rolling_apply.html", - "title": "augment_rolling_apply", + "objectID": "changelog-news.html#new-features", + "href": "changelog-news.html#new-features", + "title": "Changelog for Pytimetk", "section": "", - "text": "augment_rolling_apply(data, date_column, window_func, window=2, min_periods=None, center=False, threads=1, show_progress=True)\nApply one or more DataFrame-based rolling functions and window sizes to one or more columns of a DataFrame." + "text": "Integration with timebasedcv #291. New Classes:\n\nTimeSeriesCV(): An enhanced version of TimeBasedSplit() that defaults to mode = \"backwards\", allows for maximum splits using split_limit, and adds enhanced diagnostics like glimpse() and plot()\n\n\n\n\nA plotly dropdown automates the group-wise analysis. 
Instead of facets, which are only powerful for <=9 plots at a time, a dropdown can easily visualize more plots.\n\nplot_timeseries(): Gets new parameters plotly_dropdown, plotly_dropdown_x, plotly_dropdown_y #301\nplot_anomalies(): Gets new parameters plotly_dropdown, plotly_dropdown_x, plotly_dropdown_y #301\n\n\n\n\n\nplot_timeseries(value_column = list(), color_column=list()): Now supports multiple columns in wide format for grouped time series data visualization. #136" }, { - "objectID": "reference/augment_rolling_apply.html#parameters", - "href": "reference/augment_rolling_apply.html#parameters", - "title": "augment_rolling_apply", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nInput data to be processed. Can be a Pandas DataFrame or a GroupBy object.\nrequired\n\n\ndate_column\nstr\nName of the datetime column. Data is sorted by this column within each group.\nrequired\n\n\nwindow_func\nUnion[Tuple[str, Callable], List[Tuple[str, Callable]]]\nThe window_func parameter in the augment_rolling_apply function specifies the function(s) that operate on a rolling window with the consideration of multiple columns. The specification can be: - A tuple where the first element is a string representing the function’s name and the second element is the callable function itself. - A list of such tuples for multiple functions. (See more Examples below.) Note: For functions targeting only a single value column without the need for contextual data from other columns, consider using the augment_rolling function in this library.\nrequired\n\n\nwindow\nUnion[int, tuple, list]\nSpecifies the size of the rolling windows. - An integer applies the same window size to all columns in value_column. - A tuple generates windows from the first to the second value (inclusive). - A list of integers designates multiple window sizes for each respective column.\n2\n\n\nmin_periods\nint\nMinimum observations in the window to have a value. Defaults to the window size. If set, a value will be produced even if fewer observations are present than the window size.\nNone\n\n\ncenter\nbool\nIf True, the rolling window will be centered on the current value. For even-sized windows, the window will be left-biased. Otherwise, it uses a trailing window.\nFalse\n\n\nthreads\nint\nNumber of threads to use for parallel processing. If threads is set to 1, parallel processing will be disabled. Set to -1 to use all available CPU cores.\n1\n\n\nshow_progress\nbool\nIf True, a progress bar will be displayed during parallel processing.\nTrue" + "objectID": "changelog-news.html#fixes", + "href": "changelog-news.html#fixes", + "title": "Changelog for Pytimetk", + "section": "Fixes:", + "text": "Fixes:\n\ntk.summarize_by_time(): AttributeError: ‘DataFrame’ object has no attribute ‘groupby’ #298" }, { - "objectID": "reference/augment_rolling_apply.html#returns", - "href": "reference/augment_rolling_apply.html#returns", - "title": "augment_rolling_apply", - "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe augment_rolling function returns a DataFrame with new columns for each applied function, window size, and value column." 
+ "objectID": "changelog-news.html#pandas-and-polars-compatibility", + "href": "changelog-news.html#pandas-and-polars-compatibility", + "title": "Changelog for Pytimetk", + "section": "Pandas and Polars Compatibility:", + "text": "Pandas and Polars Compatibility:\nUpgrading to:\n\npandas >= 2.0.0\npolars >= 1.2.0\n\nUse pytimetk <=0.4.0 to support:\n\npandas <2.0.0\npolars <1.0.0" }, { - "objectID": "reference/augment_rolling_apply.html#notes", - "href": "reference/augment_rolling_apply.html#notes", - "title": "augment_rolling_apply", - "section": "Notes", - "text": "Notes" + "objectID": "changelog-news.html#improvements", + "href": "changelog-news.html#improvements", + "title": "Changelog for Pytimetk", + "section": "Improvements:", + "text": "Improvements:\n\nImplement sort_dataframe(): This function is used internally to make sure Polars and Pandas engines perform grouped operations consistently and correctly. #286 #290\n.augment_lags() and .augment_leads(): value_column now accepts any dtype. #295" }, { - "objectID": "reference/augment_rolling_apply.html#performance", - "href": "reference/augment_rolling_apply.html#performance", - "title": "augment_rolling_apply", - "section": "Performance", - "text": "Performance\nThis function uses parallel processing to speed up computation for large datasets with many time series groups:\nParallel processing has overhead and may not be faster on small datasets.\nTo use parallel processing, set threads = -1 to use all available processors." + "objectID": "changelog-news.html#feature-engineering-module", + "href": "changelog-news.html#feature-engineering-module", + "title": "Changelog for Pytimetk", + "section": "Feature Engineering Module:", + "text": "Feature Engineering Module:\n\naugment_pct_change(): pandas and polars engines" }, { - "objectID": "reference/augment_rolling_apply.html#examples", - "href": "reference/augment_rolling_apply.html#examples", - "title": "augment_rolling_apply", - "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Example 1 - showcasing the rolling correlation between two columns \n# (`value1` and `value2`).\n# The correlation requires both columns as input.\n\n# Sample DataFrame with id, date, value1, and value2 columns.\ndf = pd.DataFrame({\n 'id': [1, 1, 1, 2, 2, 2],\n 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),\n 'value1': [10, 20, 29, 42, 53, 59],\n 'value2': [2, 16, 20, 40, 41, 50],\n})\n\n# Compute the rolling correlation for each group of 'id'\n# Using a rolling window of size 3 and a lambda function to calculate the \n# correlation.\n\nrolled_df = (\n df.groupby('id')\n .augment_rolling_apply(\n date_column='date',\n window=3,\n window_func=[('corr', lambda x: x['value1'].corr(x['value2']))], # Lambda function for correlation\n center = False, # Not centering the rolling window\n threads = 1 # Increase threads for parallel processing (use -1 for all cores)\n )\n)\ndisplay(rolled_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue1\nvalue2\nrolling_corr_win_3\n\n\n\n\n0\n1\n2023-01-01\n10\n2\nNaN\n\n\n1\n1\n2023-01-02\n20\n16\nNaN\n\n\n2\n1\n2023-01-03\n29\n20\n0.961054\n\n\n3\n2\n2023-01-04\n42\n40\nNaN\n\n\n4\n2\n2023-01-05\n53\n41\nNaN\n\n\n5\n2\n2023-01-06\n59\n50\n0.824831\n\n\n\n\n\n\n\n\n# Example 2 - Rolling Regression Example: Using `value1` as the dependent \n# variable and `value2` and `value3` as the independent variables. 
This \n# example demonstrates how to perform a rolling regression using two \n# independent variables.\n\n# Sample DataFrame with `id`, `date`, `value1`, `value2`, and `value3` columns.\ndf = pd.DataFrame({\n 'id': [1, 1, 1, 2, 2, 2],\n 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),\n 'value1': [10, 20, 29, 42, 53, 59],\n 'value2': [5, 16, 24, 35, 45, 58],\n 'value3': [2, 3, 6, 9, 10, 13]\n})\n\n# Define Regression Function to be applied on the rolling window.\ndef regression(df):\n\n # Required module (scikit-learn) for regression.\n # This import statement is required inside the function to avoid errors.\n from sklearn.linear_model import LinearRegression\n\n model = LinearRegression()\n X = df[['value2', 'value3']] # Independent variables\n y = df['value1'] # Dependent variable\n model.fit(X, y)\n ret = pd.Series([model.intercept_, model.coef_[0]], index=['Intercept', 'Slope'])\n \n return ret # Return intercept and slope as a Series\n \n# Compute the rolling regression for each group of `id`\n# Using a rolling window of size 3 and the regression function.\nrolled_df = (\n df.groupby('id')\n .augment_rolling_apply(\n date_column='date',\n window=3,\n window_func=[('regression', regression)]\n )\n .dropna()\n)\n\n# Format the results to have each regression output (slope and intercept) in \n# separate columns.\n\nregression_wide_df = pd.concat(rolled_df['rolling_regression_win_3'].to_list(), axis=1).T\n\nregression_wide_df = pd.concat([rolled_df.reset_index(drop = True), regression_wide_df], axis=1)\n\ndisplay(regression_wide_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue1\nvalue2\nvalue3\nrolling_regression_win_3\nIntercept\nSlope\n\n\n\n\n0\n1\n2023-01-03\n29\n24\n6\nIntercept 4.28 Slope 0.84 dtype: flo...\n4.280000\n0.840000\n\n\n1\n2\n2023-01-06\n59\n58\n13\nIntercept 30.352941 Slope 1.588235 ...\n30.352941\n1.588235" + "objectID": "changelog-news.html#finance-module-updates", + "href": "changelog-news.html#finance-module-updates", + "title": "Changelog for Pytimetk", + "section": "Finance Module Updates:", + "text": "Finance Module Updates:\n\naugment_macd(): MACD, pandas and polars engines\naugment_bbands(): Bollinger Bands, pandas and polars engines\naugment_atr(): Average True Range, pandas and polars engines\naugment_ppo(): Percentage Price Oscillator, pandas and polars engines\naugment_rsi(): Relative Strength Index, pandas and polars engines\naugment_qsmomentum(): Quant Science Momentum Indicator, pandas and polars engines\naugment_roc(): Rate of Change (ROC), pandas and polars engines" }, { - "objectID": "reference/ceil_date.html", - "href": "reference/ceil_date.html", - "title": "ceil_date", + "objectID": "changelog-news.html#polars-upgrades", + "href": "changelog-news.html#polars-upgrades", + "title": "Changelog for Pytimetk", + "section": "Polars Upgrades", + "text": "Polars Upgrades\n\nMigrate to polars 0.20.7" + }, + { + "objectID": "reference/plot_timeseries.html", + "href": "reference/plot_timeseries.html", + "title": "plot_timeseries", "section": "", - "text": "ceil_date(idx, unit='D')\nRobust date ceiling.\nThe ceil_date function takes a pandas Series of dates and returns a new Series with the dates rounded up to the next specified unit. It’s more robust than the pandas ceil function, which does weird things with irregular frequencies like Month which are actually regular." 
+ "text": "plot_timeseries(data, date_column, value_column, color_column=None, color_palette=None, facet_ncol=1, facet_nrow=None, facet_scales='free_y', facet_dir='h', line_color='#2c3e50', line_size=None, line_type='solid', line_alpha=1.0, y_intercept=None, y_intercept_color='#2c3e50', x_intercept=None, x_intercept_color='#2c3e50', smooth=True, smooth_color='#3366FF', smooth_frac=0.2, smooth_size=1.0, smooth_alpha=1.0, legend_show=True, title='Time Series Plot', x_lab='', y_lab='', color_lab='Legend', x_axis_date_labels='%b %Y', base_size=11, width=None, height=None, engine='plotly', plotly_dropdown=False, plotly_dropdown_x=0, plotly_dropdown_y=1)\nCreates time series plots using different plotting engines such as Plotnine, Matplotlib, and Plotly." }, { - "objectID": "reference/ceil_date.html#parameters", - "href": "reference/ceil_date.html#parameters", - "title": "ceil_date", + "objectID": "reference/plot_timeseries.html#parameters", + "href": "reference/plot_timeseries.html#parameters", + "title": "plot_timeseries", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\npd.Series or pd.DatetimeIndex\nThe idx parameter is a pandas Series or pandas DatetimeIndex object that contains datetime values. It represents the dates that you want to round down.\nrequired\n\n\nunit\nstr\nThe unit parameter in the ceil_date function is a string that specifies the time unit to which the dates in the idx series should be rounded down. It has a default value of “D”, which stands for day. Other possible values for the unit parameter could be\n'D'" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nThe input data for the plot. It can be either a Pandas DataFrame or a Pandas DataFrameGroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the DataFrame that contains the dates for the time series data.\nrequired\n\n\nvalue_column\nstr or list\nThe value_column parameter is used to specify the name of the column in the DataFrame that contains the values for the time series data. This column will be plotted on the y-axis of the time series plot. LONG-FORMAT PLOTTING: If the value_column parameter is a string, it will be treated as a single column name. To plot multiple time series, group the DataFrame first using pd.DataFrame.groupby(). WIDE-FORMAT PLOTTING: If the value_column parameter is a list, it will plotted as multiple time series (wide-format).\nrequired\n\n\ncolor_column\nstr\nThe color_column parameter is an optional parameter that specifies the column in the DataFrame that will be used to assign colors to the different time series. If this parameter is not provided, all time series will have the same color. LONG-FORMAT PLOTTING: The color_column parameter is a single column name. WIDE-FORMAT PLOTTING: The color_column parameter must be the same list as the value_column parameter to color the different time series when performing wide-format plotting.\nNone\n\n\ncolor_palette\nlist\nThe color_palette parameter is used to specify the colors to be used for the different time series. It accepts a list of color codes or names. If the color_column parameter is not provided, the tk.palette_timetk() color palette will be used.\nNone\n\n\nfacet_ncol\nint\nThe facet_ncol parameter determines the number of columns in the facet grid. 
It specifies how many subplots will be arranged horizontally in the plot.\n1\n\n\nfacet_nrow\nint\nThe facet_nrow parameter determines the number of rows in the facet grid. It specifies how many subplots will be arranged vertically in the grid.\nNone\n\n\nfacet_scales\nstr\nThe facet_scales parameter determines the scaling of the y-axis in the facetted plots. It can take the following values: - “free_y”: The y-axis scale will be free for each facet, but the x-axis scale will be fixed for all facets. This is the default value. - “free_x”: The y-axis scale will be free for each facet, but the x-axis scale will be fixed for all facets. - “free”: The y-axis scale will be free for each facet (subplot). This is the default value.\n'free_y'\n\n\nfacet_dir\nstr\nThe facet_dir parameter determines the direction in which the facets (subplots) are arranged. It can take two possible values: - “h”: The facets will be arranged horizontally (in rows). This is the default value. - “v”: The facets will be arranged vertically (in columns).\n'h'\n\n\nline_color\nstr\nThe line_color parameter is used to specify the color of the lines in the time series plot. It accepts a string value representing a color code or name. The default value is “#2c3e50”, which corresponds to a dark blue color.\n'#2c3e50'\n\n\nline_size\nfloat\nThe line_size parameter is used to specify the size of the lines in the time series plot. It determines the thickness of the lines.\nNone\n\n\nline_type\nstr\nThe line_type parameter is used to specify the type of line to be used in the time series plot.\n'solid'\n\n\nline_alpha\nfloat\nThe line_alpha parameter controls the transparency of the lines in the time series plot. It accepts a value between 0 and 1, where 0 means completely transparent (invisible) and 1 means completely opaque (solid).\n1.0\n\n\ny_intercept\nfloat\nThe y_intercept parameter is used to add a horizontal line to the plot at a specific y-value. It can be set to a numeric value to specify the y-value of the intercept. If set to None (default), no y-intercept line will be added to the plot\nNone\n\n\ny_intercept_color\nstr\nThe y_intercept_color parameter is used to specify the color of the y-intercept line in the plot. It accepts a string value representing a color code or name. The default value is “#2c3e50”, which corresponds to a dark blue color. You can change this value.\n'#2c3e50'\n\n\nx_intercept\nstr\nThe x_intercept parameter is used to add a vertical line at a specific x-axis value on the plot. It is used to highlight a specific point or event in the time series data. - By default, it is set to None, which means no vertical line will be added. - You can use a date string to specify the x-axis value of the intercept. For example, “2020-01-01” would add a vertical line at the beginning of the year 2020.\nNone\n\n\nx_intercept_color\nstr\nThe x_intercept_color parameter is used to specify the color of the vertical line that represents the x-intercept in the plot. By default, it is set to “#2c3e50”, which is a dark blue color. You can change this value to any valid color code.\n'#2c3e50'\n\n\nsmooth\nbool\nThe smooth parameter is a boolean indicating whether or not to apply smoothing to the time eries data. If set to True, the time series will be smoothed using the lowess algorithm. The default value is True.\nTrue\n\n\nsmooth_color\nstr\nThe smooth_color parameter is used to specify the color of the smoothed line in the time series plot. It accepts a string value representing a color code or name. 
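The wide-format note in the `value_column` / `color_column` descriptions above is easy to miss, and the examples further down only show long-format plotting. A small hedged sketch, assuming a hypothetical wide DataFrame with columns `value_a` and `value_b`:

```python
import numpy as np
import pandas as pd
import pytimetk as tk

# Hypothetical wide-format data: one date column, two value columns
dates = pd.date_range('2023-01-01', periods=100, freq='D')
df_wide = pd.DataFrame({
    'date': dates,
    'value_a': np.random.randn(100).cumsum(),
    'value_b': np.random.randn(100).cumsum(),
})

# Wide-format plotting: pass the same list to value_column and color_column
fig = df_wide.plot_timeseries(
    date_column='date',
    value_column=['value_a', 'value_b'],
    color_column=['value_a', 'value_b'],
    smooth=False,
    engine='plotly',
)
```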
The default value is #3366FF, which corresponds to a shade of blue. You can change this value to any valid color code.\n'#3366FF'\n\n\nsmooth_frac\nfloat\nThe smooth_frac parameter is used to control the fraction of data points used for smoothing the time series. It determines the degree of smoothing applied to the data. A smaller value of smooth_frac will result in more smoothing, while a larger value will result in less smoothing. The default value is 0.2.\n0.2\n\n\nsmooth_size\nfloat\nThe smooth_size parameter is used to specify the size of the line used to plot the smoothed values in the time series plot. It is a numeric value that controls the thickness of the line. A larger value will result in a thicker line, while a smaller value will result in a thinner line\n1.0\n\n\nsmooth_alpha\nfloat\nThe smooth_alpha parameter controls the transparency of the smoothed line in the plot. It accepts a value between 0 and 1, where 0 means completely transparent and 1 means completely opaque.\n1.0\n\n\nlegend_show\nbool\nThe legend_show parameter is a boolean indicating whether or not to show the legend in the plot. If set to True, the legend will be displayed. The default value is True.\nTrue\n\n\ntitle\nstr\nThe title of the plot.\n'Time Series Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis in the plot. It is a string that represents the label text.\n''\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis in the plot. It is a string that represents the label for the y-axis.\n''\n\n\ncolor_lab\nstr\nThe color_lab parameter is used to specify the label for the legend or color scale in the plot. It is used to provide a description of the colors used in the plot, typically when a color column is specified.\n'Legend'\n\n\nx_axis_date_labels\nstr\nThe x_axis_date_labels parameter is used to specify the format of the date labels on the x-axis of the plot. It accepts a string representing the format of the date labels. For example, “%b %Y” would display the month abbreviation and year (e.g., Jan 2020).\n'%b %Y'\n\n\nbase_size\nfloat\nThe base_size parameter is used to set the base font size for the plot. It determines the size of the text elements such as axis labels, titles, and legends.\n11\n\n\nwidth\nint\nThe width parameter is used to specify the width of the plot. It determines the horizontal size of the plot in pixels.\nNone\n\n\nheight\nint\nThe height parameter is used to specify the height of the plot in pixels. It determines the vertical size of the plot when it is rendered.\nNone\n\n\nengine\nstr\nThe engine parameter specifies the plotting library to use for creating the time series plot. It can take one of the following values: - “plotly” (interactive): Use the plotly library to create the plot. This is the default value. - “plotnine” (static): Use the plotnine library to create the plot. This is the default value. - “matplotlib” (static): Use the matplotlib library to create the plot.\n'plotly'\n\n\nplotly_dropdown\nbool\nFor analyzing many plots. When set to True and groups are provided, the function switches from faceting to create a dropdown menu to switch between different groups. Default: False.\nFalse\n\n\nplotly_dropdown_x\nfloat\nThe x-axis location of the dropdown. Default: 0.\n0\n\n\nplotly_dropdown_y\nfloat\nThe y-axis location of the dropdown. 
Default: 1.\n1" }, { - "objectID": "reference/ceil_date.html#returns", - "href": "reference/ceil_date.html#returns", - "title": "ceil_date", + "objectID": "reference/plot_timeseries.html#returns", + "href": "reference/plot_timeseries.html#returns", + "title": "plot_timeseries", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.Series\nThe ceil_date function returns a pandas Series object containing datetime64[ns] values." + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function plot_timeseries returns a plot object, depending on the\nspecified engine parameter. - If engine is set to ‘plotnine’ or ‘matplotlib’, the function returns a plot object that can be further customized or displayed. - If engine is set to ‘plotly’, the function returns a plotly figure object." }, { - "objectID": "reference/ceil_date.html#examples", - "href": "reference/ceil_date.html#examples", - "title": "ceil_date", + "objectID": "reference/plot_timeseries.html#examples", + "href": "reference/plot_timeseries.html#examples", + "title": "plot_timeseries", "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n\ndates = pd.date_range(\"2020-01-01\", \"2020-01-10\", freq=\"1H\")\ndates\n\nDatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 01:00:00',\n '2020-01-01 02:00:00', '2020-01-01 03:00:00',\n '2020-01-01 04:00:00', '2020-01-01 05:00:00',\n '2020-01-01 06:00:00', '2020-01-01 07:00:00',\n '2020-01-01 08:00:00', '2020-01-01 09:00:00',\n ...\n '2020-01-09 15:00:00', '2020-01-09 16:00:00',\n '2020-01-09 17:00:00', '2020-01-09 18:00:00',\n '2020-01-09 19:00:00', '2020-01-09 20:00:00',\n '2020-01-09 21:00:00', '2020-01-09 22:00:00',\n '2020-01-09 23:00:00', '2020-01-10 00:00:00'],\n dtype='datetime64[ns]', length=217, freq='H')\n\n\n\n# Pandas ceil fails on month\n# dates.ceil(\"M\") # ValueError: <MonthEnd> is a non-fixed frequency\n\n# Works on Month\ntk.ceil_date(dates, unit=\"M\")\n\n0 2020-02-01\n1 2020-02-01\n2 2020-02-01\n3 2020-02-01\n4 2020-02-01\n ... 
\n212 2020-02-01\n213 2020-02-01\n214 2020-02-01\n215 2020-02-01\n216 2020-02-01\nName: idx, Length: 217, dtype: datetime64[ns]" + "text": "Examples\n\nimport pytimetk as tk\n\ndf = tk.load_dataset('m4_monthly', parse_dates = ['date'])\n\n# Plotly Object: Single Time Series\nfig = (\n df\n .query('id == \"M750\"')\n .plot_timeseries(\n 'date', 'value', \n facet_ncol = 1,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly',\n )\n)\nfig\n\n\n \n\n\n\n# Plotly Object: Grouped Time Series (Facets)\nfig = (\n df\n .groupby('id')\n .plot_timeseries(\n 'date', 'value', \n facet_ncol = 2,\n facet_scales = \"free_y\",\n smooth_frac = 0.2,\n smooth_size = 2.0,\n y_intercept = None,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly',\n width = 600,\n height = 500,\n )\n)\nfig\n\n\n \n\n\n\n# Plotly Object: Grouped Time Series (Plotly Dropdown)\nfig = (\n df\n .groupby('id')\n .plot_timeseries(\n 'date', 'value', \n facet_scales = \"free_y\",\n smooth_frac = 0.2,\n smooth_size = 2.0,\n y_intercept = None,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly',\n width = 600,\n height = 500,\n plotly_dropdown = True, # Plotly Dropdown\n )\n)\nfig\n\n\n \n\n\n\n# Plotly Object: Color Column\nfig = (\n df\n .plot_timeseries(\n 'date', 'value', \n color_column = 'id',\n smooth = False,\n y_intercept = 0,\n x_axis_date_labels = \"%Y\",\n engine = 'plotly',\n )\n)\nfig\n\n\n \n\n\n\n# Plotnine Object: Single Time Series\nfig = (\n df\n .query('id == \"M1\"')\n .plot_timeseries(\n 'date', 'value', \n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n)\nfig\n\n\n\n\n<Figure Size: (700 x 500)>\n\n\n\n# Plotnine Object: Grouped Time Series\nfig = (\n df\n .groupby('id')\n .plot_timeseries(\n 'date', 'value',\n facet_ncol = 2,\n facet_scales = \"free\",\n line_size = 0.35,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine'\n )\n)\nfig\n\n\n\n\n<Figure Size: (700 x 500)>\n\n\n\n# Plotnine Object: Color Column\nfig = (\n df\n .plot_timeseries(\n 'date', 'value', \n color_column = 'id',\n smooth = False,\n y_intercept = 0,\n x_axis_date_labels = \"%Y\",\n engine = 'plotnine',\n )\n)\nfig\n\n\n\n\n<Figure Size: (700 x 500)>\n\n\n\n# Matplotlib object (same as plotnine, but converted to matplotlib object)\nfig = (\n df\n .groupby('id')\n .plot_timeseries(\n 'date', 'value', \n color_column = 'id',\n facet_ncol = 2,\n x_axis_date_labels = \"%Y\",\n engine = 'matplotlib',\n )\n)\nfig" }, { - "objectID": "reference/augment_expanding.html", - "href": "reference/augment_expanding.html", - "title": "augment_expanding", + "objectID": "reference/augment_diffs.html", + "href": "reference/augment_diffs.html", + "title": "augment_diffs", "section": "", - "text": "augment_expanding(data, date_column, value_column, window_func='mean', min_periods=None, engine='pandas', threads=1, show_progress=True, reduce_memory=False, **kwargs)\nApply one or more Series-based expanding functions to one or more columns of a DataFrame." + "text": "augment_diffs(data, date_column, value_column, periods=1, normalize=False, reduce_memory=False, engine='pandas')\nAdds differences and percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\nThe augment_diffs function takes a Pandas DataFrame or GroupBy object, a date column, a value column or list of value columns, and a period or list of periods, and adds differenced versions of the value columns to the DataFrame." 
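A common follow-up to the `ceil_date` example above is to use the ceiled dates as grouping keys, for instance rolling daily observations up to month-end buckets. A sketch using only functions shown in this reference (the `month_end` / `value_sum` names are illustrative):

```python
import pytimetk as tk

df = tk.load_dataset('m4_daily', parse_dates=['date'])

# Bucket each row to its month-end with ceil_date, then aggregate per series
monthly = (
    df
    .assign(month_end=lambda d: tk.ceil_date(d['date'], unit='M'))
    .groupby(['id', 'month_end'])
    .agg(value_sum=('value', 'sum'))
    .reset_index()
)
```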
}, { - "objectID": "reference/augment_expanding.html#parameters", - "href": "reference/augment_expanding.html#parameters", - "title": "augment_expanding", + "objectID": "reference/augment_diffs.html#parameters", + "href": "reference/augment_diffs.html#parameters", + "title": "augment_diffs", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nInput data to be processed. Can be a Pandas DataFrame or a GroupBy object.\nrequired\n\n\ndate_column\nstr\nName of the datetime column. Data is sorted by this column within each group.\nrequired\n\n\nvalue_column\nUnion[str, list]\nColumn(s) to which the expanding window functions should be applied. Can be a single column name or a list.\nrequired\n\n\nwindow_func\nUnion[str, list, Tuple[str, Callable]]\nThe window_func parameter in the augment_expanding function specifies the function(s) to be applied to the expanding windows of the value column(s). 1. It can be either: - A string representing the name of a standard function (e.g., ‘mean’, ‘sum’). 2. For custom functions: - Provide a list of tuples. Each tuple should contain a custom name for the function and the function itself. - Each custom function should accept a Pandas Series as its input and operate on that series. Example: (“range”, lambda x: x.max() - x.min()) (See more Examples below.) Note: If your function needs to operate on multiple columns (i.e., it requires access to a DataFrame rather than just a Series), consider using the augment_expanding_apply function in this library.\n'mean'\n\n\nmin_periods\nint\nMinimum observations in the window to have a value. Defaults to the window size. If set, a value will be produced even if fewer observations are present than the window size.\nNone\n\n\nengine\nstr\nSpecifies the backend computation library for augmenting expanding window functions. The options are: - “pandas” (default): Uses the pandas library. - “polars”: Uses the polars library, which may offer performance benefits for larger datasets.\n'pandas'\n\n\nthreads\nint\nNumber of threads to use for parallel processing. If threads is set to 1, parallel processing will be disabled. Set to -1 to use all available CPU cores.\n1\n\n\nshow_progress\nbool\nIf True, a progress bar will be displayed during parallel processing.\nTrue\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is True.\nFalse\n\n\n**kwargs\nadditional keyword arguments\nAdditional arguments passed to the pandas.Series.expanding method when using the Pandas engine.\n{}" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nThe data parameter is the input DataFrame or DataFrameGroupBy object that you want to add differenced columns to.\nrequired\n\n\ndate_column\nstr\nThe date_column parameter is a string that specifies the name of the column in the DataFrame that contains the dates. This column will be used to sort the data before adding the differenced values.\nrequired\n\n\nvalue_column\nstr or list\nThe value_column parameter is the column(s) in the DataFrame that you want to add differences values for. 
It can be either a single column name (string) or a list of column names.\nrequired\n\n\nperiods\nint or tuple or list\nThe periods parameter is an integer, tuple, or list that specifies the periods to shift values when differencing. - If it is an integer, the function will add that number of differences values for each column specified in the value_column parameter. - If it is a tuple, it will generate differences from the first to the second value (inclusive). - If it is a list, it will generate differences based on the values in the list.\n1\n\n\nnormalize\nbool\nThe normalize parameter is used to specify whether to normalize the differenced values as a percentage difference. Default is False.\nFalse\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is True.\nFalse\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for augmenting differences. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for augmenting diffs. This can be faster than using “pandas” for large datasets.\n'pandas'" }, { - "objectID": "reference/augment_expanding.html#returns", - "href": "reference/augment_expanding.html#returns", - "title": "augment_expanding", + "objectID": "reference/augment_diffs.html#returns", + "href": "reference/augment_diffs.html#returns", + "title": "augment_diffs", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe augment_expanding function returns a DataFrame with new columns for each applied function, window size, and value column." - }, - { - "objectID": "reference/augment_expanding.html#notes", - "href": "reference/augment_expanding.html#notes", - "title": "augment_expanding", - "section": "Notes", - "text": "Notes" - }, - { - "objectID": "reference/augment_expanding.html#performance", - "href": "reference/augment_expanding.html#performance", - "title": "augment_expanding", - "section": "Performance", - "text": "Performance\n\nPolars Engine (3X faster than Pandas)\nIn most cases, the polars engine will be faster than the pandas engine. Speed tests indicate 3X or more.\n\n\nParallel Processing (Pandas Engine Only)\nThis function uses parallel processing to speed up computation for large datasets with many time series groups:\nParallel processing has overhead and may not be faster on small datasets.\nTo use parallel processing, set threads = -1 to use all available processors." + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nA Pandas DataFrame with differenced columns added to it." 
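None of the `augment_diffs` examples below use the `normalize` flag described above, so here is a brief sketch of normalized (percentage) differences, reusing the same m4_daily dataset the examples load:

```python
import pytimetk as tk

df = tk.load_dataset('m4_daily', parse_dates=['date'])

# Normalized differences at lags 1 and 7, computed per series
pct_diff_df = (
    df
    .groupby('id')
    .augment_diffs(
        date_column='date',
        value_column='value',
        periods=[1, 7],
        normalize=True,   # report differences as percentage change
    )
)
```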
}, { - "objectID": "reference/augment_expanding.html#examples", - "href": "reference/augment_expanding.html#examples", - "title": "augment_expanding", + "objectID": "reference/augment_diffs.html#examples", + "href": "reference/augment_diffs.html#examples", + "title": "augment_diffs", "section": "Examples", - "text": "Examples\n\n# Example 1 - Pandas Backend for Expanding Window Functions\n# This example demonstrates the use of string-named functions \n# on an expanding window using the Pandas backend for computations.\n \nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\ndf = tk.load_dataset(\"m4_daily\", parse_dates = ['date'])\n\nexpanded_df = (\n df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in standard deviation function,\n ('quantile_75', lambda x: pd.Series(x).quantile(0.75)), # Custom quantile function\n \n ],\n min_periods = 1,\n engine = 'pandas', # Utilize pandas for the underlying computations\n threads = 1, # Disable parallel processing\n show_progress = True, # Display a progress bar\n )\n)\ndisplay(expanded_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_expanding_mean\nvalue_expanding_std\nvalue_expanding_quantile_75\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2076.200000\nNaN\n2076.200\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.800000\n1.979899\n2075.500\n\n\n2\nD10\n2014-07-05\n2048.7\n2066.100000\n15.133737\n2074.800\n\n\n3\nD10\n2014-07-06\n2048.9\n2061.800000\n15.054789\n2074.100\n\n\n4\nD10\n2014-07-07\n2006.4\n2050.720000\n27.996732\n2073.400\n\n\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n8286.606679\n2456.960489\n9903.550\n\n\n9739\nD500\n2012-09-20\n9365.7\n8286.864035\n2456.723940\n9902.600\n\n\n9740\nD500\n2012-09-21\n9445.9\n8287.140391\n2456.496163\n9902.075\n\n\n9741\nD500\n2012-09-22\n9497.9\n8287.429011\n2456.274422\n9901.550\n\n\n9742\nD500\n2012-09-23\n9545.3\n8287.728789\n2456.058410\n9901.025\n\n\n\n\n9743 rows × 6 columns\n\n\n\n\n# Example 2 - Polars Backend for Expanding Window Functions using Built-Ins \n# (538X Faster than Pandas)\n# This example demonstrates the use of string-named functions and configurable \n# functions using the Polars backend for computations. 
Configurable functions, \n# like pl_quantile, allow the use of specific parameters associated with their \n# corresponding polars.Expr.rolling_<function_name> method.\n# For instance, pl_quantile corresponds to polars.Expr.rolling_quantile.\n \nimport pytimetk as tk\nimport pandas as pd\nimport polars as pl\nimport numpy as np\nfrom pytimetk.utils.polars_helpers import pl_quantile\nfrom pytimetk.utils.pandas_helpers import pd_quantile\n\ndf = tk.load_dataset(\"m4_daily\", parse_dates = ['date'])\n\nexpanded_df = (\n df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n 'mean', # Built-in mean function\n 'std', # Built-in std function\n ('quantile_75', pl_quantile(quantile=0.75)), # Configurable with all parameters found in polars.Expr.rolling_quantile\n ],\n min_periods = 1,\n engine = 'polars', # Utilize Polars for the underlying computations\n )\n)\ndisplay(expanded_df)\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_expanding_mean\nvalue_expanding_std\nvalue_expanding_quantile_75\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n2076.200000\nNaN\n2076.2\n\n\n1\nD10\n2014-07-04\n2073.4\n2074.800000\n1.979899\n2076.2\n\n\n2\nD10\n2014-07-05\n2048.7\n2066.100000\n15.133737\n2076.2\n\n\n3\nD10\n2014-07-06\n2048.9\n2061.800000\n15.054789\n2076.2\n\n\n4\nD10\n2014-07-07\n2006.4\n2050.720000\n27.996732\n2073.4\n\n\n...\n...\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n8286.606679\n2456.960489\n9906.4\n\n\n9739\nD500\n2012-09-20\n9365.7\n8286.864035\n2456.723940\n9902.6\n\n\n9740\nD500\n2012-09-21\n9445.9\n8287.140391\n2456.496163\n9902.6\n\n\n9741\nD500\n2012-09-22\n9497.9\n8287.429011\n2456.274422\n9902.6\n\n\n9742\nD500\n2012-09-23\n9545.3\n8287.728789\n2456.058410\n9902.6\n\n\n\n\n9743 rows × 6 columns\n\n\n\n\n# Example 3 - Lambda Functions for Expanding Window Functions are faster in Pandas than Polars\n# This example demonstrates the use of lambda functions of the form lambda x: x\n# Identity lambda functions, while convenient, have signficantly slower performance.\n# When using lambda functions the Pandas backend will likely be faster than Polars.\n\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\ndf = tk.load_dataset(\"m4_daily\", parse_dates = ['date'])\n\nexpanded_df = (\n df\n .groupby('id')\n .augment_expanding(\n date_column = 'date', \n value_column = 'value', \n window_func = [\n \n ('range', lambda x: x.max() - x.min()), # Identity lambda function: can be slower, especially in Polars\n ],\n min_periods = 1,\n engine = 'pandas', # Utilize pandas for the underlying computations\n )\n)\ndisplay(expanded_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_expanding_range\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n0.0\n\n\n1\nD10\n2014-07-04\n2073.4\n2.8\n\n\n2\nD10\n2014-07-05\n2048.7\n27.5\n\n\n3\nD10\n2014-07-06\n2048.9\n27.5\n\n\n4\nD10\n2014-07-07\n2006.4\n69.8\n\n\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n10782.0\n\n\n9739\nD500\n2012-09-20\n9365.7\n10782.0\n\n\n9740\nD500\n2012-09-21\n9445.9\n10782.0\n\n\n9741\nD500\n2012-09-22\n9497.9\n10782.0\n\n\n9742\nD500\n2012-09-23\n9545.3\n10782.0\n\n\n\n\n9743 rows × 4 columns" + "text": "Examples\n\nimport pandas as pd\nimport pytimetk as tk\n\ndf = tk.load_dataset('m4_daily', 
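The `augment_expanding` Performance section mentions parallel processing for the pandas engine, but the examples keep `threads = 1`. A hedged sketch of the parallel variant (whether it helps depends on the number of groups, as the note says):

```python
import pytimetk as tk

df = tk.load_dataset('m4_daily', parse_dates=['date'])

expanded_df = (
    df
    .groupby('id')
    .augment_expanding(
        date_column='date',
        value_column='value',
        window_func=['mean', 'std'],
        min_periods=1,
        engine='pandas',
        threads=-1,          # use all available cores (pandas engine only)
        show_progress=True,
    )
)
```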
parse_dates=['date'])\ndf\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n\n\n1\nD10\n2014-07-04\n2073.4\n\n\n2\nD10\n2014-07-05\n2048.7\n\n\n3\nD10\n2014-07-06\n2048.9\n\n\n4\nD10\n2014-07-07\n2006.4\n\n\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n\n\n9739\nD500\n2012-09-20\n9365.7\n\n\n9740\nD500\n2012-09-21\n9445.9\n\n\n9741\nD500\n2012-09-22\n9497.9\n\n\n9742\nD500\n2012-09-23\n9545.3\n\n\n\n\n9743 rows × 3 columns\n\n\n\n\n# Example 1 - Add 7 differenced values for a single DataFrame object, pandas engine\ndiffed_df_single = (\n df \n .query('id == \"D10\"')\n .augment_diffs(\n date_column='date',\n value_column='value',\n periods=(1, 7),\n engine='pandas'\n )\n)\ndiffed_df_single.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 674 rows of 10 columns\nid: object ['D10', 'D10', 'D10', 'D10', 'D10', 'D1 ...\ndate: datetime64[ns] [Timestamp('2014-07-03 00:00:00'), Time ...\nvalue: float64 [2076.2, 2073.4, 2048.7, 2048.9, 2006.4 ...\nvalue_diff_1: float64 [nan, -2.799999999999727, -24.700000000 ...\nvalue_diff_2: float64 [nan, nan, -27.5, -24.5, -42.2999999999 ...\nvalue_diff_3: float64 [nan, nan, nan, -27.299999999999727, -6 ...\nvalue_diff_4: float64 [nan, nan, nan, nan, -69.79999999999973 ...\nvalue_diff_5: float64 [nan, nan, nan, nan, nan, -58.599999999 ...\nvalue_diff_6: float64 [nan, nan, nan, nan, nan, nan, -57.0999 ...\nvalue_diff_7: float64 [nan, nan, nan, nan, nan, nan, nan, -68 ...\n\n\n\n# Example 2 - Add a single differenced value of 2 for each GroupBy object, polars engine\ndiffed_df = (\n df \n .groupby('id')\n .augment_diffs(\n date_column='date',\n value_column='value',\n periods=2,\n engine='polars'\n )\n)\ndiffed_df\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_diff_2\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\nNaN\n\n\n2\nD10\n2014-07-05\n2048.7\n-27.5\n\n\n3\nD10\n2014-07-06\n2048.9\n-24.5\n\n\n4\nD10\n2014-07-07\n2006.4\n-42.3\n\n\n...\n...\n...\n...\n...\n\n\n9738\nD500\n2012-09-19\n9418.8\n-18.9\n\n\n9739\nD500\n2012-09-20\n9365.7\n-66.2\n\n\n9740\nD500\n2012-09-21\n9445.9\n27.1\n\n\n9741\nD500\n2012-09-22\n9497.9\n132.2\n\n\n9742\nD500\n2012-09-23\n9545.3\n99.4\n\n\n\n\n9743 rows × 4 columns\n\n\n\n\n# Example 3 add 2 differenced values, 2 and 4, for a single DataFrame object, pandas engine\ndiffed_df_single_two = (\n df \n .query('id == \"D10\"')\n .augment_diffs(\n date_column='date',\n value_column='value',\n periods=[2, 4],\n engine='pandas'\n )\n)\ndiffed_df_single_two\n\n\n\n\n\n\n\n\nid\ndate\nvalue\nvalue_diff_2\nvalue_diff_4\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\nNaN\nNaN\n\n\n1\nD10\n2014-07-04\n2073.4\nNaN\nNaN\n\n\n2\nD10\n2014-07-05\n2048.7\n-27.5\nNaN\n\n\n3\nD10\n2014-07-06\n2048.9\n-24.5\nNaN\n\n\n4\nD10\n2014-07-07\n2006.4\n-42.3\n-69.8\n\n\n...\n...\n...\n...\n...\n...\n\n\n669\nD10\n2016-05-02\n2630.7\n57.8\n50.8\n\n\n670\nD10\n2016-05-03\n2649.3\n48.3\n105.3\n\n\n671\nD10\n2016-05-04\n2631.8\n1.1\n58.9\n\n\n672\nD10\n2016-05-05\n2622.5\n-26.8\n21.5\n\n\n673\nD10\n2016-05-06\n2620.1\n-11.7\n-10.6\n\n\n\n\n674 rows × 5 columns" }, { - "objectID": "reference/augment_cmo.html", - "href": "reference/augment_cmo.html", - "title": "augment_cmo", + "objectID": "reference/time_scale_template.html", + "href": "reference/time_scale_template.html", + "title": "time_scale_template", "section": "", - "text": "augment_cmo(data, date_column, close_column, periods=14, reduce_memory=False, engine='pandas')\nThe augment_cmo function calculates the Chande Momentum Oscillator (CMO) for a given financial 
instrument using either pandas or polars engine, and returns the augmented DataFrame." + "text": "time_scale_template(wide_format=False, engine='pandas')\nThe function time_scale_template returns a table with time scale information in either wide or long format." }, { - "objectID": "reference/augment_cmo.html#parameters", - "href": "reference/augment_cmo.html#parameters", - "title": "augment_cmo", + "objectID": "reference/time_scale_template.html#parameters", + "href": "reference/time_scale_template.html#parameters", + "title": "time_scale_template", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe data parameter is the input data that can be either a pandas DataFrame or a pandas DataFrameGroupBy object. It contains the data on which the Chande Momentum Oscillator (CMO) will be calculated.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data that contains the dates or timestamps.\nrequired\n\n\nclose_column\nstr\nThe close_column parameter is used to specify the column in the input data that contain the values on which the CMO will be calculated.\nrequired\n\n\nperiods\nUnion[int, Tuple[int, int], List[int]]\nThe periods parameter in the augment_cmo function specifies the number of rolling periods over which the Chande Momentum Oscillator (CMO) is calculated. It can be provided as an integer, a tuple of two integers (start and end periods), or a list of integers.\n14\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is a boolean flag that indicates whether or not to reduce the memory usage of the data before performing the CMO calculation. If set to True, the function will attempt to reduce the memory usage of the input data. If set to False, the function will not attempt to reduce the memory usage of the input data.\nFalse\n\n\nengine\nstr\nThe engine parameter specifies the computation engine to use for calculating the Chande Momentum Oscillator (CMO). It can take two values: ‘pandas’ or ‘polars’.\n'pandas'" - }, - { - "objectID": "reference/augment_cmo.html#returns", - "href": "reference/augment_cmo.html#returns", - "title": "augment_cmo", - "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe function augment_cmo returns a pandas DataFrame that contains the augmented data with the Chande Momentum Oscillator (CMO) values added." - }, - { - "objectID": "reference/augment_cmo.html#notes", - "href": "reference/augment_cmo.html#notes", - "title": "augment_cmo", - "section": "Notes", - "text": "Notes\nThe Chande Momentum Oscillator (CMO), developed by Tushar Chande, is a technical analysis tool used to gauge the momentum of a financial instrument. It is similar to other momentum indicators like the Relative Strength Index (RSI), but with some distinct characteristics. Here’s what the CMO tells us:\nMomentum of Price Movements:\nThe CMO measures the strength of trends in price movements. It calculates the difference between the sum of gains and losses over a specified period, normalized to oscillate between -100 and +100. Overbought and Oversold Conditions:\nValues close to +100 suggest overbought conditions, indicating that the price might be too high and could reverse. Conversely, values near -100 suggest oversold conditions, implying that the price might be too low and could rebound. 
Trend Strength:\nHigh absolute values (either positive or negative) indicate strong trends, while values near zero suggest a lack of trend or a weak trend. Divergences:\nDivergences between the CMO and price movements can be significant. For example, if the price is making new highs but the CMO is declining, it may indicate weakening momentum and a potential trend reversal. Crossing the Zero Line:\nWhen the CMO crosses above zero, it can be seen as a bullish signal, whereas a cross below zero can be interpreted as bearish. Customization:\nThe period over which the CMO is calculated can be adjusted. A shorter period makes the oscillator more sensitive to price changes, suitable for short-term trading. A longer period smooths out the oscillator for a longer-term perspective. It’s important to note that while the CMO can provide valuable insights into market momentum and potential price reversals, it is most effective when used in conjunction with other indicators and analysis methods. Like all technical indicators, the CMO should not be used in isolation but rather as part of a comprehensive trading strategy.\nReferences: 1. https://www.fmlabs.com/reference/default.htm?url=CMO.htm" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nwide_format\nbool\nThe wide_format parameter determines the format of the output table. If wide_format is set to True, the table will be transposed.\nFalse\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for generating a date summary. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for generating the time scale information.\n'pandas'" }, { - "objectID": "reference/augment_cmo.html#examples", - "href": "reference/augment_cmo.html#examples", - "title": "augment_cmo", + "objectID": "reference/time_scale_template.html#examples", + "href": "reference/time_scale_template.html#examples", + "title": "time_scale_template", "section": "Examples", - "text": "Examples\n\nimport pandas as pd\nimport pytimetk as tk\n\ndf = tk.load_dataset('stocks_daily', parse_dates=['date'])\ndf\n\n# Example 1 - Calculate CMO for a single column\ncmo_df = (\n df\n .query(\"symbol == 'AAPL'\")\n .augment_cmo(\n date_column='date',\n close_column='adjusted',\n periods=[14, 28]\n 
)\n)\ncmo_df\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nadjusted_cmo_14\nadjusted_cmo_28\n\n\n\n\n5398\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000\n16.791180\nNaN\nNaN\n\n\n5399\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200\n16.579241\nNaN\nNaN\n\n\n5400\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600\n16.117437\nNaN\nNaN\n\n\n5401\nAAPL\n2013-01-07\n18.642857\n18.903570\n18.400000\n18.710714\n484156400\n16.022623\nNaN\nNaN\n\n\n5402\nAAPL\n2013-01-08\n18.900356\n18.996071\n18.616072\n18.761070\n458707200\n16.065746\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n8092\nAAPL\n2023-09-15\n176.479996\n176.500000\n173.820007\n175.009995\n109205100\n175.009995\n-11.097429\n-6.370009\n\n\n8093\nAAPL\n2023-09-18\n176.479996\n179.380005\n176.169998\n177.970001\n67257600\n177.970001\n-6.564165\n-2.713367\n\n\n8094\nAAPL\n2023-09-19\n177.520004\n179.630005\n177.130005\n179.070007\n51826900\n179.070007\n-16.295529\n1.931561\n\n\n8095\nAAPL\n2023-09-20\n179.259995\n179.699997\n175.399994\n175.490005\n58436200\n175.490005\n-39.175190\n-3.650570\n\n\n8096\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900\n173.929993\n-43.051227\n-6.140628\n\n\n\n\n2699 rows × 10 columns\n\n\n\n\n# Example 2 - Calculate CMO for multiple groups\ncmo_df = (\n df\n .groupby('symbol')\n .augment_cmo(\n date_column='date',\n close_column='adjusted',\n periods=[14, 28]\n )\n)\ncmo_df.groupby('symbol').tail(1)\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nadjusted_cmo_14\nadjusted_cmo_28\n\n\n\n\n2698\nMETA\n2023-09-21\n295.700012\n300.260010\n293.269989\n295.730011\n21300500\n295.730011\n-0.277495\n-4.703549\n\n\n5397\nAMZN\n2023-09-21\n131.940002\n132.240005\n129.309998\n129.330002\n70234800\n129.330002\n-27.450935\n-16.697312\n\n\n8096\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900\n173.929993\n-43.051227\n-6.140628\n\n\n10795\nNFLX\n2023-09-21\n386.500000\n395.899994\n383.420013\n384.149994\n5547900\n384.149994\n-56.124625\n-19.430200\n\n\n13494\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000\n410.170013\n-83.624257\n0.671283\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n-33.482367\n2.917383\n\n\n\n\n\n\n\n\n# Example 3 - Calculate CMO for polars engine\ncmo_df = (\n df\n .query(\"symbol == 'AAPL'\")\n .augment_cmo(\n date_column='date',\n close_column='adjusted',\n periods=[14, 28],\n engine='polars'\n 
)\n)\ncmo_df\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nadjusted_cmo_14\nadjusted_cmo_28\n\n\n\n\n0\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000\n16.791180\nNaN\nNaN\n\n\n1\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200\n16.579241\nNaN\nNaN\n\n\n2\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600\n16.117437\nNaN\nNaN\n\n\n3\nAAPL\n2013-01-07\n18.642857\n18.903570\n18.400000\n18.710714\n484156400\n16.022623\nNaN\nNaN\n\n\n4\nAAPL\n2013-01-08\n18.900356\n18.996071\n18.616072\n18.761070\n458707200\n16.065746\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2694\nAAPL\n2023-09-15\n176.479996\n176.500000\n173.820007\n175.009995\n109205100\n175.009995\n-11.097429\n-6.370009\n\n\n2695\nAAPL\n2023-09-18\n176.479996\n179.380005\n176.169998\n177.970001\n67257600\n177.970001\n-6.564165\n-2.713367\n\n\n2696\nAAPL\n2023-09-19\n177.520004\n179.630005\n177.130005\n179.070007\n51826900\n179.070007\n-16.295529\n1.931561\n\n\n2697\nAAPL\n2023-09-20\n179.259995\n179.699997\n175.399994\n175.490005\n58436200\n175.490005\n-39.175190\n-3.650570\n\n\n2698\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900\n173.929993\n-43.051227\n-6.140628\n\n\n\n\n2699 rows × 10 columns\n\n\n\n\n# Example 4 - Calculate CMO for polars engine and groups\ncmo_df = (\n df\n .groupby('symbol')\n .augment_cmo(\n date_column='date',\n close_column='adjusted',\n periods=[14, 28],\n engine='polars'\n )\n)\ncmo_df.groupby('symbol').tail(1)\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\nadjusted_cmo_14\nadjusted_cmo_28\n\n\n\n\n2698\nMETA\n2023-09-21\n295.700012\n300.260010\n293.269989\n295.730011\n21300500\n295.730011\n-0.277495\n-4.703549\n\n\n5397\nAMZN\n2023-09-21\n131.940002\n132.240005\n129.309998\n129.330002\n70234800\n129.330002\n-27.450935\n-16.697312\n\n\n8096\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900\n173.929993\n-43.051227\n-6.140628\n\n\n10795\nNFLX\n2023-09-21\n386.500000\n395.899994\n383.420013\n384.149994\n5547900\n384.149994\n-56.124625\n-19.430200\n\n\n13494\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000\n410.170013\n-83.624257\n0.671283\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n-33.482367\n2.917383" + "text": "Examples\n\nimport pytimetk as tk\n\ntk.time_scale_template()\n\n\n\n\n\n\n\n\nmedian_unit\nseasonal_period\ntrend_period\n\n\n\n\n0\nS\n1H\n12H\n\n\n1\nT\n1D\n14D\n\n\n2\nH\n1D\n1M\n\n\n3\nD\n1W\n3M\n\n\n4\nW\n1Q\n1Y\n\n\n5\nM\n1Y\n5Y\n\n\n6\nQ\n1Y\n10Y\n\n\n7\nY\n5Y\n30Y" }, - { - "objectID": "reference/augment_wavelet.html", - "href": "reference/augment_wavelet.html", - "title": "augment_wavelet", + { + "objectID": "reference/pad_by_time.html", + "href": "reference/pad_by_time.html", + "title": "pad_by_time", "section": "", - "text": "augment_wavelet(data, date_column, value_column, method, sample_rate, scales, reduce_memory=False)\nApply the Wavely transform to specified columns of a DataFrame or DataFrameGroupBy object.\nA wavelet transform is a mathematical tool used to decompose a signal or function into different frequency components and then study each component with a resolution matched to its scale. 
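The CMO notes above describe overbought/oversold readings and zero-line crosses but leave the thresholds to the user. A hedged post-processing sketch on the `adjusted_cmo_14` column produced in the examples (the +/-50 cutoffs are an assumption for illustration, not a pytimetk default):

```python
import numpy as np
import pytimetk as tk

df = tk.load_dataset('stocks_daily', parse_dates=['date'])

cmo_df = (
    df
    .query("symbol == 'AAPL'")
    .augment_cmo(date_column='date', close_column='adjusted', periods=14)
)

# Label momentum regimes from the oscillator; thresholds are illustrative
cmo_df['cmo_state'] = np.select(
    [cmo_df['adjusted_cmo_14'] > 50, cmo_df['adjusted_cmo_14'] < -50],
    ['overbought', 'oversold'],
    default='neutral',
)
```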
The wavelet transform uses wavelets, which are functions that are localized in both time and frequency.\nUses:" + "text": "pad_by_time(data, date_column, freq='D', start_date=None, end_date=None)\nMake irregular time series regular by padding with missing dates.\nThe pad_by_time function inserts missing dates into a Pandas DataFrame or DataFrameGroupBy object, through the process making an irregularly spaced time series regularly spaced." }, { - "objectID": "reference/augment_wavelet.html#parameters", - "href": "reference/augment_wavelet.html#parameters", - "title": "augment_wavelet", + "objectID": "reference/pad_by_time.html#parameters", + "href": "reference/pad_by_time.html#parameters", + "title": "pad_by_time", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nInput DataFrame or DataFrameGroupBy object with one or more columns of real-valued signals.\nrequired\n\n\nvalue_column\nstr or list\nList of column names in ‘data’ to which the Hilbert transform will be applied.\nrequired\n\n\nsample_rate\nstr\nSampling rate of the input data. For time-series data, the sample rate (sample_rate) typically refers to the frequency at which data points are collected. For example, if your data has a 30-minute interval, if you think of the data in terms of “samples per hour”, the sample rate would be: sample_rate = samples / hour = 1 / 0.5 = 2\nrequired\n\n\nscales\nstr or list\nArray of scales to use in the transform. The choice of scales in wavelet analysis determines which frequencies (or periodicities) in the data you want to analyze. In other words, the scales determine the “window size” or the “look-back period” the wavelet uses to analyze the data. Smaller scales: Correspond to analyzing high-frequency changes (short-term fluctuations) in the data. Larger scales: Correspond to analyzing low-frequency changes (long-term fluctuations) in the data. The specific values for scales depend on what frequencies or periodicities you expect in your data and wish to study. For instance, if you believe there are daily, weekly, and monthly patterns in your data, you’d choose scales that correspond to these periodicities given your sampling rate. For a daily pattern with data at 30-minute intervals: scales = 2 * 24 = 48 because there are 48 half hour intervals in a day For a weekly pattern with data at 30-minute intervals: scales = 48 * 7 = 336 because there are 336 half hour intervals in a week Recommendation, use a range of values to cover both short term and long term patterns, then adjust accordingly.\nrequired\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is False.\nFalse" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nThe data parameter can be either a Pandas DataFrame or a Pandas DataFrameGroupBy object. It represents the data that you want to pad with missing dates.\nrequired\n\n\ndate_column\nstr\nThe date_column parameter is a string that specifies the name of the column in the DataFrame that contains the dates. 
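The `scales` discussion above works through 30-minute data (48 intervals per day, 336 per week). A short sketch that derives the scales from the sampling interval instead of hard-coding them, using the same taylor_30_min dataset as the wavelet examples:

```python
import pytimetk as tk

# 30-minute data: 2 samples per hour, per the sample_rate description above
sample_rate = 2
daily_scale = 24 * sample_rate      # 48 observations per day
weekly_scale = daily_scale * 7      # 336 observations per week

df = tk.load_dataset('taylor_30_min', parse_dates=['date'])

wavelet_df = tk.augment_wavelet(
    df,
    date_column='date',
    value_column='value',
    method='morlet',
    sample_rate=sample_rate,
    scales=[daily_scale, weekly_scale],
)
```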
This column will be used to determine the minimum and maximum dates in theDataFrame, and to generate the regular date range for padding.\nrequired\n\n\nfreq\nstr\nThe freq parameter specifies the frequency at which the missing timestamps should be generated. It accepts a string representing a pandas frequency alias. Some common frequency aliases include: - S: secondly frequency - min: minute frequency - H: hourly frequency - B: business day frequency - D: daily frequency - W: weekly frequency - M: month end frequency - MS: month start frequency - BMS: Business month start - Q: quarter end frequency - QS: quarter start frequency - Y: year end frequency - YS: year start frequency\n'D'\n\n\nstart_date\nstr\nSpecifies the start of the padded series. If NULL, it will use the lowest value of the input variable. In the case of groups, it will use the lowest value by group.\nNone\n\n\nend_date\nstr\nSpecifies the end of the padded series. If NULL, it will use the highest value of the input variable. In the case of groups, it will use the highest value by group.\nNone" }, { - "objectID": "reference/augment_wavelet.html#returns", - "href": "reference/augment_wavelet.html#returns", - "title": "augment_wavelet", + "objectID": "reference/pad_by_time.html#returns", + "href": "reference/pad_by_time.html#returns", + "title": "pad_by_time", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nDataFrame with added columns for CWT coefficients for each scale, with a real and imaginary column added." + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe function pad_by_time returns a Pandas DataFrame that has been extended with future dates." }, { - "objectID": "reference/augment_wavelet.html#notes", - "href": "reference/augment_wavelet.html#notes", - "title": "augment_wavelet", + "objectID": "reference/pad_by_time.html#notes", + "href": "reference/pad_by_time.html#notes", + "title": "pad_by_time", "section": "Notes", - "text": "Notes\nFor a detailed introduction to wavelet transforms, you can visit this website. https://ataspinar.com/2018/12/21/a-guide-for-using-the-wavelet-transform-in-machine-learning/\nThe Bump wavelet is a real-valued wavelet function, so its imaginary part is inherently zero.\nIn the continuous wavelet transform (CWT), the Morlet and Analytic Morlet wavelets are complex-valued, so their convolutions with the signal yield complex results (with both real and imaginary parts).\nWavelets, in general, are mathematical functions that can decompose a signal into its constituent parts at different scales. Different wavelet functions are suitable for different types of signals and analytical goals. Let’s look at the three wavelet methods:\n\nMorlet Wavelet:\nCharacteristics: Essentially a complex sinusoid modulated by a Gaussian window. It provides a good balance between time localization and frequency localization.\nWhen to use: When you want a good compromise between time and frequency localization. Particularly useful when you’re interested in sinusoidal components or oscillatory patterns of your data. Commonly used in time-frequency analysis because of its simplicity and effectiveness.\nBump Wavelet:\nCharacteristics: Has an oscillating behavior similar to the Morlet but has sharper time localization. Its frequency localization isn’t as sharp as its time localization.\nWhen to use: When you are more interested in precisely identifying when certain events or anomalies occur in your data. 
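The `pad_by_time` Performance note describes the vectorized approach: build the Cartesian product of every group with the full regular date range, then merge to reintroduce the missing rows as NaN. A rough plain-pandas sketch of that idea (not pytimetk's actual internals, just the technique the note describes):

```python
import pandas as pd

def pad_groups_naive(df, group_col, date_col, freq='D'):
    # Cartesian product of all group values with the full regular date range
    full_dates = pd.date_range(df[date_col].min(), df[date_col].max(), freq=freq)
    scaffold = pd.MultiIndex.from_product(
        [df[group_col].unique(), full_dates],
        names=[group_col, date_col],
    ).to_frame(index=False)
    # Left merge reintroduces the missing dates as NaN rows
    return scaffold.merge(df, on=[group_col, date_col], how='left')
```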
It can be especially useful for detecting sharp spikes or short-lived events in your signal.\nAnalytic Morlet Wavelet:\nCharacteristics: A variation of the Morlet wavelet that is designed to have no negative frequencies when transformed. This means it’s “analytic.” Offers slightly better frequency localization than the standard Morlet wavelet.\nWhen to use: When you’re interested in phase properties of your signal. Can be used when you need to avoid negative frequencies in your analysis, making it useful for certain types of signals, like analytic signals. Offers a cleaner spectrum in the frequency domain than the standard Morlet." + "text": "Notes" }, { - "objectID": "reference/augment_wavelet.html#examples", - "href": "reference/augment_wavelet.html#examples", - "title": "augment_wavelet", + "objectID": "reference/pad_by_time.html#performance", + "href": "reference/pad_by_time.html#performance", + "title": "pad_by_time", + "section": "Performance", + "text": "Performance\nThis function uses a number of techniques to speed up computation for large datasets with many time series groups.\n\nWe use a vectorized approach to generate the Cartesian product of all unique group values and all dates in the date range.\nWe then merge this Cartesian product with the original data to introduce NaN values for missing rows. This approach is much faster than looping through each group and applying a function to each group.\n\nNote: There is no parallel processing since the vectorized approach is almost always faster." + }, + { + "objectID": "reference/pad_by_time.html#examples", + "href": "reference/pad_by_time.html#examples", + "title": "pad_by_time", "section": "Examples", - "text": "Examples\n\n# Example 1: Using Pandas Engine on a pandas groupby object\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.datasets.load_dataset('walmart_sales_weekly', parse_dates = ['Date'])\n\nwavelet_df = (\n df\n .groupby('id')\n .augment_wavelet(\n date_column = 'Date',\n value_column ='Weekly_Sales', \n scales = [15],\n sample_rate =1,\n method = 'bump'\n )\n )\nwavelet_df.head()\n\n\n\n\n\n\n\n\nid\nStore\nDept\nDate\nWeekly_Sales\nIsHoliday\nType\nSize\nTemperature\nFuel_Price\nMarkDown1\nMarkDown2\nMarkDown3\nMarkDown4\nMarkDown5\nCPI\nUnemployment\nbump_scale_15_real\nbump_scale_15_imag\n\n\n\n\n0\n1_1\n1\n1\n2010-02-05\n24924.50\nFalse\nA\n151315\n42.31\n2.572\nNaN\nNaN\nNaN\nNaN\nNaN\n211.096358\n8.106\n28340.714927\n0.0\n\n\n1\n1_1\n1\n1\n2010-02-12\n46039.49\nTrue\nA\n151315\n38.51\n2.548\nNaN\nNaN\nNaN\nNaN\nNaN\n211.242170\n8.106\n32377.869306\n0.0\n\n\n2\n1_1\n1\n1\n2010-02-19\n41595.55\nFalse\nA\n151315\n39.93\n2.514\nNaN\nNaN\nNaN\nNaN\nNaN\n211.289143\n8.106\n36178.125507\n0.0\n\n\n3\n1_1\n1\n1\n2010-02-26\n19403.54\nFalse\nA\n151315\n46.63\n2.561\nNaN\nNaN\nNaN\nNaN\nNaN\n211.319643\n8.106\n39635.989442\n0.0\n\n\n4\n1_1\n1\n1\n2010-03-05\n21827.90\nFalse\nA\n151315\n46.50\n2.625\nNaN\nNaN\nNaN\nNaN\nNaN\n211.350143\n8.106\n42668.587553\n0.0\n\n\n\n\n\n\n\n\n# Example 2: Using Pandas Engine on a pandas dataframe\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('taylor_30_min', parse_dates = ['date'])\n\nresult_df = (\n tk.augment_wavelet(\n df, \n date_column = 'date',\n value_column ='value', \n scales = [15],\n sample_rate =1000,\n method = 'morlet'\n )\n)\n\nresult_df\n\n\n\n\n\n\n\n\ndate\nvalue\nmorlet_scale_15_real\nmorlet_scale_15_imag\n\n\n\n\n0\n2000-06-05 00:00:00+00:00\n22262\n5.858392e+07\n1.247285e+07\n\n\n1\n2000-06-05 
00:30:00+00:00\n21756\n5.860706e+07\n1.246976e+07\n\n\n2\n2000-06-05 01:00:00+00:00\n22247\n5.862956e+07\n1.246639e+07\n\n\n3\n2000-06-05 01:30:00+00:00\n22759\n5.865217e+07\n1.246305e+07\n\n\n4\n2000-06-05 02:00:00+00:00\n22549\n5.867501e+07\n1.245981e+07\n\n\n...\n...\n...\n...\n...\n\n\n4027\n2000-08-27 21:30:00+00:00\n27946\n5.712707e+07\n-1.215821e+07\n\n\n4028\n2000-08-27 22:00:00+00:00\n27133\n5.709846e+07\n-1.215851e+07\n\n\n4029\n2000-08-27 22:30:00+00:00\n25996\n5.706991e+07\n-1.215882e+07\n\n\n4030\n2000-08-27 23:00:00+00:00\n24610\n5.704229e+07\n-1.215955e+07\n\n\n4031\n2000-08-27 23:30:00+00:00\n23132\n5.701639e+07\n-1.216105e+07\n\n\n\n\n4032 rows × 4 columns" + "text": "Examples\n\nimport pandas as pd\nimport pytimetk as tk\n\ndf = tk.load_dataset('stocks_daily', parse_dates = ['date'])\ndf\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nMETA\n2013-01-02\n27.440001\n28.180000\n27.420000\n28.000000\n69846400\n28.000000\n\n\n1\nMETA\n2013-01-03\n27.879999\n28.469999\n27.590000\n27.770000\n63140600\n27.770000\n\n\n2\nMETA\n2013-01-04\n28.010000\n28.930000\n27.830000\n28.760000\n72715400\n28.760000\n\n\n3\nMETA\n2013-01-07\n28.690001\n29.790001\n28.650000\n29.420000\n83781800\n29.420000\n\n\n4\nMETA\n2013-01-08\n29.510000\n29.600000\n28.860001\n29.059999\n45871300\n29.059999\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16189\nGOOG\n2023-09-15\n138.800003\n139.360001\n137.179993\n138.300003\n48947600\n138.300003\n\n\n16190\nGOOG\n2023-09-18\n137.630005\n139.929993\n137.630005\n138.960007\n16233600\n138.960007\n\n\n16191\nGOOG\n2023-09-19\n138.250000\n139.175003\n137.500000\n138.830002\n15479100\n138.830002\n\n\n16192\nGOOG\n2023-09-20\n138.830002\n138.839996\n134.520004\n134.589996\n21473500\n134.589996\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n\n\n\n\n16194 rows × 8 columns\n\n\n\n\n# Pad Single Time Series: Fill missing dates\npadded_df = (\n df\n .query('symbol == \"AAPL\"')\n .pad_by_time(\n date_column = 'date',\n freq = 'D'\n )\n)\npadded_df \n\n\n\n\n\n\n\n\ndate\nsymbol\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\n2013-01-02\nAAPL\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\n\n\n1\n2013-01-03\nAAPL\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\n\n\n2\n2013-01-04\nAAPL\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\n\n\n3\n2013-01-05\nAAPL\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n4\n2013-01-06\nAAPL\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n3910\n2023-09-17\nAAPL\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n3911\n2023-09-18\nAAPL\n176.479996\n179.380005\n176.169998\n177.970001\n67257600.0\n177.970001\n\n\n3912\n2023-09-19\nAAPL\n177.520004\n179.630005\n177.130005\n179.070007\n51826900.0\n179.070007\n\n\n3913\n2023-09-20\nAAPL\n179.259995\n179.699997\n175.399994\n175.490005\n58436200.0\n175.490005\n\n\n3914\n2023-09-21\nAAPL\n174.550003\n176.300003\n173.860001\n173.929993\n63047900.0\n173.929993\n\n\n\n\n3915 rows × 8 columns\n\n\n\n\n# Pad by Group: Pad each group with missing dates\npadded_df = (\n df\n .groupby('symbol')\n .pad_by_time(\n date_column = 'date',\n freq = 'D'\n 
)\n)\npadded_df\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\n\n\n1\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\n\n\n2\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\n\n\n3\nAAPL\n2013-01-05\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n4\nAAPL\n2013-01-06\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n23485\nNVDA\n2023-09-17\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n23486\nNVDA\n2023-09-18\n427.480011\n442.420013\n420.000000\n439.660004\n50027100.0\n439.660004\n\n\n23487\nNVDA\n2023-09-19\n438.329987\n439.660004\n430.019989\n435.200012\n37306400.0\n435.200012\n\n\n23488\nNVDA\n2023-09-20\n436.000000\n439.029999\n422.230011\n422.390015\n36710800.0\n422.390015\n\n\n23489\nNVDA\n2023-09-21\n415.829987\n421.000000\n409.799988\n410.170013\n44893000.0\n410.170013\n\n\n\n\n23490 rows × 8 columns\n\n\n\n\n# Pad with end dates specified\npadded_df = (\n df\n .groupby('symbol')\n .pad_by_time(\n date_column = 'date',\n freq = 'D',\n start_date = '2013-01-01',\n end_date = '2023-09-22'\n )\n)\npadded_df.query('symbol == \"AAPL\"')\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nAAPL\n2013-01-01\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n1\nAAPL\n2013-01-02\n19.779285\n19.821428\n19.343929\n19.608213\n560518000.0\n16.791180\n\n\n2\nAAPL\n2013-01-03\n19.567142\n19.631071\n19.321428\n19.360714\n352965200.0\n16.579241\n\n\n3\nAAPL\n2013-01-04\n19.177500\n19.236786\n18.779642\n18.821428\n594333600.0\n16.117437\n\n\n4\nAAPL\n2013-01-05\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n3912\nAAPL\n2023-09-18\n176.479996\n179.380005\n176.169998\n177.970001\n67257600.0\n177.970001\n\n\n3913\nAAPL\n2023-09-19\n177.520004\n179.630005\n177.130005\n179.070007\n51826900.0\n179.070007\n\n\n3914\nAAPL\n2023-09-20\n179.259995\n179.699997\n175.399994\n175.490005\n58436200.0\n175.490005\n\n\n3915\nAAPL\n2023-09-21\n174.550003\n176.300003\n173.860001\n173.929993\n63047900.0\n173.929993\n\n\n3916\nAAPL\n2023-09-22\nNaN\nNaN\nNaN\nNaN\nNaN\nNaN\n\n\n\n\n3917 rows × 8 columns" }, { - "objectID": "reference/get_available_datasets.html", - "href": "reference/get_available_datasets.html", - "title": "get_available_datasets", + "objectID": "reference/plot_anomalies_cleaned.html", + "href": "reference/plot_anomalies_cleaned.html", + "title": "plot_anomalies_cleaned", "section": "", - "text": "get_available_datasets()\nGet a list of 12 datasets that can be loaded with pytimetk.load_dataset.\nThe get_available_datasets function returns a sorted list of available dataset names from the pytimetk.datasets module. The available datasets are:" + "text": "plot_anomalies_cleaned(data, date_column, facet_ncol=1, line_color='#2c3e50', line_color_cleaned='#e31a1c', line_size=None, line_type='solid', line_alpha=1.0, y_intercept=None, y_intercept_color='#2c3e50', x_intercept=None, x_intercept_color='#2c3e50', title='Anomalies Cleaned Plot', x_lab='', y_lab='', x_axis_date_labels='%b %Y', base_size=11, width=None, height=None, engine='plotly')\nThe plot_anomalies_cleaned function takes in data from the anomalize() function, and returns a plot of the anomalies cleaned." 
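The rows inserted by `pad_by_time` come back as NaN, as the examples above show. A common follow-up (a convention, not something `pad_by_time` does for you) is to fill them, for example zero volume and carried-forward prices on non-trading days:

```python
import pytimetk as tk

df = tk.load_dataset('stocks_daily', parse_dates=['date'])

padded_df = (
    df
    .groupby('symbol')
    .pad_by_time(date_column='date', freq='D')
)

# Zero-fill volume and forward-fill prices within each symbol
padded_df['volume'] = padded_df['volume'].fillna(0)
price_cols = ['open', 'high', 'low', 'close', 'adjusted']
padded_df[price_cols] = padded_df.groupby('symbol')[price_cols].ffill()
```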
}, { - "objectID": "reference/get_available_datasets.html#returns", - "href": "reference/get_available_datasets.html#returns", - "title": "get_available_datasets", + "objectID": "reference/plot_anomalies_cleaned.html#parameters", + "href": "reference/plot_anomalies_cleaned.html#parameters", + "title": "plot_anomalies_cleaned", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe input data for the plot from anomalize. It can be either a pandas DataFrame or a pandas DataFrameGroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data that contains the dates.\nrequired\n\n\nfacet_ncol\nint\nThe number of columns in the facet grid. It is an optional parameter,\n1\n\n\nline_color\nstr\nThe color of the line in the plot. It is specified as a hexadecimal color code. The default value is “#2c3e50”.\n'#2c3e50'\n\n\nline_color_cleaned\nstr\nThe color of the line in the plot. It is specified as a hexadecimal or a matplotlib color name. The default value is “#e31a1c”.\n'#e31a1c'\n\n\nline_size\nOptional[float]\nThe line_size parameter determines the thickness of the lines in the plot. It is an optional parameter, so if you don’t specify a value, the default line size will be used.\nNone\n\n\nline_type\nstr\nThe line_type parameter specifies the type of line to be used in the plot. It can take the following values: - “solid” (default): a solid line - “dashed”: a dashed line\n'solid'\n\n\nline_alpha\nfloat\nThe line_alpha parameter controls the transparency of the lines in the plot. It accepts a float value between 0 and 1, where 0 means completely transparent and 1 means completely opaque.\n1.0\n\n\ny_intercept\nOptional[float]\nThe y_intercept parameter is an optional float value that specifies the y-coordinate of a horizontal line to be plotted on the graph. This line can be used to indicate a specific threshold or reference value. If not specified, no horizontal line will be plotted.\nNone\n\n\ny_intercept_color\nstr\nThe y_intercept_color parameter is used to specify the color of the y-intercept line on the plot. By default, it is set to \"#2c3e50\", which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\nx_intercept\nOptional[str]\nThe x_intercept parameter is used to specify the value on the x-axis where you want to draw a vertical line. This can be useful for highlighting a specific point or event in the data.\nNone\n\n\nx_intercept_color\nstr\nThe x_intercept_color parameter is used to specify the color of the vertical line representing the x-intercept on the plot. By default, it is set to “#2c3e50”, which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\ntitle\nstr\nThe title of the plot. It is set to “Anomalies Cleaned Plot” by default.\n'Anomalies Cleaned Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis of the plot. It is a string that represents the label text.\n''\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis of the plot. It is a string that represents the label text.\n''\n\n\nx_axis_date_labels\nstr\nThe x_axis_date_labels parameter is used to specify the format of the date labels on the x-axis of the plot. It accepts a string representing the format of the date labels. 
For example, “%b %Y” would display the month abbreviation and year (e.g., Jan 2019).\n'%b %Y'\n\n\nbase_size\nfloat\nThe base_size parameter determines the base font size for the plot. It is used to control the size of the text elements in the plot, such as axis labels, titles, and tick labels. The default value is 11, but you can adjust it to make the text larger or smaller\n11\n\n\nwidth\nOptional[int]\nThe width parameter determines the width of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with the default width.\nNone\n\n\nheight\nOptional[int]\nThe height parameter determines the height of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with a default height.\nNone\n\n\nengine\nstr\nThe engine parameter specifies the plotting engine to use. It can be set to either “plotly”, “plotnine”, or “matplotlib”.\n'plotly'" + }, + { + "objectID": "reference/plot_anomalies_cleaned.html#returns", + "href": "reference/plot_anomalies_cleaned.html#returns", + "title": "plot_anomalies_cleaned", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist\nThe function get_available_datasets returns a sorted list of available dataset names from the pytimetk.datasets module." + "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\nA plotly, plotnine, or matplotlib plot." }, { - "objectID": "reference/get_available_datasets.html#examples", - "href": "reference/get_available_datasets.html#examples", - "title": "get_available_datasets", + "objectID": "reference/plot_anomalies_cleaned.html#see-also", + "href": "reference/plot_anomalies_cleaned.html#see-also", + "title": "plot_anomalies_cleaned", + "section": "See Also", + "text": "See Also\n\nanomalize : Function that calculates the anomalies and formats the data for visualization.\nplot_anomalies : Function that plots the anomalies." 
+ }, + { + "objectID": "reference/plot_anomalies_cleaned.html#examples", + "href": "reference/plot_anomalies_cleaned.html#examples", + "title": "plot_anomalies_cleaned", "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\n\ntk.get_available_datasets()\n\n['bike_sales_sample',\n 'bike_sharing_daily',\n 'expedia',\n 'm4_daily',\n 'm4_hourly',\n 'm4_monthly',\n 'm4_quarterly',\n 'm4_weekly',\n 'm4_yearly',\n 'stocks_daily',\n 'taylor_30_min',\n 'walmart_sales_weekly',\n 'wikipedia_traffic_daily']" + "text": "Examples\n\n# EXAMPLE 1: SINGLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Create a date range\ndate_rng = pd.date_range(start='2021-01-01', end='2024-01-01', freq='MS')\n\n# Generate some random data with a few outliers\nnp.random.seed(42)\ndata = np.random.randn(len(date_rng)) * 10 + 25 \ndata[3] = 100 # outlier\n\n# Create a DataFrame\ndf = pd.DataFrame(date_rng, columns=['date'])\ndf['value'] = data\n\n# Anomalize the data\nanomalize_df = tk.anomalize(\n df, \"date\", \"value\",\n method = \"twitter\", \n iqr_alpha = 0.10, \n clean_alpha = 0.75,\n clean = \"min_max\",\n verbose = True,\n)\n\n# Visualize the results\nanomalize_df.plot_anomalies_cleaned(\"date\")\n\nUsing seasonal frequency of 12 observations\nUsing trend frequency of 37 observations\n\n\n\n \n\n\n\n# EXAMPLE 2: MULTIPLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset(\"walmart_sales_weekly\", parse_dates=[\"Date\"])[[\"id\", \"Date\", \"Weekly_Sales\"]]\n\nanomalize_df = (\n df\n .groupby('id') \n .anomalize(\n \"Date\", \"Weekly_Sales\", \n period = 52, \n trend = 52, \n threads = 1\n ) \n)\n\n# Visualize the decomposition results\n\n(\n anomalize_df\n .groupby(\"id\")\n .plot_anomalies_cleaned(\n date_column = \"Date\",\n line_color = \"steelblue\",\n width = 600,\n height = 1000,\n x_axis_date_labels = \"%y\",\n engine = 'plotly', \n )\n)" }, { - "objectID": "reference/is_holiday.html", - "href": "reference/is_holiday.html", - "title": "is_holiday", + "objectID": "reference/plot_anomaly_decomp.html", + "href": "reference/plot_anomaly_decomp.html", + "title": "plot_anomalies_decomp", "section": "", - "text": "is_holiday(idx, country_name='UnitedStates', country=None, engine='pandas')\nCheck if a given list of dates are holidays for a specified country.\nNote: This function requires the holidays package to be installed." + "text": "plot_anomalies_decomp(data, date_column, line_color='#2c3e50', line_size=None, line_type='solid', line_alpha=1.0, y_intercept=None, y_intercept_color='#2c3e50', x_intercept=None, x_intercept_color='#2c3e50', title='Anomaly Decomposition Plot', x_lab='', y_lab='', x_axis_date_labels='%b %Y', base_size=11, width=None, height=None, engine='plotly')\nThe plot_anomalies_decomp function takes in data from the anomalize() function, and returns a plot of the anomaly decomposition." }, { - "objectID": "reference/is_holiday.html#parameters", - "href": "reference/is_holiday.html#parameters", - "title": "is_holiday", + "objectID": "reference/plot_anomaly_decomp.html#parameters", + "href": "reference/plot_anomaly_decomp.html#parameters", + "title": "plot_anomalies_decomp", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\nUnion[str, datetime, List[Union[str, datetime]], pd.Series]\nThe dates to check for holiday status.\nrequired\n\n\ncountry_name\nstr\nThe name of the country for which to check the holiday status. 
Defaults to ‘UnitedStates’ if not specified.\n'UnitedStates'\n\n\ncountry\nstr\nAn alternative parameter to specify the country for holiday checking, overriding country_name.\nNone\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for generating the boolean series. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for generating a boolean of holidays or not holidays. This can be faster than using “pandas” for long series.\n'pandas'" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe input data for the plot from anomalize. It can be either a pandas DataFrame or a pandas DataFrameGroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data that contains the dates.\nrequired\n\n\nline_color\nstr\nThe color of the line in the plot. It is specified as a hexadecimal color code. The default value is “#2c3e50”.\n'#2c3e50'\n\n\nline_size\nOptional[float]\nThe line_size parameter determines the thickness of the lines in the plot. It is an optional parameter, so if you don’t specify a value, the default line size will be used.\nNone\n\n\nline_type\nstr\nThe line_type parameter specifies the type of line to be used in the plot. It can take the following values: - “solid” (default): a solid line - “dashed”: a dashed line\n'solid'\n\n\nline_alpha\nfloat\nThe line_alpha parameter controls the transparency of the lines in the plot. It accepts a float value between 0 and 1, where 0 means completely transparent and 1 means completely opaque.\n1.0\n\n\ny_intercept\nOptional[float]\nThe y_intercept parameter is an optional float value that specifies the y-coordinate of a horizontal line to be plotted on the graph. This line can be used to indicate a specific threshold or reference value. If not specified, no horizontal line will be plotted.\nNone\n\n\ny_intercept_color\nstr\nThe y_intercept_color parameter is used to specify the color of the y-intercept line on the plot. By default, it is set to \"#2c3e50\", which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\nx_intercept\nOptional[str]\nThe x_intercept parameter is used to specify the value on the x-axis where you want to draw a vertical line. This can be useful for highlighting a specific point or event in the data.\nNone\n\n\nx_intercept_color\nstr\nThe x_intercept_color parameter is used to specify the color of the vertical line representing the x-intercept on the plot. By default, it is set to “#2c3e50”, which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\ntitle\nstr\nThe title of the plot. It is set to “Anomaly Decomposition Plot” by default.\n'Anomaly Decomposition Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis of the plot. It is a string that represents the label text.\n''\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis of the plot. It is a string that represents the label text.\n''\n\n\nx_axis_date_labels\nstr\nThe x_axis_date_labels parameter is used to specify the format of the date labels on the x-axis of the plot. It accepts a string representing the format of the date labels. 
For example, “%b %Y” would display the month abbreviation and year (e.g., Jan 2019).\n'%b %Y'\n\n\nbase_size\nfloat\nThe base_size parameter determines the base font size for the plot. It is used to control the size of the text elements in the plot, such as axis labels, titles, and tick labels. The default value is 11, but you can adjust it to make the text larger or smaller\n11\n\n\nwidth\nOptional[int]\nThe width parameter determines the width of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with the default width.\nNone\n\n\nheight\nOptional[int]\nThe height parameter determines the height of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with a default height.\nNone\n\n\nengine\nstr\nThe engine parameter specifies the plotting engine to use. It can be set to either “plotly”, “plotnine”, or “matplotlib”.\n'plotly'" }, { - "objectID": "reference/is_holiday.html#returns", - "href": "reference/is_holiday.html#returns", - "title": "is_holiday", - "section": "Returns:", - "text": "Returns:\npd.Series: Series containing True if the date is a holiday, False otherwise." + "objectID": "reference/plot_anomaly_decomp.html#returns", + "href": "reference/plot_anomaly_decomp.html#returns", + "title": "plot_anomalies_decomp", + "section": "Returns", + "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\nA plotly, plotnine, or matplotlib plot." }, { - "objectID": "reference/is_holiday.html#raises", - "href": "reference/is_holiday.html#raises", - "title": "is_holiday", - "section": "Raises:", - "text": "Raises:\nValueError: If the specified country is not found in the holidays package." + "objectID": "reference/plot_anomaly_decomp.html#see-also", + "href": "reference/plot_anomaly_decomp.html#see-also", + "title": "plot_anomalies_decomp", + "section": "See Also", + "text": "See Also\n\nanomalize : Function that calculates the anomalies and formats the data for visualization.\nplot_anomalies : Function that plots the anomalies." 
}, { - "objectID": "reference/is_holiday.html#examples", - "href": "reference/is_holiday.html#examples", - "title": "is_holiday", - "section": "Examples:", - "text": "Examples:\n\nimport polars as pl\nimport pytimetk as tk\n\ntk.is_holiday('2023-01-01', country_name='UnitedStates')\n\n0 True\nName: is_holiday, dtype: bool\n\n\n\n# List of dates\ntk.is_holiday(['2023-01-01', '2023-01-02', '2023-01-03'], country_name='UnitedStates')\n\n0 True\n1 True\n2 False\nName: is_holiday, dtype: bool\n\n\n\n# Polars Series\ntk.is_holiday(pl.Series(['2023-01-01', '2023-01-02', '2023-01-03']), country_name='UnitedStates')\n\n0 True\n1 True\n2 False\nName: is_holiday, dtype: bool" + "objectID": "reference/plot_anomaly_decomp.html#examples", + "href": "reference/plot_anomaly_decomp.html#examples", + "title": "plot_anomalies_decomp", + "section": "Examples", + "text": "Examples\n\n# EXAMPLE 1: SINGLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Create a date range\ndate_rng = pd.date_range(start='2021-01-01', end='2024-01-01', freq='MS')\n\n# Generate some random data with a few outliers\nnp.random.seed(42)\ndata = np.random.randn(len(date_rng)) * 10 + 25 \ndata[3] = 100 # outlier\n\n# Create a DataFrame\ndf = pd.DataFrame(date_rng, columns=['date'])\ndf['value'] = data\n\n# Anomalize the data\nanomalize_df = tk.anomalize(\n df, \"date\", \"value\",\n method = \"twitter\", \n iqr_alpha = 0.10, \n clean_alpha = 0.75,\n clean = \"min_max\",\n verbose = True,\n)\n\n# Visualize the results\nanomalize_df.plot_anomalies_decomp(\"date\")\n\nUsing seasonal frequency of 12 observations\nUsing trend frequency of 37 observations\n\n\n\n \n\n\n\n# EXAMPLE 2: MULTIPLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset(\"walmart_sales_weekly\", parse_dates=[\"Date\"])[[\"id\", \"Date\", \"Weekly_Sales\"]]\n\nanomalize_df = (\n df\n .groupby('id') \n .anomalize(\n \"Date\", \"Weekly_Sales\", \n period = 52, \n trend = 52, \n threads = 1\n ) \n)\n\n# Visualize the decomposition results\n\n(\n anomalize_df\n .groupby(\"id\")\n .plot_anomalies_decomp(\n date_column = \"Date\",\n line_color = \"steelblue\",\n width = 1200,\n height = 800,\n x_axis_date_labels = \"%y\",\n engine = 'plotnine', \n )\n)\n\n\n\n\n\n\n\n<Figure Size: (1200 x 800)>" }, { - "objectID": "reference/get_date_summary.html", - "href": "reference/get_date_summary.html", - "title": "get_date_summary", + "objectID": "reference/get_seasonal_frequency.html", + "href": "reference/get_seasonal_frequency.html", + "title": "get_seasonal_frequency", "section": "", - "text": "get_date_summary(idx, engine='pandas')\nReturns a summary of the date-related information, including the number of dates, the time zone, the start date, and the end date." + "text": "get_seasonal_frequency(idx, force_regular=False, numeric=False, engine='pandas')\nThe get_seasonal_frequency function returns the seasonal period of a given time series or datetime index." }, { - "objectID": "reference/get_date_summary.html#parameters", - "href": "reference/get_date_summary.html#parameters", - "title": "get_date_summary", + "objectID": "reference/get_seasonal_frequency.html#parameters", + "href": "reference/get_seasonal_frequency.html#parameters", + "title": "get_seasonal_frequency", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\npd.Series or pd.DateTimeIndex\nThe parameter idx can be either a pandas Series or a pandas DateTimeIndex. 
It represents the dates or timestamps for which we want to generate a summary.\nrequired\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for generating a date summary. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for generating a date summary.\n'pandas'" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\nUnion[pd.Series, pd.DatetimeIndex]\nThe idx parameter can be either a pandas Series or a pandas DatetimeIndex. It represents the time index for which you want to calculate the seasonal frequency.\nrequired\n\n\nforce_regular\nbool\nforce_regular is a boolean parameter that determines whether to force the frequency to be regular. If set to True, the function will try to find a regular frequency even if the data is irregular. If set to False, the function will return the actual frequency of the data.\nFalse\n\n\nnumeric\nbool\nThe numeric parameter is a boolean flag that determines whether the output should be in numeric format or a string Pandas Frequency Alias. If numeric is set to True, the output will be a numeric representation of the seasonal period. If numeric is set to False (default), the output will\nFalse\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for generating a date summary. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for generating the time scale information.\n'pandas'" }, { - "objectID": "reference/get_date_summary.html#returns", - "href": "reference/get_date_summary.html#returns", - "title": "get_date_summary", + "objectID": "reference/get_seasonal_frequency.html#returns", + "href": "reference/get_seasonal_frequency.html#returns", + "title": "get_seasonal_frequency", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nA pandas DataFrame with the following columns: - date_n: The number of dates in the index. - date_tz: The time zone of the dates in the index. - date_start: The first date in the index. - date_end: The last date in the index." + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function get_seasonal_frequency returns the seasonal period based\non the input index. If the index is a pd.DatetimeIndex, it is converted to a pd.Series with the name “idx”. The function then calculates the summary frequency of the index using the get_frequency_summary function. 
It determines the scale and unit of the frequency and adjusts the unit if the scale is" }, { - "objectID": "reference/get_date_summary.html#examples", - "href": "reference/get_date_summary.html#examples", - "title": "get_date_summary", + "objectID": "reference/get_seasonal_frequency.html#examples", + "href": "reference/get_seasonal_frequency.html#examples", + "title": "get_seasonal_frequency", "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])\n\ntk.get_date_summary(df['order_date'], engine='pandas')\n\ntk.get_date_summary(df['order_date'], engine='polars')\n\n\n\n\n\n\n\n\ndate_n\ndate_tz\ndate_start\ndate_end\n\n\n\n\n0\n2466\nNone\n2011-01-07\n2011-12-28" + "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n\ndates = pd.date_range(start='2021-01-01', end='2024-01-01', freq='MS')\n\ntk.get_seasonal_frequency(dates)\n\n'1Y'" }, { - "objectID": "reference/palette_timetk.html", - "href": "reference/palette_timetk.html", - "title": "palette_timetk", + "objectID": "reference/plot_correlation_funnel.html", + "href": "reference/plot_correlation_funnel.html", + "title": "plot_correlation_funnel", "section": "", - "text": "palette_timetk()\nThe function palette_timetk returns a dictionary of color codes for various colors in the timetk theme." + "text": "plot_correlation_funnel(data, limits=(-1, 1), alpha=1.0, title='Correlation Funnel Plot', x_lab='Correlation', y_lab='Feature', base_size=11, width=None, height=None, engine='plotly')\nThe plot_correlation_funnel function generates a correlation funnel plot using either Plotly or plotnine in Python." }, { - "objectID": "reference/palette_timetk.html#returns", - "href": "reference/palette_timetk.html#returns", - "title": "palette_timetk", + "objectID": "reference/plot_correlation_funnel.html#parameters", + "href": "reference/plot_correlation_funnel.html#parameters", + "title": "plot_correlation_funnel", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame\nThe data parameter is a pandas DataFrame that contains the correlation values and corresponding features. It should have two columns: ‘correlation’ and ‘feature’.\nrequired\n\n\nlimits\ntuple\nThe limits parameter is a tuple that specifies the lower and upper limits of the x-axis in the correlation funnel plot. By default, the limits are set to (-1, 1), which means the x-axis will range from -1 to 1.\n(-1, 1)\n\n\nalpha\nfloat\nThe alpha parameter determines the transparency of the data points in the plot. A value of 1.0 means the points are fully opaque, while a value less than 1.0 makes the points more transparent.\n1.0\n\n\ntitle\nstr\nThe title of the plot.\n'Correlation Funnel Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis of the plot. It represents the label for the correlation values.\n'Correlation'\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis in the correlation funnel plot. It represents the name or description of the feature being plotted.\n'Feature'\n\n\nbase_size\nfloat\nThe base_size parameter is used to set the base font size for the plot. It is multiplied by different factors to determine the font sizes for various elements of the plot, such as the title, axis labels, tick labels, legend, and annotations.\n11\n\n\nwidth\nOptional[int]\nThe width parameter is used to specify the width of the plot in pixels. 
It determines the horizontal size of the plot.\nNone\n\n\nheight\nOptional[int]\nThe height parameter is used to specify the height of the plot in pixels. It determines the vertical size of the plot when it is rendered.\nNone\n\n\nengine\nstr\nThe engine parameter determines the plotting engine to be used. It can be set to either “plotly” or “plotnine”. If set to “plotly”, the function will generate an interactive plot using the Plotly library. If set to “plotnine”, it will generate a static plot using the plotnine library. The default value is “plotly”.\n'plotly'" + }, + { + "objectID": "reference/plot_correlation_funnel.html#returns", + "href": "reference/plot_correlation_funnel.html#returns", + "title": "plot_correlation_funnel", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function palette_timetk returns a dictionary containing color\nnames as keys and their corresponding hexadecimal color codes as values:" + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function plot_correlation_funnel returns a plotly figure object if the engine parameter is\nset to ‘plotly’, and a plotnine object if the engine parameter is set to ‘plotnine’." }, { - "objectID": "reference/palette_timetk.html#examples", - "href": "reference/palette_timetk.html#examples", - "title": "palette_timetk", + "objectID": "reference/plot_correlation_funnel.html#see-also", + "href": "reference/plot_correlation_funnel.html#see-also", + "title": "plot_correlation_funnel", + "section": "See Also", + "text": "See Also\n\nbinarize(): Binarize the dataset into 1’s and 0’s.\ncorrelate(): Calculate the correlation between features in a pandas DataFrame." + }, + { + "objectID": "reference/plot_correlation_funnel.html#examples", + "href": "reference/plot_correlation_funnel.html#examples", + "title": "plot_correlation_funnel", "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\n\ntk.palette_timetk()\n\n{'blue': '#2c3e50',\n 'red': '#e31a1c',\n 'green': '#18BC9C',\n 'yellow': '#CCBE93',\n 'steel_blue': '#a6cee3',\n 'navy_blue': '#1f78b4',\n 'light_green': '#b2df8a',\n 'pink': '#fb9a99',\n 'light_orange': '#fdbf6f',\n 'orange': '#ff7f00',\n 'light_purple': '#cab2d6',\n 'purple': '#6a3d9a'}" + "text": "Examples\n\n# NON-TIMESERIES EXAMPLE ----\n\nimport pandas as pd\nimport numpy as np\nimport pytimetk as tk\n\n# Set a random seed for reproducibility\nnp.random.seed(0)\n\n# Define the number of rows for your DataFrame\nnum_rows = 200\n\n# Create fake data for the columns\ndata = {\n 'Age': np.random.randint(18, 65, size=num_rows),\n 'Gender': np.random.choice(['Male', 'Female'], size=num_rows),\n 'Marital_Status': np.random.choice(['Single', 'Married', 'Divorced'], size=num_rows),\n 'City': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'], size=num_rows),\n 'Years_Playing': np.random.randint(0, 30, size=num_rows),\n 'Average_Income': np.random.randint(20000, 100000, size=num_rows),\n 'Member_Status': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], size=num_rows),\n 'Number_Children': np.random.randint(0, 5, size=num_rows),\n 'Own_House_Flag': np.random.choice([True, False], size=num_rows),\n 'Own_Car_Count': np.random.randint(0, 3, size=num_rows),\n 'PersonId': range(1, num_rows + 1), # Add a PersonId column as a row count\n 'Client': np.random.choice(['A', 'B'], size=num_rows) # Add a Client column with random values 'A' or 'B'\n}\n\n# Create a DataFrame\ndf = pd.DataFrame(data)\n\n# Binarize the data\ndf_binarized = 
df.binarize(n_bins=4, thresh_infreq=0.01, name_infreq=\"-OTHER\", one_hot=True)\n\ndf_binarized.glimpse() \n\n[]\n<class 'pandas.core.frame.DataFrame'>: 200 rows of 42 columns\nAge__18.0_29.5: uint8 [0, 1, 1, 1, 0, 1, 0 ...\nAge__29.5_41.0: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nAge__41.0_52.5: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nAge__52.5_64.0: uint8 [1, 0, 0, 0, 1, 0, 0 ...\nYears_Playing__0.0_7.2: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nYears_Playing__7.2_14.5: uint8 [0, 0, 1, 0, 1, 0, 1 ...\nYears_Playing__14.5_21.8: uint8 [1, 0, 0, 0, 0, 1, 0 ...\nYears_Playing__21.8_29.0: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nAverage_Income__20131.0_39881.0: uint8 [0, 0, 1, 0, 0, 0, 0 ...\nAverage_Income__39881.0_59631.0: uint8 [0, 0, 0, 1, 1, 0, 1 ...\nAverage_Income__59631.0_79381.0: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nAverage_Income__79381.0_99210.0: uint8 [1, 0, 0, 0, 0, 1, 0 ...\nPersonId__1.0_50.8: uint8 [1, 1, 1, 1, 1, 1, 1 ...\nPersonId__50.8_100.5: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nPersonId__100.5_150.2: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nPersonId__150.2_200.2: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nGender__Female: uint8 [1, 0, 0, 0, 1, 0, 1 ...\nGender__Male: uint8 [0, 1, 1, 1, 0, 1, 0 ...\nMarital_Status__Divorced: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nMarital_Status__Married: uint8 [1, 1, 0, 0, 1, 0, 0 ...\nMarital_Status__Single: uint8 [0, 0, 1, 1, 0, 1, 1 ...\nCity__Chicago: uint8 [0, 0, 1, 0, 0, 1, 0 ...\nCity__Houston: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nCity__Los Angeles: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nCity__Miami: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nCity__New York: uint8 [1, 0, 0, 1, 1, 0, 0 ...\nMember_Status__Bronze: uint8 [1, 0, 1, 0, 0, 0, 0 ...\nMember_Status__Gold: uint8 [0, 0, 0, 0, 0, 1, 1 ...\nMember_Status__Platinum: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nMember_Status__Silver: uint8 [0, 1, 0, 0, 1, 0, 0 ...\nNumber_Children__0: uint8 [0, 0, 1, 0, 0, 0, 0 ...\nNumber_Children__1: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nNumber_Children__2: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nNumber_Children__3: uint8 [0, 1, 0, 0, 0, 1, 0 ...\nNumber_Children__4: uint8 [1, 0, 0, 0, 1, 0, 0 ...\nOwn_House_Flag__0: uint8 [1, 1, 0, 0, 1, 0, 1 ...\nOwn_House_Flag__1: uint8 [0, 0, 1, 1, 0, 1, 0 ...\nOwn_Car_Count__0: uint8 [0, 1, 0, 0, 1, 0, 0 ...\nOwn_Car_Count__1: uint8 [0, 0, 0, 1, 0, 1, 1 ...\nOwn_Car_Count__2: uint8 [1, 0, 1, 0, 0, 0, 0 ...\nClient__A: uint8 [1, 1, 1, 1, 1, 1, 1 ...\nClient__B: uint8 [0, 0, 0, 0, 0, 0, 0 ...\n\n\n\ndf_correlated = df_binarized.correlate(target='Member_Status__Platinum')\ndf_correlated.head(10)\n\n\n\n\n\n\n\n\nfeature\nbin\ncorrelation\n\n\n\n\n28\nMember_Status\nPlatinum\n1.000000\n\n\n26\nMember_Status\nBronze\n-0.341351\n\n\n29\nMember_Status\nSilver\n-0.332799\n\n\n27\nMember_Status\nGold\n-0.298637\n\n\n30\nNumber_Children\n0\n0.205230\n\n\n8\nAverage_Income\n20131.0_39881.0\n-0.151215\n\n\n0\nAge\n18.0_29.5\n-0.135522\n\n\n11\nAverage_Income\n79381.0_99210.0\n0.128508\n\n\n33\nNumber_Children\n3\n-0.112216\n\n\n9\nAverage_Income\n39881.0_59631.0\n0.109999\n\n\n\n\n\n\n\n\n# Interactive\ndf_correlated.plot_correlation_funnel(\n engine='plotly', \n height=600\n)\n\n\n \n\n\n\n# Static\ndf_correlated.plot_correlation_funnel(\n engine ='plotnine', \n height = 900\n)\n\n\n\n\n<Figure Size: (700 x 900)>" }, { - "objectID": "reference/get_pandas_frequency.html", - "href": "reference/get_pandas_frequency.html", - "title": "get_pandas_frequency", + "objectID": "reference/glimpse.html", + "href": "reference/glimpse.html", + "title": "glimpse", "section": "", - "text": "get_pandas_frequency(idx, force_regular=False)\nGet the frequency 
of a pandas Series or DatetimeIndex.\nThe function get_pandas_frequency takes a Pandas Series or DatetimeIndex as input and returns the inferred frequency of the index, with an option to force regular frequency.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nidx\npd.Series or pd.DatetimeIndex\nThe idx parameter can be either a pd.Series or a pd.DatetimeIndex. It represents the index or the time series data for which we want to determine the frequency.\nrequired\n\n\nforce_regular\nbool\nThe force_regular parameter is a boolean flag that determines whether to force the frequency to be regular. If set to True, the function will convert irregular frequencies to their regular counterparts. For example, if the inferred frequency is ‘B’ (business days), it will be converted to ‘D’ (calendar days). The default value is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe frequency of the given pandas series or datetime index." + "text": "glimpse(data, max_width=76, engine='pandas')\nTakes a pandas DataFrame and prints a summary of its dimensions, column names, data types, and the first few values of each column." }, { - "objectID": "reference/get_pandas_frequency.html#parameters", - "href": "reference/get_pandas_frequency.html#parameters", - "title": "get_pandas_frequency", - "section": "", - "text": "Name\nType\nDescription\nDefault\n\n\n\n\nidx\npd.Series or pd.DatetimeIndex\nThe idx parameter can be either a pd.Series or a pd.DatetimeIndex. It represents the index or the time series data for which we want to determine the frequency.\nrequired\n\n\nforce_regular\nbool\nThe force_regular parameter is a boolean flag that determines whether to force the frequency to be regular. If set to True, the function will convert irregular frequencies to their regular counterparts. For example, if the inferred frequency is ‘B’ (business days), it will be converted to ‘D’ (calendar days). The default value is False.\nFalse" + "objectID": "reference/glimpse.html#parameters", + "href": "reference/glimpse.html#parameters", + "title": "glimpse", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame\nThe data parameter is a pandas DataFrame that contains the data you want to glimpse at. It is the main input to the glimpse function.\nrequired\n\n\nmax_width\nint\nThe max_width parameter is an optional parameter that specifies the maximum width of each line when printing the glimpse of the DataFrame. If not provided, the default value is set to 76.\n76\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for generating a glimpse. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for generating the glimpse.\n'pandas'" }, { - "objectID": "reference/get_pandas_frequency.html#returns", - "href": "reference/get_pandas_frequency.html#returns", - "title": "get_pandas_frequency", - "section": "", - "text": "Type\nDescription\n\n\n\n\nstr\nThe frequency of the given pandas series or datetime index." 
+ "objectID": "reference/glimpse.html#examples", + "href": "reference/glimpse.html#examples", + "title": "glimpse", + "section": "Examples", + "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('walmart_sales_weekly', parse_dates=['Date'])\n\ndf.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 1001 rows of 17 columns\nid: object ['1_1', '1_1', '1_1', '1_1', '1_1', '1_ ...\nStore: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nDept: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nDate: datetime64[ns] [Timestamp('2010-02-05 00:00:00'), Time ...\nWeekly_Sales: float64 [24924.5, 46039.49, 41595.55, 19403.54, ...\nIsHoliday: bool [False, True, False, False, False, Fals ...\nType: object ['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A' ...\nSize: int64 [151315, 151315, 151315, 151315, 151315 ...\nTemperature: float64 [42.31, 38.51, 39.93, 46.63, 46.5, 57.7 ...\nFuel_Price: float64 [2.572, 2.548, 2.514, 2.561, 2.625, 2.6 ...\nMarkDown1: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nMarkDown2: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nMarkDown3: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nMarkDown4: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nMarkDown5: float64 [nan, nan, nan, nan, nan, nan, nan, nan ...\nCPI: float64 [211.0963582, 211.2421698, 211.2891429, ...\nUnemployment: float64 [8.106, 8.106, 8.106, 8.106, 8.106, 8.1 ..." }, { - "objectID": "reference/index.html", - "href": "reference/index.html", - "title": "Function reference", + "objectID": "reference/augment_ppo.html", + "href": "reference/augment_ppo.html", + "title": "augment_ppo", "section": "", - "text": "Visualize time series data with one line of code.\n\n\n\nplot_timeseries\nCreates time series plots using different plotting engines such as Plotnine,\n\n\n\n\n\n\nBend time series data to your will.\n\n\n\nsummarize_by_time\nSummarize a DataFrame or GroupBy object by time.\n\n\napply_by_time\nApply for time series.\n\n\npad_by_time\nMake irregular time series regular by padding with missing dates.\n\n\nfilter_by_time\nFilters a DataFrame or GroupBy object based on a specified date range.\n\n\nfuture_frame\nExtend a DataFrame or GroupBy object with future dates.\n\n\n\n\n\n\nDetect anomalies in time series data.\n\n\n\nanomalize\nDetects anomalies in time series data, either for a single time\n\n\nplot_anomalies\nCreates plot of anomalies in time series data using Plotly, Matplotlib,\n\n\nplot_anomalies_decomp\nThe plot_anomalies_decomp function takes in data from the anomalize()\n\n\nplot_anomalies_cleaned\nThe plot_anomalies_cleaned function takes in data from the anomalize()\n\n\n\n\n\n\nVisualize correlation on any tabular dataset (not just for Time Series).\n\n\n\nbinarize\nThe binarize function prepares data for correlate, which is used for analyzing correlationfunnel plots.\n\n\ncorrelate\nThe correlate function calculates the correlation between a target variable and all other\n\n\nplot_correlation_funnel\nThe plot_correlation_funnel function generates a correlation funnel plot using either Plotly or\n\n\n\n\n\n\nAdding Features to Time Series DataFrames (Augmenting)\n\n\n\naugment_timeseries_signature\nThe function augment_timeseries_signature takes a DataFrame and a date\n\n\naugment_holiday_signature\nEngineers 4 different holiday features from a single datetime for 137 countries\n\n\naugment_lags\nAdds lags to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_leads\nAdds leads to a Pandas DataFrame or DataFrameGroupBy 
object.\n\n\naugment_diffs\nAdds differences and percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_pct_change\nAdds percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_rolling\nApply one or more Series-based rolling functions and window sizes to one or more columns of a DataFrame.\n\n\naugment_rolling_apply\nApply one or more DataFrame-based rolling functions and window sizes to one\n\n\naugment_expanding\nApply one or more Series-based expanding functions to one or more columns of a DataFrame.\n\n\naugment_expanding_apply\nApply one or more DataFrame-based expanding functions to one or more columns of a DataFrame.\n\n\naugment_ewm\nAdd Exponential Weighted Moving (EWM) window functions to a DataFrame or\n\n\naugment_fourier\nAdds Fourier transforms to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_hilbert\nApply the Hilbert transform to specified columns of a DataFrame or\n\n\naugment_wavelet\nApply the Wavely transform to specified columns of a DataFrame or\n\n\n\n\n\n\nPython implementation of the R package tsfeatures.\n\n\n\nts_features\nExtracts aggregated time series features from a DataFrame or DataFrameGroupBy object using the tsfeatures package.\n\n\nts_summary\nComputes summary statistics for a time series data, either for the entire\n\n\n\n\n\n\nTime series cross validation.\n\n\n\nTimeSeriesCV\nTimeSeriesCV is a subclass of TimeBasedSplit with default mode set to ‘backward’\n\n\n\n\n\n\nMomentum indicators for financial time series data.\n\n\n\naugment_macd\nCalculate MACD for a given financial instrument using either pandas or polars engine.\n\n\naugment_ppo\nCalculate PPO for a given financial instrument using either pandas or polars engine.\n\n\naugment_rsi\nThe augment_rsi function calculates the Relative Strength Index (RSI) for a given financial\n\n\naugment_cmo\nThe augment_cmo function calculates the Chande Momentum Oscillator (CMO) for a given financial\n\n\naugment_roc\nAdds rate of change (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_qsmomentum\nThe function augment_qsmomentum calculates Quant Science Momentum for financial data.\n\n\n\n\n\n\nVolatility indicators for financial time series data.\n\n\n\naugment_bbands\nThe augment_bbands function is used to calculate Bollinger Bands for a given dataset and return\n\n\naugment_atr\nThe augment_atr function is used to calculate Average True Range (ATR) and\n\n\n\n\n\n\nTime series functions that generate / manipulate Pandas Series.\n\n\n\nmake_future_timeseries\nMake future dates for a time series.\n\n\nmake_weekday_sequence\nGenerate a sequence of weekday dates within a specified date range,\n\n\nmake_weekend_sequence\nGenerate a sequence of weekend dates within a specified date range,\n\n\nget_date_summary\nReturns a summary of the date-related information, including the number of\n\n\nget_frequency_summary\nMore robust version of pandas inferred frequency.\n\n\nget_diff_summary\nCalculates summary statistics of the time differences between consecutive values in a datetime index.\n\n\nget_frequency\nGet the frequency of a pandas Series or DatetimeIndex.\n\n\nget_seasonal_frequency\nThe get_seasonal_frequency function returns the seasonal period of a given\n\n\nget_trend_frequency\nThe get_trend_frequency function returns the trend period of a given time\n\n\nget_timeseries_signature\nConvert a timestamp to a set of 29 time series 
features.\n\n\nget_holiday_signature\nEngineers 4 different holiday features from a single datetime for 137 countries\n\n\n\n\n\n\nHelper functions to make your life easier.\n\n\n\nfloor_date\nRobust date flooring.\n\n\nceil_date\nRobust date ceiling.\n\n\nis_holiday\nCheck if a given list of dates are holidays for a specified country.\n\n\nweek_of_month\nThe “week_of_month” function calculates the week number of a given date\n\n\ntimeseries_unit_frequency_table\nThe function timeseries_unit_frequency_table returns a pandas DataFrame\n\n\ntime_scale_template\nThe function time_scale_template returns a table with time scale\n\n\n\n\n\n\nHelper functions to make your life easier.\n\n\n\ntheme_timetk\nReturns a plotnine theme with timetk styles applied, allowing for\n\n\npalette_timetk\nThe function palette_timetk returns a dictionary of color codes for\n\n\n\n\n\n\n\n\n\nglimpse\nTakes a pandas DataFrame and prints a summary of its dimensions, column\n\n\nparallel_apply\nThe parallel_apply function parallelizes the application of a function on\n\n\nprogress_apply\nAdds a progress bar to pandas apply().\n\n\ndrop_zero_variance\nThe function drop_zero_variance takes a pandas DataFrame as input and returns a new DataFrame with\n\n\ntransform_columns\nThe function transform_columns applies a user-provided function to specified columns in a pandas DataFrame.\n\n\nflatten_multiindex_column_names\nTakes a DataFrame as input and flattens the column\n\n\n\n\n\n\nPractice pytimetk with 13 complementary time series datasets.\n\n\n\nget_available_datasets\nGet a list of 12 datasets that can be loaded with pytimetk.load_dataset.\n\n\nload_dataset\nLoad one of 12 Time Series Datasets." + "text": "augment_ppo(data, date_column, close_column, fast_period=12, slow_period=26, reduce_memory=False, engine='pandas')\nCalculate PPO for a given financial instrument using either pandas or polars engine." 
}, { - "objectID": "reference/index.html#data-visualization", - "href": "reference/index.html#data-visualization", - "title": "Function reference", - "section": "", - "text": "Visualize time series data with one line of code.\n\n\n\nplot_timeseries\nCreates time series plots using different plotting engines such as Plotnine," + "objectID": "reference/augment_ppo.html#parameters", + "href": "reference/augment_ppo.html#parameters", + "title": "augment_ppo", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nPandas DataFrame or GroupBy object containing financial data.\nrequired\n\n\ndate_column\nstr\nName of the column containing date information.\nrequired\n\n\nclose_column\nstr\nName of the column containing closing price data.\nrequired\n\n\nfast_period\nint\nNumber of periods for the fast EMA in PPO calculation.\n12\n\n\nslow_period\nint\nNumber of periods for the slow EMA in PPO calculation.\n26\n\n\nreduce_memory\nbool\nWhether to reduce memory usage of the data before performing the calculation.\nFalse\n\n\nengine\nstr\nComputation engine to use (‘pandas’ or ‘polars’).\n'pandas'" }, { - "objectID": "reference/index.html#wrangling-pandas-time-series-dataframes", - "href": "reference/index.html#wrangling-pandas-time-series-dataframes", - "title": "Function reference", - "section": "", - "text": "Bend time series data to your will.\n\n\n\nsummarize_by_time\nSummarize a DataFrame or GroupBy object by time.\n\n\napply_by_time\nApply for time series.\n\n\npad_by_time\nMake irregular time series regular by padding with missing dates.\n\n\nfilter_by_time\nFilters a DataFrame or GroupBy object based on a specified date range.\n\n\nfuture_frame\nExtend a DataFrame or GroupBy object with future dates." + "objectID": "reference/augment_ppo.html#returns", + "href": "reference/augment_ppo.html#returns", + "title": "augment_ppo", + "section": "Returns", + "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nDataFrame with PPO values added." + }, + { + "objectID": "reference/augment_ppo.html#notes", + "href": "reference/augment_ppo.html#notes", + "title": "augment_ppo", + "section": "Notes", + "text": "Notes\nThe Percentage Price Oscillator (PPO) is a momentum oscillator that measures the difference between two moving averages as a percentage of the larger moving average. The PPO is best used to confirm the direction of the price trend and gauge its momentum.\nThe PPO is calculated by subtracting a long-term EMA from a short-term EMA, then dividing the result by the long-term EMA, and finally multiplying by 100.\nAdvantages Over MACD: The PPO’s percentage-based calculation allows for easier comparisons between different securities, regardless of their price levels. This is a distinct advantage over the MACD, which provides absolute values and can be less meaningful when comparing stocks with significantly different prices." 
}, { - "objectID": "reference/index.html#anomaly-detection", - "href": "reference/index.html#anomaly-detection", - "title": "Function reference", - "section": "", - "text": "Detect anomalies in time series data.\n\n\n\nanomalize\nDetects anomalies in time series data, either for a single time\n\n\nplot_anomalies\nCreates plot of anomalies in time series data using Plotly, Matplotlib,\n\n\nplot_anomalies_decomp\nThe plot_anomalies_decomp function takes in data from the anomalize()\n\n\nplot_anomalies_cleaned\nThe plot_anomalies_cleaned function takes in data from the anomalize()" + "objectID": "reference/augment_ppo.html#examples", + "href": "reference/augment_ppo.html#examples", + "title": "augment_ppo", + "section": "Examples", + "text": "Examples\n\nimport pandas as pd\nimport pytimetk as tk\n\ndf = tk.load_dataset(\"stocks_daily\", parse_dates = ['date'])\n\ndf\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nMETA\n2013-01-02\n27.440001\n28.180000\n27.420000\n28.000000\n69846400\n28.000000\n\n\n1\nMETA\n2013-01-03\n27.879999\n28.469999\n27.590000\n27.770000\n63140600\n27.770000\n\n\n2\nMETA\n2013-01-04\n28.010000\n28.930000\n27.830000\n28.760000\n72715400\n28.760000\n\n\n3\nMETA\n2013-01-07\n28.690001\n29.790001\n28.650000\n29.420000\n83781800\n29.420000\n\n\n4\nMETA\n2013-01-08\n29.510000\n29.600000\n28.860001\n29.059999\n45871300\n29.059999\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16189\nGOOG\n2023-09-15\n138.800003\n139.360001\n137.179993\n138.300003\n48947600\n138.300003\n\n\n16190\nGOOG\n2023-09-18\n137.630005\n139.929993\n137.630005\n138.960007\n16233600\n138.960007\n\n\n16191\nGOOG\n2023-09-19\n138.250000\n139.175003\n137.500000\n138.830002\n15479100\n138.830002\n\n\n16192\nGOOG\n2023-09-20\n138.830002\n138.839996\n134.520004\n134.589996\n21473500\n134.589996\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n\n\n\n\n16194 rows × 8 columns\n\n\n\n\n# PPO pandas engine\ndf_ppo = (\n df\n .groupby('symbol')\n .augment_ppo(\n date_column = 'date', \n close_column = 'close', \n fast_period = 12, \n slow_period = 26, \n engine = \"pandas\"\n )\n)\n\ndf_ppo.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 16194 rows of 9 columns\nsymbol: object ['AAPL', 'AAPL', 'AAPL', 'AAPL' ...\ndate: datetime64[ns] [Timestamp('2013-01-02 00:00:00 ...\nopen: float64 [19.779285430908203, 19.5671424 ...\nhigh: float64 [19.821428298950195, 19.6310710 ...\nlow: float64 [19.343929290771484, 19.3214282 ...\nclose: float64 [19.608213424682617, 19.3607139 ...\nvolume: int64 [560518000, 352965200, 59433360 ...\nadjusted: float64 [16.791179656982422, 16.5792407 ...\nclose_ppo_line_12_26: float64 [0.0, -0.10078442036791524, -0. ...\n\n\n\n# PPO polars engine\ndf_ppo = (\n df\n .groupby('symbol')\n .augment_ppo(\n date_column = 'date', \n close_column = 'close', \n fast_period = 12, \n slow_period = 26, \n engine = \"polars\"\n )\n)\n\ndf_ppo.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 16194 rows of 9 columns\nsymbol: object ['META', 'META', 'META', 'META' ...\ndate: datetime64[ns] [Timestamp('2013-01-02 00:00:00 ...\nopen: float64 [27.440000534057617, 27.8799991 ...\nhigh: float64 [28.18000030517578, 28.46999931 ...\nlow: float64 [27.420000076293945, 27.5900001 ...\nclose: float64 [28.0, 27.770000457763672, 28.7 ...\nvolume: int64 [69846400, 63140600, 72715400, ...\nadjusted: float64 [28.0, 27.770000457763672, 28.7 ...\nclose_ppo_line_12_26: float64 [0.0, -0.06556683019189882, 0.1 ..." 
}, { - "objectID": "reference/index.html#correlation-funnel", - "href": "reference/index.html#correlation-funnel", - "title": "Function reference", + "objectID": "reference/augment_bbands.html", + "href": "reference/augment_bbands.html", + "title": "augment_bbands", "section": "", - "text": "Visualize correlation on any tabular dataset (not just for Time Series).\n\n\n\nbinarize\nThe binarize function prepares data for correlate, which is used for analyzing correlationfunnel plots.\n\n\ncorrelate\nThe correlate function calculates the correlation between a target variable and all other\n\n\nplot_correlation_funnel\nThe plot_correlation_funnel function generates a correlation funnel plot using either Plotly or" + "text": "augment_bbands(data, date_column, close_column, periods=20, std_dev=2, reduce_memory=False, engine='pandas')\nThe augment_bbands function is used to calculate Bollinger Bands for a given dataset and return the augmented dataset." }, { - "objectID": "reference/index.html#feature-engineereing", - "href": "reference/index.html#feature-engineereing", - "title": "Function reference", - "section": "", - "text": "Adding Features to Time Series DataFrames (Augmenting)\n\n\n\naugment_timeseries_signature\nThe function augment_timeseries_signature takes a DataFrame and a date\n\n\naugment_holiday_signature\nEngineers 4 different holiday features from a single datetime for 137 countries\n\n\naugment_lags\nAdds lags to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_leads\nAdds leads to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_diffs\nAdds differences and percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_pct_change\nAdds percentage difference (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_rolling\nApply one or more Series-based rolling functions and window sizes to one or more columns of a DataFrame.\n\n\naugment_rolling_apply\nApply one or more DataFrame-based rolling functions and window sizes to one\n\n\naugment_expanding\nApply one or more Series-based expanding functions to one or more columns of a DataFrame.\n\n\naugment_expanding_apply\nApply one or more DataFrame-based expanding functions to one or more columns of a DataFrame.\n\n\naugment_ewm\nAdd Exponential Weighted Moving (EWM) window functions to a DataFrame or\n\n\naugment_fourier\nAdds Fourier transforms to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_hilbert\nApply the Hilbert transform to specified columns of a DataFrame or\n\n\naugment_wavelet\nApply the Wavely transform to specified columns of a DataFrame or" + "objectID": "reference/augment_bbands.html#parameters", + "href": "reference/augment_bbands.html#parameters", + "title": "augment_bbands", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe data parameter is the input data that can be either a pandas DataFrame or a pandas DataFrameGroupBy object. 
It contains the data on which the Bollinger Bands will be calculated.\nrequired\n\n\ndate_column\nstr\nThe date_column parameter is a string that specifies the name of the column in the data DataFrame that contains the dates.\nrequired\n\n\nclose_column\nstr\nThe close_column parameter is a string that specifies the name of the column in the data DataFrame that contains the closing prices of the asset.\nrequired\n\n\nperiods\nUnion[int, Tuple[int, int], List[int]]\nThe periods parameter in the augment_bbands function can be specified as an integer, a tuple, or a list. This parameter specifies the number of rolling periods to use when calculating the Bollinger Bands.\n20\n\n\nstd_dev\nfloat\nThe std_dev parameter is a float that represents the number of standard deviations to use when calculating the Bollinger Bands. Bollinger Bands are a technical analysis tool that consists of a middle band (usually a simple moving average) and an upper and lower band that are typically two standard deviations away from the middle band. The std_dev parameter specifies the number of standard deviations. std_dev can be a list of floats as well.\n2\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is a boolean flag that indicates whether or not to reduce the memory usage of the input data before performing the calculation. If set to True, the function will attempt to reduce the memory usage of the input data using techniques such as downcasting numeric columns and converting object columns\nFalse\n\n\nengine\nstr\nThe engine parameter specifies the computation engine to use for calculating the Bollinger Bands. It can take two values: ‘pandas’ or ‘polars’. If ‘pandas’ is selected, the function will use the pandas library for computation. If ‘polars’ is selected, the function will use the polars library for computation.\n'pandas'"
  },
  {
    "objectID": "reference/augment_bbands.html#returns",
    "href": "reference/augment_bbands.html#returns",
    "title": "augment_bbands",
    "section": "Returns",
    "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe function augment_bbands returns a pandas DataFrame."
  },
  {
    "objectID": "reference/augment_bbands.html#notes",
    "href": "reference/augment_bbands.html#notes",
    "title": "augment_bbands",
    "section": "Notes",
    "text": "Notes\nBollinger Bands are a technical analysis tool developed by John Bollinger in the 1980s. They are used to measure the ‘volatility’ of a stock price or other financial instrument. 
This indicator consists of three lines which are plotted in relation to an asset’s price:\n\nThe Middle Band: This is typically a simple moving average (SMA) of the closing prices over a certain number of days (commonly 20 days).\nThe Upper Band: This is set a specified number of standard deviations (usually two) above the middle band.\nThe Lower Band: This is set the same number of standard deviations (again, usually two) below the middle band.\n\nVolatility Indicator: The width of the bands is a measure of volatility. When the bands widen, it indicates increased volatility, and when they contract, it suggests decreased volatility.\nOverbought and Oversold Conditions: Prices are considered overbought near the upper band and oversold near the lower band. However, these conditions do not necessarily signal a reversal; prices can remain overbought or oversold for extended periods during strong trends." }, { - "objectID": "reference/index.html#finance-module-momentum-indicators", - "href": "reference/index.html#finance-module-momentum-indicators", - "title": "Function reference", - "section": "", - "text": "Momentum indicators for financial time series data.\n\n\n\naugment_macd\nCalculate MACD for a given financial instrument using either pandas or polars engine.\n\n\naugment_ppo\nCalculate PPO for a given financial instrument using either pandas or polars engine.\n\n\naugment_rsi\nThe augment_rsi function calculates the Relative Strength Index (RSI) for a given financial\n\n\naugment_cmo\nThe augment_cmo function calculates the Chande Momentum Oscillator (CMO) for a given financial\n\n\naugment_roc\nAdds rate of change (percentage change) to a Pandas DataFrame or DataFrameGroupBy object.\n\n\naugment_qsmomentum\nThe function augment_qsmomentum calculates Quant Science Momentum for financial data." 
+ "objectID": "reference/augment_bbands.html#examples", + "href": "reference/augment_bbands.html#examples", + "title": "augment_bbands", + "section": "Examples", + "text": "Examples\n\nimport pandas as pd\nimport pytimetk as tk\n\ndf = tk.load_dataset(\"stocks_daily\", parse_dates = ['date'])\n\ndf\n\n\n\n\n\n\n\n\nsymbol\ndate\nopen\nhigh\nlow\nclose\nvolume\nadjusted\n\n\n\n\n0\nMETA\n2013-01-02\n27.440001\n28.180000\n27.420000\n28.000000\n69846400\n28.000000\n\n\n1\nMETA\n2013-01-03\n27.879999\n28.469999\n27.590000\n27.770000\n63140600\n27.770000\n\n\n2\nMETA\n2013-01-04\n28.010000\n28.930000\n27.830000\n28.760000\n72715400\n28.760000\n\n\n3\nMETA\n2013-01-07\n28.690001\n29.790001\n28.650000\n29.420000\n83781800\n29.420000\n\n\n4\nMETA\n2013-01-08\n29.510000\n29.600000\n28.860001\n29.059999\n45871300\n29.059999\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n16189\nGOOG\n2023-09-15\n138.800003\n139.360001\n137.179993\n138.300003\n48947600\n138.300003\n\n\n16190\nGOOG\n2023-09-18\n137.630005\n139.929993\n137.630005\n138.960007\n16233600\n138.960007\n\n\n16191\nGOOG\n2023-09-19\n138.250000\n139.175003\n137.500000\n138.830002\n15479100\n138.830002\n\n\n16192\nGOOG\n2023-09-20\n138.830002\n138.839996\n134.520004\n134.589996\n21473500\n134.589996\n\n\n16193\nGOOG\n2023-09-21\n132.389999\n133.190002\n131.089996\n131.360001\n22042700\n131.360001\n\n\n\n\n16194 rows × 8 columns\n\n\n\n\n# BBANDS pandas engine\ndf_bbands = (\n df\n .groupby('symbol')\n .augment_bbands(\n date_column = 'date', \n close_column='close', \n periods = [20, 40],\n std_dev = 2, \n engine = \"pandas\"\n )\n)\n\ndf_bbands.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 16194 rows of 14 columns\nsymbol: object ['META', 'META', 'META', ' ...\ndate: datetime64[ns] [Timestamp('2013-01-02 00: ...\nopen: float64 [27.440000534057617, 27.87 ...\nhigh: float64 [28.18000030517578, 28.469 ...\nlow: float64 [27.420000076293945, 27.59 ...\nclose: float64 [28.0, 27.770000457763672, ...\nvolume: int64 [69846400, 63140600, 72715 ...\nadjusted: float64 [28.0, 27.770000457763672, ...\nclose_bband_middle_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_upper_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_lower_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_middle_40_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_upper_40_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_lower_40_2.0: float64 [nan, nan, nan, nan, nan, ...\n\n\n\n# BBANDS polars engine\ndf_bbands = (\n df\n .groupby('symbol')\n .augment_bbands(\n date_column = 'date', \n close_column='close', \n periods = [20, 40],\n std_dev = 2, \n engine = \"polars\"\n )\n)\n\ndf_bbands.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 16194 rows of 14 columns\nsymbol: object ['META', 'META', 'META', ' ...\ndate: datetime64[ns] [Timestamp('2013-01-02 00: ...\nopen: float64 [27.440000534057617, 27.87 ...\nhigh: float64 [28.18000030517578, 28.469 ...\nlow: float64 [27.420000076293945, 27.59 ...\nclose: float64 [28.0, 27.770000457763672, ...\nvolume: int64 [69846400, 63140600, 72715 ...\nadjusted: float64 [28.0, 27.770000457763672, ...\nclose_bband_middle_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_upper_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_lower_20_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_middle_40_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_upper_40_2.0: float64 [nan, nan, nan, nan, nan, ...\nclose_bband_lower_40_2.0: float64 [nan, nan, nan, nan, nan, ..." 
}, { - "objectID": "reference/index.html#finance-module-volatility-indicators", - "href": "reference/index.html#finance-module-volatility-indicators", - "title": "Function reference", + "objectID": "reference/apply_by_time.html", + "href": "reference/apply_by_time.html", + "title": "apply_by_time", "section": "", - "text": "Volatility indicators for financial time series data.\n\n\n\naugment_bbands\nThe augment_bbands function is used to calculate Bollinger Bands for a given dataset and return\n\n\naugment_atr\nThe augment_atr function is used to calculate Average True Range (ATR) and" + "text": "apply_by_time(data, date_column, freq='D', wide_format=False, fillna=0, reduce_memory=False, **named_funcs)\nApply for time series." }, { - "objectID": "reference/index.html#time-series-for-pandas-series", - "href": "reference/index.html#time-series-for-pandas-series", - "title": "Function reference", - "section": "", - "text": "Time series functions that generate / manipulate Pandas Series.\n\n\n\nmake_future_timeseries\nMake future dates for a time series.\n\n\nmake_weekday_sequence\nGenerate a sequence of weekday dates within a specified date range,\n\n\nmake_weekend_sequence\nGenerate a sequence of weekend dates within a specified date range,\n\n\nget_date_summary\nReturns a summary of the date-related information, including the number of\n\n\nget_frequency_summary\nMore robust version of pandas inferred frequency.\n\n\nget_diff_summary\nCalculates summary statistics of the time differences between consecutive values in a datetime index.\n\n\nget_frequency\nGet the frequency of a pandas Series or DatetimeIndex.\n\n\nget_seasonal_frequency\nThe get_seasonal_frequency function returns the seasonal period of a given\n\n\nget_trend_frequency\nThe get_trend_frequency function returns the trend period of a given time\n\n\nget_timeseries_signature\nConvert a timestamp to a set of 29 time series features.\n\n\nget_holiday_signature\nEngineers 4 different holiday features from a single datetime for 137 countries" + "objectID": "reference/apply_by_time.html#parameters", + "href": "reference/apply_by_time.html#parameters", + "title": "apply_by_time", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe data parameter can be either a pandas DataFrame or a pandas DataFrameGroupBy object. It represents the data on which the apply operation will be performed.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the DataFrame that contains the dates.\nrequired\n\n\nfreq\nstr\nThe freq parameter specifies the frequency at which the data should be resampled. It accepts a string representing a time frequency, such as “D” for daily, “W” for weekly, “M” for monthly, etc. The default value is “D”, which means the data will be resampled on a daily basis. Some common frequency aliases include: - S: secondly frequency - min: minute frequency - H: hourly frequency - D: daily frequency - W: weekly frequency - M: month end frequency - MS: month start frequency - Q: quarter end frequency - QS: quarter start frequency - Y: year end frequency - YS: year start frequency\n'D'\n\n\nwide_format\nbool\nThe wide_format parameter is a boolean flag that determines whether the output should be in wide format or not. 
If wide_format is set to True, the output will have a multi-index column structure, where the first level represents the original columns and the second level represents the group names.\nFalse\n\n\nfillna\nint\nThe fillna parameter is used to specify the value that will be used to fill missing values in the resulting DataFrame. By default, it is set to 0.\n0\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is True.\nFalse\n\n\n**named_funcs\n\nThe **named_funcs parameter is used to specify one or more custom aggregation functions to apply to the data. It accepts named functions in the format: python name = lambda df: df['column1'].corr(df['column2']]) Where name is the name of the function and df is the DataFrame that will be passed to the function. The function must return a single value.\n{}" }, { - "objectID": "reference/index.html#date-utilities", - "href": "reference/index.html#date-utilities", - "title": "Function reference", - "section": "", - "text": "Helper functions to make your life easier.\n\n\n\nfloor_date\nRobust date flooring.\n\n\nceil_date\nRobust date ceiling.\n\n\nis_holiday\nCheck if a given list of dates are holidays for a specified country.\n\n\nweek_of_month\nThe “week_of_month” function calculates the week number of a given date\n\n\ntimeseries_unit_frequency_table\nThe function timeseries_unit_frequency_table returns a pandas DataFrame\n\n\ntime_scale_template\nThe function time_scale_template returns a table with time scale" + "objectID": "reference/apply_by_time.html#returns", + "href": "reference/apply_by_time.html#returns", + "title": "apply_by_time", + "section": "Returns", + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe function apply_by_time returns a pandas DataFrame object." 
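A small sketch of the wide_format=True behaviour described above; this variant is not part of the apply_by_time examples on this page, so treat it as an illustration of the documented parameter. It reuses the bike_sales_sample columns and the named lambda style from those examples.

import pytimetk as tk

df = tk.load_dataset("bike_sales_sample", parse_dates=["order_date"])

wide = (
    df[["category_1", "order_date", "price", "quantity"]]
    .groupby("category_1")
    .apply_by_time(
        price_quantity_sum=lambda d: (d["price"] * d["quantity"]).sum(),
        date_column="order_date",
        freq="MS",
        wide_format=True,  # multi-index columns: first level = result columns, second level = group names
    )
)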
}, { - "objectID": "reference/index.html#visualization-utilities", - "href": "reference/index.html#visualization-utilities", - "title": "Function reference", - "section": "", - "text": "Helper functions to make your life easier.\n\n\n\ntheme_timetk\nReturns a plotnine theme with timetk styles applied, allowing for\n\n\npalette_timetk\nThe function palette_timetk returns a dictionary of color codes for" + "objectID": "reference/apply_by_time.html#examples", + "href": "reference/apply_by_time.html#examples", + "title": "apply_by_time", + "section": "Examples", + "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n \ndf = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])\n \ndf.glimpse()\n\n<class 'pandas.core.frame.DataFrame'>: 2466 rows of 13 columns\norder_id: int64 [1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, ...\norder_line: int64 [1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, ...\norder_date: datetime64[ns] [Timestamp('2011-01-07 00:00:00'), Ti ...\nquantity: int64 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, ...\nprice: int64 [6070, 5970, 2770, 5970, 10660, 3200, ...\ntotal_price: int64 [6070, 5970, 2770, 5970, 10660, 3200, ...\nmodel: object ['Jekyll Carbon 2', 'Trigger Carbon 2 ...\ncategory_1: object ['Mountain', 'Mountain', 'Mountain', ...\ncategory_2: object ['Over Mountain', 'Over Mountain', 'T ...\nframe_material: object ['Carbon', 'Carbon', 'Aluminum', 'Car ...\nbikeshop_name: object ['Ithaca Mountain Climbers', 'Ithaca ...\ncity: object ['Ithaca', 'Ithaca', 'Kansas City', ' ...\nstate: object ['NY', 'NY', 'KS', 'KS', 'KY', 'KY', ...\n\n\n\n# Apply by time with a DataFrame object\n# Allows access to multiple columns at once\n( \n df[['order_date', 'price', 'quantity']] \n .apply_by_time(\n \n # Named apply functions\n price_quantity_sum = lambda df: (df['price'] * df['quantity']).sum(),\n price_quantity_mean = lambda df: (df['price'] * df['quantity']).mean(),\n \n # Parameters\n date_column = 'order_date', \n freq = \"MS\",\n \n )\n)\n\n\n\n\n\n\n\n\norder_date\nprice_quantity_sum\nprice_quantity_mean\n\n\n\n\n0\n2011-01-01\n483015.0\n4600.142857\n\n\n1\n2011-02-01\n1162075.0\n4611.408730\n\n\n2\n2011-03-01\n659975.0\n5196.653543\n\n\n3\n2011-04-01\n1827140.0\n4533.846154\n\n\n4\n2011-05-01\n844170.0\n4097.912621\n\n\n5\n2011-06-01\n1413445.0\n4544.839228\n\n\n6\n2011-07-01\n1194430.0\n4976.791667\n\n\n7\n2011-08-01\n679790.0\n4961.970803\n\n\n8\n2011-09-01\n814720.0\n4682.298851\n\n\n9\n2011-10-01\n734920.0\n3930.053476\n\n\n10\n2011-11-01\n1006085.0\n4768.175355\n\n\n11\n2011-12-01\n473120.0\n4186.902655\n\n\n\n\n\n\n\n\n# Apply by time with a GroupBy object\n( \n df[['category_1', 'order_date', 'price', 'quantity']] \n .groupby('category_1')\n .apply_by_time(\n \n # Named functions\n price_quantity_sum = lambda df: (df['price'] * df['quantity']).sum(),\n price_quantity_mean = lambda df: (df['price'] * df['quantity']).mean(),\n \n # Parameters\n date_column = 'order_date', \n freq = \"MS\",\n \n 
)\n)\n\n\n\n\n\n\n\n\ncategory_1\norder_date\nprice_quantity_sum\nprice_quantity_mean\n\n\n\n\n0\nMountain\n2011-01-01\n221490.0\n4922.000000\n\n\n1\nMountain\n2011-02-01\n660555.0\n4374.536424\n\n\n2\nMountain\n2011-03-01\n358855.0\n5882.868852\n\n\n3\nMountain\n2011-04-01\n1075975.0\n4890.795455\n\n\n4\nMountain\n2011-05-01\n450440.0\n4549.898990\n\n\n5\nMountain\n2011-06-01\n723040.0\n5021.111111\n\n\n6\nMountain\n2011-07-01\n767740.0\n5444.964539\n\n\n7\nMountain\n2011-08-01\n361255.0\n5734.206349\n\n\n8\nMountain\n2011-09-01\n401125.0\n5077.531646\n\n\n9\nMountain\n2011-10-01\n377335.0\n4439.235294\n\n\n10\nMountain\n2011-11-01\n549345.0\n5282.163462\n\n\n11\nMountain\n2011-12-01\n276055.0\n5208.584906\n\n\n12\nRoad\n2011-01-01\n261525.0\n4358.750000\n\n\n13\nRoad\n2011-02-01\n501520.0\n4965.544554\n\n\n14\nRoad\n2011-03-01\n301120.0\n4562.424242\n\n\n15\nRoad\n2011-04-01\n751165.0\n4104.726776\n\n\n16\nRoad\n2011-05-01\n393730.0\n3679.719626\n\n\n17\nRoad\n2011-06-01\n690405.0\n4134.161677\n\n\n18\nRoad\n2011-07-01\n426690.0\n4310.000000\n\n\n19\nRoad\n2011-08-01\n318535.0\n4304.527027\n\n\n20\nRoad\n2011-09-01\n413595.0\n4353.631579\n\n\n21\nRoad\n2011-10-01\n357585.0\n3505.735294\n\n\n22\nRoad\n2011-11-01\n456740.0\n4268.598131\n\n\n23\nRoad\n2011-12-01\n197065.0\n3284.416667\n\n\n\n\n\n\n\n\n# Return complex objects\n( \n df[['order_date', 'price', 'quantity']] \n .apply_by_time(\n \n # Named apply functions\n complex_object = lambda df: [df],\n \n # Parameters\n date_column = 'order_date', \n freq = \"MS\",\n \n )\n)\n\n\n\n\n\n\n\n\norder_date\nprice\nquantity\n\n\n\n\n0\n2011-01-01\n[[6070, 5970, 2770, 5970, 10660, 3200, 12790, ...\n[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,...\n\n\n1\n2011-02-01\n[[8200, 7990, 3200, 4800, 3200, 2130, 1030, 37...\n[[1, 4, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 2,...\n\n\n2\n2011-03-01\n[[2660, 3200, 3200, 815, 8200, 9060, 815, 2130...\n[[1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1,...\n\n\n3\n2011-04-01\n[[5330, 4500, 585, 2660, 3200, 2770, 1030, 234...\n[[1, 1, 1, 3, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1, 7,...\n\n\n4\n2011-05-01\n[[1840, 3200, 7000, 5860, 1030, 3200, 3500, 15...\n[[1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...\n\n\n5\n2011-06-01\n[[7990, 4500, 1250, 3730, 1950, 2660, 2340, 19...\n[[1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 9,...\n\n\n6\n2011-07-01\n[[3200, 2880, 5330, 3200, 585, 5330, 4800, 111...\n[[2, 3, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1,...\n\n\n7\n2011-08-01\n[[12250, 2130, 7000, 2660, 5860, 3500, 1950, 1...\n[[2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1,...\n\n\n8\n2011-09-01\n[[4800, 480, 12790, 6390, 7990, 3500, 3730, 63...\n[[1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1,...\n\n\n9\n2011-10-01\n[[9060, 12250, 2880, 9060, 4480, 3200, 2340, 2...\n[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...\n\n\n10\n2011-11-01\n[[2240, 2660, 3200, 980, 2880, 1750, 2130, 224...\n[[1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 2, 1, 6,...\n\n\n11\n2011-12-01\n[[1030, 3200, 870, 1350, 4260, 7460, 2880, 270...\n[[1, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1,..." 
}, { - "objectID": "reference/index.html#extra-pandas-helpers-that-help-beyond-just-time-series", - "href": "reference/index.html#extra-pandas-helpers-that-help-beyond-just-time-series", - "title": "Function reference", + "objectID": "reference/plot_anomalies_decomp.html", + "href": "reference/plot_anomalies_decomp.html", + "title": "plot_anomalies_decomp", "section": "", - "text": "glimpse\nTakes a pandas DataFrame and prints a summary of its dimensions, column\n\n\nparallel_apply\nThe parallel_apply function parallelizes the application of a function on\n\n\nprogress_apply\nAdds a progress bar to pandas apply().\n\n\ndrop_zero_variance\nThe function drop_zero_variance takes a pandas DataFrame as input and returns a new DataFrame with\n\n\ntransform_columns\nThe function transform_columns applies a user-provided function to specified columns in a pandas DataFrame.\n\n\nflatten_multiindex_column_names\nTakes a DataFrame as input and flattens the column" + "text": "plot_anomalies_decomp(data, date_column, line_color='#2c3e50', line_size=None, line_type='solid', line_alpha=1.0, y_intercept=None, y_intercept_color='#2c3e50', x_intercept=None, x_intercept_color='#2c3e50', title='Anomaly Decomposition Plot', x_lab='', y_lab='', x_axis_date_labels='%b %Y', base_size=11, width=None, height=None, engine='plotly')\nThe plot_anomalies_decomp function takes in data from the anomalize() function, and returns a plot of the anomaly decomposition." }, { - "objectID": "reference/index.html#datasets", - "href": "reference/index.html#datasets", - "title": "Function reference", - "section": "", - "text": "Practice pytimetk with 13 complementary time series datasets.\n\n\n\nget_available_datasets\nGet a list of 12 datasets that can be loaded with pytimetk.load_dataset.\n\n\nload_dataset\nLoad one of 12 Time Series Datasets." + "objectID": "reference/plot_anomalies_decomp.html#parameters", + "href": "reference/plot_anomalies_decomp.html#parameters", + "title": "plot_anomalies_decomp", + "section": "Parameters", + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe input data for the plot from anomalize. It can be either a pandas DataFrame or a pandas DataFrameGroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data that contains the dates.\nrequired\n\n\nline_color\nstr\nThe color of the line in the plot. It is specified as a hexadecimal color code. The default value is “#2c3e50”.\n'#2c3e50'\n\n\nline_size\nOptional[float]\nThe line_size parameter determines the thickness of the lines in the plot. It is an optional parameter, so if you don’t specify a value, the default line size will be used.\nNone\n\n\nline_type\nstr\nThe line_type parameter specifies the type of line to be used in the plot. It can take the following values: - “solid” (default): a solid line - “dashed”: a dashed line\n'solid'\n\n\nline_alpha\nfloat\nThe line_alpha parameter controls the transparency of the lines in the plot. It accepts a float value between 0 and 1, where 0 means completely transparent and 1 means completely opaque.\n1.0\n\n\ny_intercept\nOptional[float]\nThe y_intercept parameter is an optional float value that specifies the y-coordinate of a horizontal line to be plotted on the graph. This line can be used to indicate a specific threshold or reference value. 
If not specified, no horizontal line will be plotted.\nNone\n\n\ny_intercept_color\nstr\nThe y_intercept_color parameter is used to specify the color of the y-intercept line on the plot. By default, it is set to \"#2c3e50\", which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\nx_intercept\nOptional[str]\nThe x_intercept parameter is used to specify the value on the x-axis where you want to draw a vertical line. This can be useful for highlighting a specific point or event in the data.\nNone\n\n\nx_intercept_color\nstr\nThe x_intercept_color parameter is used to specify the color of the vertical line representing the x-intercept on the plot. By default, it is set to “#2c3e50”, which is a dark blue color. You can change this parameter to any valid color code or name to change the color of the line.\n'#2c3e50'\n\n\ntitle\nstr\nThe title of the plot. It is set to “Anomaly Decomposition Plot” by default.\n'Anomaly Decomposition Plot'\n\n\nx_lab\nstr\nThe x_lab parameter is used to specify the label for the x-axis of the plot. It is a string that represents the label text.\n''\n\n\ny_lab\nstr\nThe y_lab parameter is used to specify the label for the y-axis of the plot. It is a string that represents the label text.\n''\n\n\nx_axis_date_labels\nstr\nThe x_axis_date_labels parameter is used to specify the format of the date labels on the x-axis of the plot. It accepts a string representing the format of the date labels. For example, “%b %Y” would display the month abbreviation and year (e.g., Jan 2019).\n'%b %Y'\n\n\nbase_size\nfloat\nThe base_size parameter determines the base font size for the plot. It is used to control the size of the text elements in the plot, such as axis labels, titles, and tick labels. The default value is 11, but you can adjust it to make the text larger or smaller\n11\n\n\nwidth\nOptional[int]\nThe width parameter determines the width of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with the default width.\nNone\n\n\nheight\nOptional[int]\nThe height parameter determines the height of the plot in pixels. It is an optional parameter, so if you don’t specify a value, the plot will be displayed with a default height.\nNone\n\n\nengine\nstr\nThe engine parameter specifies the plotting engine to use. It can be set to either “plotly”, “plotnine”, or “matplotlib”.\n'plotly'" }, { - "objectID": "reference/drop_zero_variance.html", - "href": "reference/drop_zero_variance.html", - "title": "drop_zero_variance", - "section": "", - "text": "drop_zero_variance(data)\nThe function drop_zero_variance takes a pandas DataFrame as input and returns a new DataFrame with columns that have zero variance removed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame\nThe data parameter is a pandas DataFrame or a pandas DataFrameGroupBy object. It represents the\nrequired\n\n\ndata\npd.DataFrame\n\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\n\na filtered DataFrame with columns that have non-zero variance." + "objectID": "reference/plot_anomalies_decomp.html#returns", + "href": "reference/plot_anomalies_decomp.html#returns", + "title": "plot_anomalies_decomp", + "section": "Returns", + "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\nA plotly, plotnine, or matplotlib plot." 
}, { - "objectID": "reference/drop_zero_variance.html#parameters", - "href": "reference/drop_zero_variance.html#parameters", - "title": "drop_zero_variance", - "section": "", - "text": "Name\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame\nThe data parameter is a pandas DataFrame or a pandas DataFrameGroupBy object. It represents the\nrequired\n\n\ndata\npd.DataFrame\n\nrequired" + "objectID": "reference/plot_anomalies_decomp.html#see-also", + "href": "reference/plot_anomalies_decomp.html#see-also", + "title": "plot_anomalies_decomp", + "section": "See Also", + "text": "See Also\n\nanomalize : Function that calculates the anomalies and formats the data for visualization.\nplot_anomalies : Function that plots the anomalies." }, { - "objectID": "reference/drop_zero_variance.html#returns", - "href": "reference/drop_zero_variance.html#returns", - "title": "drop_zero_variance", - "section": "", - "text": "Type\nDescription\n\n\n\n\n\na filtered DataFrame with columns that have non-zero variance." + "objectID": "reference/plot_anomalies_decomp.html#examples", + "href": "reference/plot_anomalies_decomp.html#examples", + "title": "plot_anomalies_decomp", + "section": "Examples", + "text": "Examples\n\n# EXAMPLE 1: SINGLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n# Create a date range\ndate_rng = pd.date_range(start='2021-01-01', end='2024-01-01', freq='MS')\n\n# Generate some random data with a few outliers\nnp.random.seed(42)\ndata = np.random.randn(len(date_rng)) * 10 + 25 \ndata[3] = 100 # outlier\n\n# Create a DataFrame\ndf = pd.DataFrame(date_rng, columns=['date'])\ndf['value'] = data\n\n# Anomalize the data\nanomalize_df = tk.anomalize(\n df, \"date\", \"value\",\n method = \"twitter\", \n iqr_alpha = 0.10, \n clean_alpha = 0.75,\n clean = \"min_max\",\n verbose = True,\n)\n\n# Visualize the results, plotly\nanomalize_df.plot_anomalies_decomp(\"date\", engine = 'plotly')\n\nUsing seasonal frequency of 12 observations\nUsing trend frequency of 37 observations\n\n\n\n \n\n\n\n# Visualize the results, plotnine\nanomalize_df.plot_anomalies_decomp(\"date\", engine = \"plotnine\")\n\n\n\n\n<Figure Size: (700 x 500)>\n\n\n\n# EXAMPLE 2: MULTIPLE TIME SERIES\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset(\"walmart_sales_weekly\", parse_dates=[\"Date\"])[[\"id\", \"Date\", \"Weekly_Sales\"]]\n\nanomalize_df = (\n df\n .groupby('id') \n .anomalize(\n \"Date\", \"Weekly_Sales\", \n period = 52, \n trend = 52, \n threads = 1\n ) \n)\n\n# Visualize the decomposition results, plotly\n(\n anomalize_df\n .groupby(\"id\")\n .plot_anomalies_decomp(\n date_column = \"Date\",\n line_color = \"steelblue\",\n width = 1200,\n height = 800,\n x_axis_date_labels = \"%y\",\n engine = 'plotly', \n )\n)\n\n\n\n\n\n \n\n\n\n# Visualize the decomposition results, plotnine\n\n(\n anomalize_df\n .groupby(\"id\")\n .plot_anomalies_decomp(\n date_column = \"Date\",\n line_color = \"steelblue\",\n width = 1200,\n height = 800,\n x_axis_date_labels = \"%y\",\n engine = 'plotnine', \n )\n)\n\n\n\n\n<Figure Size: (1200 x 800)>" }, { - "objectID": "reference/summarize_by_time.html", - "href": "reference/summarize_by_time.html", - "title": "summarize_by_time", + "objectID": "reference/parallel_apply.html", + "href": "reference/parallel_apply.html", + "title": "parallel_apply", "section": "", - "text": "summarize_by_time(data, date_column, value_column, freq='D', agg_func='sum', wide_format=False, fillna=0, engine='pandas')\nSummarize a DataFrame or GroupBy object by 
time.\nThe summarize_by_time function aggregates data by a specified time period and one or more numeric columns, allowing for grouping and customization of the time-based aggregation." + "text": "parallel_apply(data, func, show_progress=True, threads=None, desc='Processing...', **kwargs)\nThe parallel_apply function parallelizes the application of a function on grouped dataframes using concurrent.futures." }, { - "objectID": "reference/summarize_by_time.html#parameters", - "href": "reference/summarize_by_time.html#parameters", - "title": "summarize_by_time", + "objectID": "reference/parallel_apply.html#parameters", + "href": "reference/parallel_apply.html#parameters", + "title": "parallel_apply", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nA pandas DataFrame or a pandas GroupBy object. This is the data that you want to summarize by time.\nrequired\n\n\ndate_column\nstr\nThe name of the column in the data frame that contains the dates or timestamps to be aggregated by. This column must be of type datetime64.\nrequired\n\n\nvalue_column\nstr or list\nThe value_column parameter is the name of one or more columns in the DataFrame that you want to aggregate by. It can be either a string representing a single column name, or a list of strings representing multiple column names.\nrequired\n\n\nfreq\nstr\nThe freq parameter specifies the frequency at which the data should be aggregated. It accepts a string representing a pandas frequency offset, such as “D” for daily or “MS” for month start. The default value is “D”, which means the data will be aggregated on a daily basis. Some common frequency aliases include: - S: secondly frequency - min: minute frequency - H: hourly frequency - D: daily frequency - W: weekly frequency - M: month end frequency - MS: month start frequency - Q: quarter end frequency - QS: quarter start frequency - Y: year end frequency - YS: year start frequency\n'D'\n\n\nagg_func\nlist\nThe agg_func parameter is used to specify one or more aggregating functions to apply to the value column(s) during the summarization process. It can be a single function or a list of functions. The default value is \"sum\", which represents the sum function. Some common aggregating functions include: - “sum”: Sum of values - “mean”: Mean of values - “median”: Median of values - “min”: Minimum of values - “max”: Maximum of values - “std”: Standard deviation of values - “var”: Variance of values - “first”: First value in group - “last”: Last value in group - “count”: Count of values - “nunique”: Number of unique values - “corr”: Correlation between values Pandas Engine Only: Custom lambda aggregating functions can be used too. Here are several common examples: - (“q25”, lambda x: x.quantile(0.25)): 25th percentile of values - (“q75”, lambda x: x.quantile(0.75)): 75th percentile of values - (“iqr”, lambda x: x.quantile(0.75) - x.quantile(0.25)): Interquartile range of values - (“range”, lambda x: x.max() - x.min()): Range of values\n'sum'\n\n\nwide_format\nbool\nA boolean parameter that determines whether the output should be in “wide” or “long” format. If set to True, the output will be in wide format, where each group is represented by a separate column. If set to False, the output will be in long format, where each group is represented by a separate row. 
The default value is False.\nFalse\n\n\nfillna\nint\nThe fillna parameter is used to specify the value to fill missing data with. By default, it is set to 0. If you want to keep missing values as NaN, you can use np.nan as the value for fillna.\n0\n\n\nengine\nstr\nThe engine parameter is used to specify the engine to use for summarizing the data. It can be either “pandas” or “polars”. - The default value is “pandas”. - When “polars”, the function will internally use the polars library for summarizing the data. This can be faster than using “pandas” for large datasets.\n'pandas'" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.core.groupby.generic.DataFrameGroupBy\nThe data parameter is a Pandas DataFrameGroupBy object, which is the result of grouping a DataFrame by one or more columns. It represents the grouped data that you want to apply the function to.\nrequired\n\n\nfunc\nCallable\nThe func parameter is the function that you want to apply to each group in the grouped dataframe. This function should take a single argument, which is a dataframe representing a group, and return a result. The result can be a scalar value, a pandas Series, or a pandas DataFrame.\nrequired\n\n\nshow_progress\nbool\nA boolean parameter that determines whether to display progress using tqdm. If set to True, progress will be displayed. If set to False, progress will not be displayed.\nTrue\n\n\nthreads\nint\nThe threads parameter specifies the number of threads to use for parallel processing. If threads is set to None, it will use all available processors. If threads is set to -1, it will use all available processors as well.\nNone\n\n\n**kwargs\n\nThe **kwargs parameter is a dictionary of keyword arguments that are passed to the func function.\n{}" }, { - "objectID": "reference/summarize_by_time.html#returns", - "href": "reference/summarize_by_time.html#returns", - "title": "summarize_by_time", + "objectID": "reference/parallel_apply.html#returns", + "href": "reference/parallel_apply.html#returns", + "title": "parallel_apply", "section": "Returns", - "text": "Returns\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nA Pandas DataFrame that is summarized by time." + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe parallel_apply function returns a combined result after applying the specified function on all groups in the grouped dataframe. The result can be a pandas DataFrame or a pandas Series, depending on the function applied." 
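The agg_func description above lists custom tuple/lambda aggregations such as ("iqr", lambda x: x.quantile(0.75) - x.quantile(0.25)) for the pandas engine. A minimal sketch applying just that aggregation (an illustration of the documented option, not one of the worked examples on this page):

import pytimetk as tk

df = tk.load_dataset("bike_sales_sample", parse_dates=["order_date"])

iqr_by_month = df.summarize_by_time(
    date_column="order_date",
    value_column="total_price",
    freq="MS",
    agg_func=[("iqr", lambda x: x.quantile(0.75) - x.quantile(0.25))],
    engine="pandas",  # custom lambda aggregations are pandas-engine only
)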
}, { - "objectID": "reference/summarize_by_time.html#examples", - "href": "reference/summarize_by_time.html#examples", - "title": "summarize_by_time", - "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = tk.load_dataset('bike_sales_sample', parse_dates = ['order_date'])\n\ndf\n\n\n\n\n\n\n\n\norder_id\norder_line\norder_date\nquantity\nprice\ntotal_price\nmodel\ncategory_1\ncategory_2\nframe_material\nbikeshop_name\ncity\nstate\n\n\n\n\n0\n1\n1\n2011-01-07\n1\n6070\n6070\nJekyll Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n1\n1\n2\n2011-01-07\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nIthaca Mountain Climbers\nIthaca\nNY\n\n\n2\n2\n1\n2011-01-10\n1\n2770\n2770\nBeast of the East 1\nMountain\nTrail\nAluminum\nKansas City 29ers\nKansas City\nKS\n\n\n3\n2\n2\n2011-01-10\n1\n5970\n5970\nTrigger Carbon 2\nMountain\nOver Mountain\nCarbon\nKansas City 29ers\nKansas City\nKS\n\n\n4\n3\n1\n2011-01-10\n1\n10660\n10660\nSupersix Evo Hi-Mod Team\nRoad\nElite Road\nCarbon\nLouisville Race Equipment\nLouisville\nKY\n\n\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n...\n\n\n2461\n321\n3\n2011-12-22\n1\n1410\n1410\nCAAD8 105\nRoad\nElite Road\nAluminum\nMiami Race Equipment\nMiami\nFL\n\n\n2462\n322\n1\n2011-12-28\n1\n1250\n1250\nSynapse Disc Tiagra\nRoad\nEndurance Road\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2463\n322\n2\n2011-12-28\n1\n2660\n2660\nBad Habit 2\nMountain\nTrail\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2464\n322\n3\n2011-12-28\n1\n2340\n2340\nF-Si 1\nMountain\nCross Country Race\nAluminum\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n2465\n322\n4\n2011-12-28\n1\n5860\n5860\nSynapse Hi-Mod Dura Ace\nRoad\nEndurance Road\nCarbon\nPhoenix Bi-peds\nPhoenix\nAZ\n\n\n\n\n2466 rows × 13 columns\n\n\n\n\n# Example 1 - Summarize by time with a DataFrame object, pandas engine\n( \n df \n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price',\n freq = \"MS\",\n agg_func = ['mean', 'sum'],\n engine = 'pandas'\n )\n)\n\n\n\n\n\n\n\n\norder_date\ntotal_price_mean\ntotal_price_sum\n\n\n\n\n0\n2011-01-01\n4600.142857\n483015\n\n\n1\n2011-02-01\n4611.408730\n1162075\n\n\n2\n2011-03-01\n5196.653543\n659975\n\n\n3\n2011-04-01\n4533.846154\n1827140\n\n\n4\n2011-05-01\n4097.912621\n844170\n\n\n5\n2011-06-01\n4544.839228\n1413445\n\n\n6\n2011-07-01\n4976.791667\n1194430\n\n\n7\n2011-08-01\n4961.970803\n679790\n\n\n8\n2011-09-01\n4682.298851\n814720\n\n\n9\n2011-10-01\n3930.053476\n734920\n\n\n10\n2011-11-01\n4768.175355\n1006085\n\n\n11\n2011-12-01\n4186.902655\n473120\n\n\n\n\n\n\n\n\n# Example 2 - Summarize by time with a GroupBy object (Wide Format), polars engine\n(\n df \n .groupby(['category_1', 'frame_material']) \n .summarize_by_time(\n date_column = 'order_date', \n value_column = ['total_price', 'quantity'], \n freq = 'MS',\n agg_func = 'sum',\n wide_format = True, \n engine = 'polars'\n 
)\n)\n\n\n\n\n\n\n\n\norder_date\ntotal_price_sum_Mountain_Aluminum\ntotal_price_sum_Mountain_Carbon\ntotal_price_sum_Road_Aluminum\ntotal_price_sum_Road_Carbon\nquantity_sum_Mountain_Aluminum\nquantity_sum_Mountain_Carbon\nquantity_sum_Road_Aluminum\nquantity_sum_Road_Carbon\n\n\n\n\n0\n2011-01-01\n66290\n155200\n61005\n200520\n34\n23\n30\n41\n\n\n1\n2011-02-01\n245115\n415440\n100480\n401040\n118\n68\n52\n93\n\n\n2\n2011-03-01\n82025\n276830\n63390\n237730\n41\n46\n33\n54\n\n\n3\n2011-04-01\n340725\n735250\n197705\n553460\n164\n130\n104\n144\n\n\n4\n2011-05-01\n160130\n290310\n127600\n266130\n93\n53\n75\n81\n\n\n5\n2011-06-01\n183680\n539360\n174655\n515750\n96\n91\n82\n142\n\n\n6\n2011-07-01\n186030\n581710\n98090\n328600\n94\n91\n53\n82\n\n\n7\n2011-08-01\n119785\n241470\n65855\n252680\n53\n34\n36\n69\n\n\n8\n2011-09-01\n100455\n300670\n78485\n335110\n59\n47\n36\n77\n\n\n9\n2011-10-01\n105035\n272300\n83105\n274480\n61\n43\n42\n71\n\n\n10\n2011-11-01\n102045\n447300\n90050\n366690\n55\n79\n51\n95\n\n\n11\n2011-12-01\n111125\n164930\n45555\n151510\n55\n27\n27\n43\n\n\n\n\n\n\n\n\n# Example 3 - Summarize by time with a GroupBy object (Wide Format)\n(\n df \n .groupby('category_1') \n .summarize_by_time(\n date_column = 'order_date', \n value_column = 'total_price', \n freq = 'MS',\n agg_func = 'sum',\n wide_format = True,\n engine = 'pandas' \n )\n)\n\n\n\n\n\n\n\n\norder_date\ntotal_price_Mountain\ntotal_price_Road\n\n\n\n\n0\n2011-01-01\n221490\n261525\n\n\n1\n2011-02-01\n660555\n501520\n\n\n2\n2011-03-01\n358855\n301120\n\n\n3\n2011-04-01\n1075975\n751165\n\n\n4\n2011-05-01\n450440\n393730\n\n\n5\n2011-06-01\n723040\n690405\n\n\n6\n2011-07-01\n767740\n426690\n\n\n7\n2011-08-01\n361255\n318535\n\n\n8\n2011-09-01\n401125\n413595\n\n\n9\n2011-10-01\n377335\n357585\n\n\n10\n2011-11-01\n549345\n456740\n\n\n11\n2011-12-01\n276055\n197065\n\n\n\n\n\n\n\n\n# Example 4 - Summarize by time with a GroupBy object and multiple value columns and summaries (Wide Format)\n# Note - This example only works with the pandas engine\n(\n df \n .groupby('category_1') \n .summarize_by_time(\n date_column = 'order_date', \n value_column = ['total_price', 'quantity'], \n freq = 'MS',\n agg_func = [\n 'sum', \n 'mean', \n ('q25', lambda x: x.quantile(0.25)), \n ('q75', lambda x: x.quantile(0.75))\n ],\n wide_format = False,\n engine = 'pandas' \n 
)\n)\n\n\n\n\n\n\n\n\ncategory_1\norder_date\ntotal_price_sum\ntotal_price_mean\ntotal_price_q25\ntotal_price_q75\nquantity_sum\nquantity_mean\nquantity_q25\nquantity_q75\n\n\n\n\n0\nMountain\n2011-01-01\n221490\n4922.000000\n2060.0\n6070.0\n57\n1.266667\n1.0\n1.0\n\n\n1\nMountain\n2011-02-01\n660555\n4374.536424\n2060.0\n5330.0\n186\n1.231788\n1.0\n1.0\n\n\n2\nMountain\n2011-03-01\n358855\n5882.868852\n2130.0\n6390.0\n87\n1.426230\n1.0\n1.0\n\n\n3\nMountain\n2011-04-01\n1075975\n4890.795455\n2060.0\n5970.0\n294\n1.336364\n1.0\n1.0\n\n\n4\nMountain\n2011-05-01\n450440\n4549.898990\n2010.0\n6020.0\n146\n1.474747\n1.0\n1.0\n\n\n5\nMountain\n2011-06-01\n723040\n5021.111111\n1950.0\n5647.5\n187\n1.298611\n1.0\n1.0\n\n\n6\nMountain\n2011-07-01\n767740\n5444.964539\n2130.0\n6400.0\n185\n1.312057\n1.0\n1.0\n\n\n7\nMountain\n2011-08-01\n361255\n5734.206349\n2235.0\n6400.0\n87\n1.380952\n1.0\n2.0\n\n\n8\nMountain\n2011-09-01\n401125\n5077.531646\n1620.0\n6390.0\n106\n1.341772\n1.0\n1.0\n\n\n9\nMountain\n2011-10-01\n377335\n4439.235294\n2160.0\n6070.0\n104\n1.223529\n1.0\n1.0\n\n\n10\nMountain\n2011-11-01\n549345\n5282.163462\n2340.0\n7460.0\n134\n1.288462\n1.0\n1.0\n\n\n11\nMountain\n2011-12-01\n276055\n5208.584906\n2060.0\n6400.0\n82\n1.547170\n1.0\n1.0\n\n\n12\nRoad\n2011-01-01\n261525\n4358.750000\n1950.0\n5605.0\n71\n1.183333\n1.0\n1.0\n\n\n13\nRoad\n2011-02-01\n501520\n4965.544554\n1950.0\n5860.0\n145\n1.435644\n1.0\n1.0\n\n\n14\nRoad\n2011-03-01\n301120\n4562.424242\n2240.0\n5875.0\n87\n1.318182\n1.0\n1.0\n\n\n15\nRoad\n2011-04-01\n751165\n4104.726776\n1950.0\n4800.0\n248\n1.355191\n1.0\n1.0\n\n\n16\nRoad\n2011-05-01\n393730\n3679.719626\n1570.0\n3500.0\n156\n1.457944\n1.0\n1.0\n\n\n17\nRoad\n2011-06-01\n690405\n4134.161677\n1840.0\n4500.0\n224\n1.341317\n1.0\n1.0\n\n\n18\nRoad\n2011-07-01\n426690\n4310.000000\n1895.0\n5330.0\n135\n1.363636\n1.0\n1.0\n\n\n19\nRoad\n2011-08-01\n318535\n4304.527027\n1950.0\n4987.5\n105\n1.418919\n1.0\n1.0\n\n\n20\nRoad\n2011-09-01\n413595\n4353.631579\n1950.0\n5330.0\n113\n1.189474\n1.0\n1.0\n\n\n21\nRoad\n2011-10-01\n357585\n3505.735294\n1750.0\n4260.0\n113\n1.107843\n1.0\n1.0\n\n\n22\nRoad\n2011-11-01\n456740\n4268.598131\n1950.0\n4370.0\n146\n1.364486\n1.0\n1.0\n\n\n23\nRoad\n2011-12-01\n197065\n3284.416667\n1652.5\n3200.0\n70\n1.166667\n1.0\n1.0" + "objectID": "reference/parallel_apply.html#examples", + "href": "reference/parallel_apply.html#examples", + "title": "parallel_apply", + "section": "Examples:", + "text": "Examples:\n\n# Example 1 - Single argument returns Series\n\nimport pytimetk as tk\nimport pandas as pd \n\ndf = pd.DataFrame({\n 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar'],\n 'B': [1, 2, 3, 4, 5, 6]\n})\n\ngrouped = df.groupby('A')\n\nresult = grouped.apply(lambda df: df['B'].sum())\nresult\n\nresult = tk.parallel_apply(grouped, lambda df: df['B'].sum(), show_progress=True, threads=2)\nresult\n\n\n\n\nA\nbar 12\nfoo 9\ndtype: int64\n\n\n\n# Example 2 - Multiple arguments returns MultiIndex DataFrame\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = pd.DataFrame({\n 'A': ['foo', 'foo', 'bar', 'bar', 'foo', 'bar', 'foo', 'foo'],\n 'B': ['one', 'one', 'one', 'two', 'two', 'two', 'one', 'two'],\n 'C': [1, 3, 5, 7, 9, 2, 4, 6]\n})\n\ndef calculate(group):\n return pd.DataFrame({\n 'sum': [group['C'].sum()],\n 'mean': [group['C'].mean()]\n })\n\ngrouped = df.groupby(['A', 'B'])\n\nresult = grouped.apply(calculate)\nresult\n\nresult = tk.parallel_apply(grouped, calculate, 
show_progress=True)\nresult\n\n\n\n\n\n\n\n\n\n\n\n\n\nsum\nmean\n\n\nA\nB\n\n\n\n\n\n\n\nbar\none\n0\n5\n5.000000\n\n\ntwo\n0\n9\n4.500000\n\n\nfoo\none\n0\n8\n2.666667\n\n\ntwo\n0\n15\n7.500000\n\n\n\n\n\n\n\n\n# Example 3 - Multiple arguments returns MultiIndex DataFrame\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = pd.DataFrame({\n 'A': ['foo', 'foo', 'bar', 'bar', 'foo', 'bar', 'foo', 'foo'],\n 'B': ['one', 'one', 'one', 'two', 'two', 'two', 'one', 'two'],\n 'C': [1, 3, 5, 7, 9, 2, 4, 6]\n})\n\ndef calculate(group):\n return group.head(2)\n\ngrouped = df.groupby(['A', 'B'])\n\nresult = grouped.apply(calculate)\nresult\n\nresult = tk.parallel_apply(grouped, calculate, show_progress=True)\nresult\n\n\n\n\n\n\n\n\n\n\n\n\n\nA\nB\nC\n\n\nA\nB\n\n\n\n\n\n\n\n\nbar\none\n2\nbar\none\n5\n\n\ntwo\n3\nbar\ntwo\n7\n\n\n5\nbar\ntwo\n2\n\n\nfoo\none\n0\nfoo\none\n1\n\n\n1\nfoo\none\n3\n\n\ntwo\n4\nfoo\ntwo\n9\n\n\n7\nfoo\ntwo\n6\n\n\n\n\n\n\n\n\n# Example 4 - Single Grouping Column Returns DataFrame\n\nimport pytimetk as tk\nimport pandas as pd\n\ndf = pd.DataFrame({\n 'A': ['foo', 'foo', 'bar', 'bar', 'foo', 'bar', 'foo', 'foo'],\n 'B': [1, 3, 5, 7, 9, 2, 4, 6]\n})\n\ndef calculate(group):\n return pd.DataFrame({\n 'sum': [group['B'].sum()],\n 'mean': [group['B'].mean()]\n })\n\ngrouped = df.groupby(['A'])\n\nresult = grouped.apply(calculate)\nresult\n\nresult = tk.parallel_apply(grouped, calculate, show_progress=True)\nresult\n\n\n\n\n\n\n\n\n\n\n\n\nsum\nmean\n\n\nA\n\n\n\n\n\n\n\nbar\n0\n14\n4.666667\n\n\nfoo\n0\n23\n4.600000" }, { - "objectID": "reference/filter_by_time.html", - "href": "reference/filter_by_time.html", - "title": "filter_by_time", + "objectID": "reference/augment_expanding_apply.html", + "href": "reference/augment_expanding_apply.html", + "title": "augment_expanding_apply", "section": "", - "text": "filter_by_time(data, date_column, start_date='start', end_date='end', engine='pandas')\nFilters a DataFrame or GroupBy object based on a specified date range.\nThis function filters data in a pandas DataFrame or a pandas GroupBy object by a given date range. It supports various date formats and can handle both DataFrame and GroupBy objects." + "text": "augment_expanding_apply(data, date_column, window_func, min_periods=None, threads=1, show_progress=True, reduce_memory=False)\nApply one or more DataFrame-based expanding functions to one or more columns of a DataFrame." }, { - "objectID": "reference/filter_by_time.html#parameters", - "href": "reference/filter_by_time.html#parameters", - "title": "filter_by_time", + "objectID": "reference/augment_expanding_apply.html#parameters", + "href": "reference/augment_expanding_apply.html#parameters", + "title": "augment_expanding_apply", "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\npd.DataFrame or pd.core.groupby.generic.DataFrameGroupBy\nThe data to be filtered. It can be a pandas DataFrame or a pandas GroupBy object.\nrequired\n\n\ndate_column\nstr\nThe name of the column in data that contains date information. This column is used for filtering the data based on the date range.\nrequired\n\n\nstart_date\nstr\nThe start date of the filtering range. The format of the date can be YYYY, YYYY-MM, YYYY-MM-DD, YYYY-MM-DD HH, YYYY-MM-DD HH:SS, or YYYY-MM-DD HH:MM:SS. Default: ‘start’, which will filter from the earliest date in the data.\n'start'\n\n\nend_date\nstr\nThe end date of the filtering range. It supports the same formats as start_date. 
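The **kwargs parameter is documented above as being forwarded to func. A hedged sketch of that pass-through; the scaled_sum helper and its factor argument are illustrative and not part of the page's examples.

import pandas as pd
import pytimetk as tk

df = pd.DataFrame({
    "A": ["foo", "bar", "foo", "bar"],
    "B": [1, 2, 3, 4],
})

def scaled_sum(group, factor=1):
    # `factor` arrives through the **kwargs of parallel_apply
    return group["B"].sum() * factor

result = tk.parallel_apply(df.groupby("A"), scaled_sum, threads=2, factor=10)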
Default: ‘end’, which will filter until the latest date in the data.\n'end'\n\n\nengine\nstr\nThe engine to be used for filtering the data. Currently, only ‘pandas’.\n= 'pandas'" + "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nInput data to be processed. Can be a Pandas DataFrame or a GroupBy object.\nrequired\n\n\ndate_column\nstr\nName of the datetime column. Data is sorted by this column within each group.\nrequired\n\n\nwindow_func\nUnion[Tuple[str, Callable], List[Tuple[str, Callable]]]\nThe window_func parameter in the augment_expanding_apply function specifies the function(s) that operate on a expanding window with the consideration of multiple columns. The specification can be: - A tuple where the first element is a string representing the function’s name and the second element is the callable function itself. - A list of such tuples for multiple functions. Note: For functions targeting only a single value column without the need for contextual data from other columns, consider using the augment_expanding function in this library.\nrequired\n\n\nmin_periods\nint\nMinimum observations in the window to have a value. Defaults to the window size. If set, a value will be produced even if fewer observations are present than the window size.\nNone\n\n\nthreads\nint\nNumber of threads to use for parallel processing. If threads is set to 1, parallel processing will be disabled. Set to -1 to use all available CPU cores.\n1\n\n\nshow_progress\nbool\nIf True, a progress bar will be displayed during parallel processing.\nTrue\n\n\nreduce_memory\nbool\nThe reduce_memory parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is True.\nFalse" }, { - "objectID": "reference/filter_by_time.html#returns", - "href": "reference/filter_by_time.html#returns", - "title": "filter_by_time", + "objectID": "reference/augment_expanding_apply.html#returns", + "href": "reference/augment_expanding_apply.html#returns", + "title": "augment_expanding_apply", "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nA pandas DataFrame containing the filtered data within the specified date range." + "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npd.DataFrame\nThe augment_expanding function returns a DataFrame with new columns for each applied function, window size, and value column." }, { - "objectID": "reference/filter_by_time.html#raises", - "href": "reference/filter_by_time.html#raises", - "title": "filter_by_time", - "section": "Raises", - "text": "Raises\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf the provided date strings do not match any of the supported formats." 
+ "objectID": "reference/augment_expanding_apply.html#examples", + "href": "reference/augment_expanding_apply.html#examples", + "title": "augment_expanding_apply", + "section": "Examples", + "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\nimport numpy as np\n\n\n# Example showcasing the expanding correlation between two columns (`value1` and \n# `value2`).\n# The correlation requires both columns as input.\n \n# Sample DataFrame with id, date, value1, and value2 columns.\ndf = pd.DataFrame({\n 'id': [1, 1, 1, 2, 2, 2],\n 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),\n 'value1': [10, 20, 29, 42, 53, 59],\n 'value2': [2, 16, 20, 40, 41, 50],\n})\n \n# Compute the expanding correlation for each group of 'id'\nexpanding_df = (\n df.groupby('id')\n .augment_expanding_apply(\n date_column='date',\n window_func=[('corr', lambda x: x['value1'].corr(x['value2']))], # Lambda function for correlation\n threads = 1, # Disable parallel processing\n )\n)\ndisplay(expanding_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue1\nvalue2\nexpanding_corr\n\n\n\n\n0\n1\n2023-01-01\n10\n2\nNaN\n\n\n1\n1\n2023-01-02\n20\n16\n1.000000\n\n\n2\n1\n2023-01-03\n29\n20\n0.961054\n\n\n3\n2\n2023-01-04\n42\n40\nNaN\n\n\n4\n2\n2023-01-05\n53\n41\n1.000000\n\n\n5\n2\n2023-01-06\n59\n50\n0.824831\n\n\n\n\n\n\n\n\n# expanding Regression Example: Using `value1` as the dependent variable and \n# `value2` and `value3` as the independent variables.\n# This example demonstrates how to perform a expanding regression using two \n# independent variables.\n\n# Sample DataFrame with `id`, `date`, `value1`, `value2`, and `value3` columns.\ndf = pd.DataFrame({\n 'id': [1, 1, 1, 2, 2, 2],\n 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),\n 'value1': [10, 20, 29, 42, 53, 59],\n 'value2': [5, 16, 24, 35, 45, 58],\n 'value3': [2, 3, 6, 9, 10, 13]\n})\n \n# Define Regression Function to be applied on the expanding window.\ndef regression(df):\n \n # Required module (scikit-learn) for regression.\n from sklearn.linear_model import LinearRegression\n \n model = LinearRegression()\n X = df[['value2', 'value3']] # Independent variables\n y = df['value1'] # Dependent variable\n model.fit(X, y)\n ret = pd.Series([model.intercept_, model.coef_[0]], index=['Intercept', 'Slope'])\n \n return ret # Return intercept and slope as a Series\n \n# Compute the expanding regression for each group of `id`\nresult_df = (\n df.groupby('id')\n .augment_expanding_apply(\n date_column='date',\n window_func=[('regression', regression)],\n threads = 1\n )\n .dropna()\n)\n\n# Format the results to have each regression output (slope and intercept) in \n# separate columns.\nregression_wide_df = pd.concat(result_df['expanding_regression'].to_list(), axis=1).T\nregression_wide_df = pd.concat([result_df.reset_index(drop = True), regression_wide_df], axis=1)\ndisplay(regression_wide_df)\n\n\n\n\n\n\n\n\n\n\n\nid\ndate\nvalue1\nvalue2\nvalue3\nexpanding_regression\nIntercept\nSlope\n\n\n\n\n0\n1\n2023-01-01\n10\n5\n2\nIntercept 10.0 Slope 0.0 dtype: flo...\n10.000000\n0.000000\n\n\n1\n1\n2023-01-02\n20\n16\n3\nIntercept 5.327869 Slope 0.901639 dt...\n5.327869\n0.901639\n\n\n2\n1\n2023-01-03\n29\n24\n6\nIntercept 4.28 Slope 0.84 dtype: flo...\n4.280000\n0.840000\n\n\n3\n2\n2023-01-04\n42\n35\n9\nIntercept 42.0 Slope 0.0 dtype: flo...\n42.000000\n0.000000\n\n\n4\n2\n2023-01-05\n53\n45\n10\nIntercept 2.900990 Slope 1.089109 
dt...\n2.900990\n1.089109\n\n\n5\n2\n2023-01-06\n59\n58\n13\nIntercept 30.352941 Slope 1.588235 ...\n30.352941\n1.588235" }, { - "objectID": "reference/filter_by_time.html#notes", - "href": "reference/filter_by_time.html#notes", - "title": "filter_by_time", - "section": "Notes", - "text": "Notes\n\nThe function uses pd.to_datetime to convert the start date (e.g. start_date = “2014” becomes “2014-01-01”).\nThe function internally uses the parse_end_date function to convert the end dates (e.g. end_date = “2014” becomes “2014-12-31”)." + "objectID": "reference/TimeSeriesCVSplitter.html", + "href": "reference/TimeSeriesCVSplitter.html", + "title": "TimeSeriesCVSplitter", + "section": "", + "text": "TimeSeriesCVSplitter(self, *, frequency, train_size, forecast_horizon, time_series, gap=0, stride=None, window='rolling', mode='backward', start_dt=None, end_dt=None, split_limit=None)\nThe TimeSeriesCVSplitter is a scikit-learn compatible cross-validator using TimeSeriesCV.\nThis cross-validator generates splits based on time values, making it suitable for time series data." }, { - "objectID": "reference/filter_by_time.html#examples", - "href": "reference/filter_by_time.html#examples", - "title": "filter_by_time", - "section": "Examples", - "text": "Examples\n\nimport pytimetk as tk\nimport pandas as pd\nimport datetime\n\nm4_daily_df = tk.datasets.load_dataset('m4_daily', parse_dates = ['date'])\n\n\n# Example 1 - Filter by date\n\ndf_filtered = tk.filter_by_time(\n data = m4_daily_df,\n date_column = 'date',\n start_date = '2014-07-03',\n end_date = '2014-07-10'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n\n\n1\nD10\n2014-07-04\n2073.4\n\n\n2\nD10\n2014-07-05\n2048.7\n\n\n3\nD10\n2014-07-06\n2048.9\n\n\n4\nD10\n2014-07-07\n2006.4\n\n\n5\nD10\n2014-07-08\n2017.6\n\n\n6\nD10\n2014-07-09\n2019.1\n\n\n7\nD10\n2014-07-10\n2007.4\n\n\n\n\n\n\n\n\n# Example 2 - Filter by month.\n# Note: This will filter by the first day of the month.\n\ndf_filtered = tk.filter_by_time(\n data = m4_daily_df,\n date_column = 'date',\n start_date = '2014-07',\n end_date = '2014-09'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n\n\n1\nD10\n2014-07-04\n2073.4\n\n\n2\nD10\n2014-07-05\n2048.7\n\n\n3\nD10\n2014-07-06\n2048.9\n\n\n4\nD10\n2014-07-07\n2006.4\n\n\n...\n...\n...\n...\n\n\n85\nD10\n2014-09-26\n1987.9\n\n\n86\nD10\n2014-09-27\n1999.8\n\n\n87\nD10\n2014-09-28\n2000.2\n\n\n88\nD10\n2014-09-29\n1996.4\n\n\n89\nD10\n2014-09-30\n2023.5\n\n\n\n\n90 rows × 3 columns\n\n\n\n\n# Example 3 - Filter by year.\n# Note: This will filter by the first day of the year.\n\ndf_filtered = tk.filter_by_time(\n data = m4_daily_df,\n date_column = 'date',\n start_date = '2014',\n end_date = '2014'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nD10\n2014-07-03\n2076.2\n\n\n1\nD10\n2014-07-04\n2073.4\n\n\n2\nD10\n2014-07-05\n2048.7\n\n\n3\nD10\n2014-07-06\n2048.9\n\n\n4\nD10\n2014-07-07\n2006.4\n\n\n...\n...\n...\n...\n\n\n177\nD10\n2014-12-27\n2270.1\n\n\n178\nD10\n2014-12-28\n2322.0\n\n\n179\nD10\n2014-12-29\n2327.3\n\n\n180\nD10\n2014-12-30\n2344.9\n\n\n181\nD10\n2014-12-31\n2327.8\n\n\n\n\n182 rows × 3 columns\n\n\n\n\n# Example 4 - Filter by day/hour/minute/second\n# Here we'll use an hourly dataset, however this will also work for minute/second data\n\n# Load data and format date column appropriately\nm4_hourly_df = tk.datasets.load_dataset('m4_hourly', parse_dates = ['date'])\n\ndf_filtered = tk.filter_by_time(\n data = 
m4_hourly_df,\n date_column = \"date\",\n start_date = '2015-07-01 12:00:00',\n end_date = '2015-07-01 20:00:00'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nH10\n2015-07-01 12:00:00+00:00\n513\n\n\n1\nH10\n2015-07-01 13:00:00+00:00\n512\n\n\n2\nH10\n2015-07-01 14:00:00+00:00\n506\n\n\n3\nH10\n2015-07-01 15:00:00+00:00\n500\n\n\n4\nH10\n2015-07-01 16:00:00+00:00\n490\n\n\n5\nH10\n2015-07-01 17:00:00+00:00\n484\n\n\n6\nH10\n2015-07-01 18:00:00+00:00\n467\n\n\n7\nH10\n2015-07-01 19:00:00+00:00\n446\n\n\n8\nH10\n2015-07-01 20:00:00+00:00\n434\n\n\n700\nH50\n2015-07-01 12:00:00+00:00\n39325\n\n\n701\nH50\n2015-07-01 13:00:00+00:00\n38153\n\n\n702\nH50\n2015-07-01 14:00:00+00:00\n36829\n\n\n703\nH50\n2015-07-01 15:00:00+00:00\n35878\n\n\n704\nH50\n2015-07-01 16:00:00+00:00\n33626\n\n\n705\nH50\n2015-07-01 17:00:00+00:00\n31014\n\n\n706\nH50\n2015-07-01 18:00:00+00:00\n28891\n\n\n707\nH50\n2015-07-01 19:00:00+00:00\n27413\n\n\n708\nH50\n2015-07-01 20:00:00+00:00\n26291\n\n\n\n\n\n\n\n\n# Example 5 - Combine year/month/day/hour/minute/second filters\ndf_filtered = tk.filter_by_time(\n data = m4_hourly_df,\n date_column = \"date\",\n start_date = '2015-07-01',\n end_date = '2015-07-29'\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nH10\n2015-07-01 12:00:00+00:00\n513\n\n\n1\nH10\n2015-07-01 13:00:00+00:00\n512\n\n\n2\nH10\n2015-07-01 14:00:00+00:00\n506\n\n\n3\nH10\n2015-07-01 15:00:00+00:00\n500\n\n\n4\nH10\n2015-07-01 16:00:00+00:00\n490\n\n\n...\n...\n...\n...\n\n\n1379\nH50\n2015-07-29 19:00:00+00:00\n30167\n\n\n1380\nH50\n2015-07-29 20:00:00+00:00\n28894\n\n\n1381\nH50\n2015-07-29 21:00:00+00:00\n27949\n\n\n1382\nH50\n2015-07-29 22:00:00+00:00\n27507\n\n\n1383\nH50\n2015-07-29 23:00:00+00:00\n28020\n\n\n\n\n1368 rows × 3 columns\n\n\n\n\n# Example 6 - Filter a GroupBy object\n\ndf_filtered = (\n m4_hourly_df\n .groupby('id')\n .filter_by_time(\n date_column = \"date\",\n start_date = '2015-07-01 12:00:00',\n end_date = '2015-07-01 20:00:00'\n )\n)\n\ndf_filtered\n\n\n\n\n\n\n\n\nid\ndate\nvalue\n\n\n\n\n0\nH10\n2015-07-01 12:00:00+00:00\n513\n\n\n1\nH10\n2015-07-01 13:00:00+00:00\n512\n\n\n2\nH10\n2015-07-01 14:00:00+00:00\n506\n\n\n3\nH10\n2015-07-01 15:00:00+00:00\n500\n\n\n4\nH10\n2015-07-01 16:00:00+00:00\n490\n\n\n5\nH10\n2015-07-01 17:00:00+00:00\n484\n\n\n6\nH10\n2015-07-01 18:00:00+00:00\n467\n\n\n7\nH10\n2015-07-01 19:00:00+00:00\n446\n\n\n8\nH10\n2015-07-01 20:00:00+00:00\n434\n\n\n700\nH50\n2015-07-01 12:00:00+00:00\n39325\n\n\n701\nH50\n2015-07-01 13:00:00+00:00\n38153\n\n\n702\nH50\n2015-07-01 14:00:00+00:00\n36829\n\n\n703\nH50\n2015-07-01 15:00:00+00:00\n35878\n\n\n704\nH50\n2015-07-01 16:00:00+00:00\n33626\n\n\n705\nH50\n2015-07-01 17:00:00+00:00\n31014\n\n\n706\nH50\n2015-07-01 18:00:00+00:00\n28891\n\n\n707\nH50\n2015-07-01 19:00:00+00:00\n27413\n\n\n708\nH50\n2015-07-01 20:00:00+00:00\n26291" + "objectID": "reference/TimeSeriesCVSplitter.html#parameters", + "href": "reference/TimeSeriesCVSplitter.html#parameters", + "title": "TimeSeriesCVSplitter", + "section": "Parameters:", + "text": "Parameters:\nfrequency: str The frequency of the time series (e.g., “days”, “hours”). train_size: int Minimum number of time units in the training set. forecast_horizon: int Number of time units to forecast in each split. time_series: pd.Series A pandas Series or Index representing the time values. gap: int Number of time units to skip between training and testing sets. stride: int Number of time units to move forward after each split. 
window: str Type of window, either “rolling” or “expanding”. mode: str Order of split generation, “forward” or “backward”. start_dt: pd.Timestamp Start date for the time period. end_dt: pd.Timestamp End date for the time period. split_limit: int Maximum number of splits to generate. If None, all possible splits will be generated." }, { - "objectID": "reference/correlate.html", - "href": "reference/correlate.html", - "title": "correlate", - "section": "", - "text": "correlate(data, target, method='pearson')\nThe correlate function calculates the correlation between a target variable and all other variables in a pandas DataFrame, and returns the results sorted by absolute correlation in descending order." + "objectID": "reference/TimeSeriesCVSplitter.html#raises", + "href": "reference/TimeSeriesCVSplitter.html#raises", + "title": "TimeSeriesCVSplitter", + "section": "Raises:", + "text": "Raises:\nValueError: If the input arrays are incompatible in length with the time series." }, { - "objectID": "reference/correlate.html#parameters", - "href": "reference/correlate.html#parameters", - "title": "correlate", - "section": "Parameters", - "text": "Parameters\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nUnion[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]\nThe data parameter is the input data that you want to calculate correlations for. It can be either a pandas DataFrame or a grouped DataFrame obtained from a groupby operation.\nrequired\n\n\ntarget\nstr\nThe target parameter is a string that represents the column name in the DataFrame for which you want to calculate the correlation with other columns.\nrequired\n\n\nmethod\nstr\nThe method parameter in the correlate function is used to specify the method for calculating the correlation coefficient. The available options for the method parameter are: * pearson : standard correlation coefficient * kendall : Kendall Tau correlation coefficient * spearman : Spearman rank correlation\n= 'pearson'" + "objectID": "reference/TimeSeriesCVSplitter.html#returns", + "href": "reference/TimeSeriesCVSplitter.html#returns", + "title": "TimeSeriesCVSplitter", + "section": "Returns:", + "text": "Returns:\nA generator of tuples of arrays containing the training and forecast data." }, { - "objectID": "reference/correlate.html#returns", - "href": "reference/correlate.html#returns", - "title": "correlate", - "section": "Returns", - "text": "Returns\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nThe function correlate returns a DataFrame with two columns: ‘feature’ and ‘correlation’. The\n‘feature’ column contains the names of the features in the input data, and the ‘correlation’ column contains the correlation coefficients between each feature and the target variable. The DataFrame is sorted in descending order based on the absolute correlation values." + "objectID": "reference/TimeSeriesCVSplitter.html#see-also", + "href": "reference/TimeSeriesCVSplitter.html#see-also", + "title": "TimeSeriesCVSplitter", + "section": "See Also:", + "text": "See Also:\nTimeSeriesCV" }, { - "objectID": "reference/correlate.html#see-also", - "href": "reference/correlate.html#see-also", - "title": "correlate", - "section": "See Also", - "text": "See Also\n\nbinarize() : Prepares data for correlate, which is used for analyzing correlationfunnel plots." 
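Because TimeSeriesCVSplitter follows the scikit-learn cross-validator protocol, the splits can also be consumed directly with split(). A minimal sketch, assuming the standard protocol of yielding train/test index arrays for each fold; it mirrors the setup used in the examples on this page but is not itself one of them.

import numpy as np
import pandas as pd
from pytimetk import TimeSeriesCVSplitter

time_series = pd.Series(pd.date_range("2023-01-01", "2023-01-31", freq="D"))
X = pd.DataFrame(np.random.randn(len(time_series), 2), columns=["a", "b"])
y = X.sum(axis=1)

cv = TimeSeriesCVSplitter(
    time_series=time_series,
    frequency="days",
    train_size=14,
    forecast_horizon=7,
    gap=0,
    stride=1,
    window="rolling",
)

for train_idx, test_idx in cv.split(X, y):
    # train_idx / test_idx index into X and y for one train/forecast fold
    print(len(train_idx), len(test_idx))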
+ "objectID": "reference/TimeSeriesCVSplitter.html#examples", + "href": "reference/TimeSeriesCVSplitter.html#examples", + "title": "TimeSeriesCVSplitter", + "section": "Examples", + "text": "Examples\n\nimport pandas as pd\nimport numpy as np\n\nfrom pytimetk import TimeSeriesCVSplitter\n\nstart_dt = pd.Timestamp(2023, 1, 1)\nend_dt = pd.Timestamp(2023, 1, 31)\n\ntime_series = pd.Series(pd.date_range(start_dt, end_dt, freq=\"D\"))\nsize = len(time_series)\n\ndf = pd.DataFrame(data=np.random.randn(size, 2), columns=[\"a\", \"b\"])\n\nX, y = df[[\"a\", \"b\"]], df[[\"a\", \"b\"]].sum(axis=1)\n\ncv = TimeSeriesCVSplitter(\n time_series=time_series,\n frequency=\"days\",\n train_size=14,\n forecast_horizon=7,\n gap=0,\n stride=1,\n window=\"rolling\",\n)\n\ncv\n\nTimeSeriesCVSplitter(end_dt=None, forecast_horizon=None, frequency=None,\n gap=None, mode=None, split_limit=None, start_dt=None,\n stride=None, time_series=None, train_size=None, window=None)\n\n\n\n# Insepct the cross-validation splits\ncv.splitter.plot(y, time_series = time_series)\n\n\n \n\n\n\n# Using the TimeSeriesCVSplitter in a scikit-learn CV model\n\nfrom sklearn.linear_model import Ridge\nfrom sklearn.model_selection import RandomizedSearchCV\n\n# Fit and get best estimator\nparam_grid = {\n \"alpha\": np.linspace(0.1, 2, 10),\n \"fit_intercept\": [True, False],\n \"positive\": [True, False],\n}\n\nrandom_search_cv = RandomizedSearchCV(\n estimator=Ridge(),\n param_distributions=param_grid,\n cv=cv,\n n_jobs=-1,\n).fit(X, y)\n\nrandom_search_cv.best_estimator_\n\nRidge(alpha=np.float64(0.1), fit_intercept=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.  
Ridge?Documentation for RidgeiFittedRidge(alpha=np.float64(0.1), fit_intercept=False)" }, { - "objectID": "reference/correlate.html#examples", - "href": "reference/correlate.html#examples", - "title": "correlate", - "section": "Examples", - "text": "Examples\n\n# NON-TIMESERIES EXAMPLE ----\n\nimport pandas as pd\nimport numpy as np\nimport pytimetk as tk\n\n# Set a random seed for reproducibility\nnp.random.seed(0)\n\n# Define the number of rows for your DataFrame\nnum_rows = 200\n\n# Create fake data for the columns\ndata = {\n 'Age': np.random.randint(18, 65, size=num_rows),\n 'Gender': np.random.choice(['Male', 'Female'], size=num_rows),\n 'Marital_Status': np.random.choice(['Single', 'Married', 'Divorced'], size=num_rows),\n 'City': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'], size=num_rows),\n 'Years_Playing': np.random.randint(0, 30, size=num_rows),\n 'Average_Income': np.random.randint(20000, 100000, size=num_rows),\n 'Member_Status': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], size=num_rows),\n 'Number_Children': np.random.randint(0, 5, size=num_rows),\n 'Own_House_Flag': np.random.choice([True, False], size=num_rows),\n 'Own_Car_Count': np.random.randint(0, 3, size=num_rows),\n 'PersonId': range(1, num_rows + 1), # Add a PersonId column as a row count\n 'Client': np.random.choice(['A', 'B'], size=num_rows) # Add a Client column with random values 'A' or 'B'\n}\n\n# Create a DataFrame\ndf = pd.DataFrame(data)\n\n# Binarize the data\ndf_binarized = df.binarize(n_bins=4, thresh_infreq=0.01, name_infreq=\"-OTHER\", one_hot=True)\n\ndf_binarized.glimpse() \n\n<class 'pandas.core.frame.DataFrame'>: 200 rows of 42 columns\nAge__18.0_29.0: uint8 [0, 1, 1, 1, 0, 1, 0 ...\nAge__29.0_39.0: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nAge__39.0_53.0: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nAge__53.0_64.0: uint8 [1, 0, 0, 0, 1, 0, 0 ...\nYears_Playing__0.0_7.0: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nYears_Playing__7.0_15.0: uint8 [0, 0, 1, 0, 1, 0, 1 ...\nYears_Playing__15.0_22.0: uint8 [1, 0, 0, 0, 0, 1, 0 ...\nYears_Playing__22.0_29.0: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nAverage_Income__20131.0_40110.2: uint8 [0, 0, 1, 0, 0, 0, 0 ...\nAverage_Income__40110.2_60649.5: uint8 [0, 0, 0, 1, 1, 0, 1 ...\nAverage_Income__60649.5_79904.8: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nAverage_Income__79904.8_99131.0: uint8 [1, 0, 0, 0, 0, 1, 0 ...\nPersonId__1.0_50.8: uint8 [1, 1, 1, 1, 1, 1, 1 ...\nPersonId__50.8_100.5: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nPersonId__100.5_150.2: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nPersonId__150.2_200.0: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nGender__Female: uint8 [1, 0, 0, 0, 1, 0, 1 ...\nGender__Male: uint8 [0, 1, 1, 1, 0, 1, 0 ...\nMarital_Status__Divorced: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nMarital_Status__Married: uint8 [1, 1, 0, 0, 1, 0, 0 ...\nMarital_Status__Single: uint8 [0, 0, 1, 1, 0, 1, 1 ...\nCity__Chicago: uint8 [0, 0, 1, 0, 0, 1, 0 ...\nCity__Houston: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nCity__Los Angeles: uint8 [0, 0, 0, 0, 0, 0, 0 ...\nCity__Miami: uint8 [0, 1, 0, 0, 0, 0, 0 ...\nCity__New York: uint8 [1, 0, 0, 1, 1, 0, 0 ...\nMember_Status__Bronze: uint8 [1, 0, 1, 0, 0, 0, 0 ...\nMember_Status__Gold: uint8 [0, 0, 0, 0, 0, 1, 1 ...\nMember_Status__Platinum: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nMember_Status__Silver: uint8 [0, 1, 0, 0, 1, 0, 0 ...\nNumber_Children__0: uint8 [0, 0, 1, 0, 0, 0, 0 ...\nNumber_Children__1: uint8 [0, 0, 0, 0, 0, 0, 1 ...\nNumber_Children__2: uint8 [0, 0, 0, 1, 0, 0, 0 ...\nNumber_Children__3: uint8 [0, 1, 0, 0, 0, 1, 0 ...\nNumber_Children__4: uint8 
[1, 0, 0, 0, 1, 0, 0 ...\nOwn_House_Flag__0: uint8 [1, 1, 0, 0, 1, 0, 1 ...\nOwn_House_Flag__1: uint8 [0, 0, 1, 1, 0, 1, 0 ...\nOwn_Car_Count__0: uint8 [0, 1, 0, 0, 1, 0, 0 ...\nOwn_Car_Count__1: uint8 [0, 0, 0, 1, 0, 1, 1 ...\nOwn_Car_Count__2: uint8 [1, 0, 1, 0, 0, 0, 0 ...\nClient__A: uint8 [1, 1, 1, 1, 1, 1, 1 ...\nClient__B: uint8 [0, 0, 0, 0, 0, 0, 0 ...\n\n\n\ndf_correlated = df_binarized.correlate(target='Member_Status__Platinum')\ndf_correlated\n\n\n\n\n\n\n\n\nfeature\nbin\ncorrelation\n\n\n\n\n28\nMember_Status\nPlatinum\n1.000000\n\n\n26\nMember_Status\nBronze\n-0.341351\n\n\n29\nMember_Status\nSilver\n-0.332799\n\n\n27\nMember_Status\nGold\n-0.298637\n\n\n30\nNumber_Children\n0\n0.205230\n\n\n8\nAverage_Income\n20131.0_40110.2\n-0.156593\n\n\n0\nAge\n18.0_29.0\n-0.135522\n\n\n11\nAverage_Income\n79904.8_99131.0\n0.115743\n\n\n33\nNumber_Children\n3\n-0.112216\n\n\n7\nYears_Playing\n22.0_29.0\n-0.106763\n\n\n19\nMarital_Status\nMarried\n-0.104562\n\n\n41\nClient\nB\n0.103842\n\n\n40\nClient\nA\n-0.103842\n\n\n9\nAverage_Income\n40110.2_60649.5\n0.088509\n\n\n12\nPersonId\n1.0_50.8\n0.088509\n\n\n38\nOwn_Car_Count\n1\n0.087769\n\n\n22\nCity\nHouston\n0.086124\n\n\n13\nPersonId\n50.8_100.5\n-0.074892\n\n\n2\nAge\n39.0_53.0\n0.074739\n\n\n39\nOwn_Car_Count\n2\n-0.071738\n\n\n31\nNumber_Children\n1\n-0.069054\n\n\n25\nCity\nNew York\n-0.055757\n\n\n18\nMarital_Status\nDivorced\n0.055724\n\n\n1\nAge\n29.0_39.0\n0.054374\n\n\n20\nMarital_Status\nSingle\n0.050286\n\n\n34\nNumber_Children\n4\n-0.047760\n\n\n15\nPersonId\n150.2_200.0\n-0.047659\n\n\n10\nAverage_Income\n60649.5_79904.8\n-0.047659\n\n\n5\nYears_Playing\n7.0_15.0\n0.040717\n\n\n14\nPersonId\n100.5_150.2\n0.034042\n\n\n6\nYears_Playing\n15.0_22.0\n0.034042\n\n\n21\nCity\nChicago\n-0.032799\n\n\n4\nYears_Playing\n0.0_7.0\n0.028391\n\n\n16\nGender\nFemale\n0.020215\n\n\n17\nGender\nMale\n-0.020215\n\n\n35\nOwn_House_Flag\n0\n0.017336\n\n\n36\nOwn_House_Flag\n1\n-0.017336\n\n\n37\nOwn_Car_Count\n0\n-0.016373\n\n\n3\nAge\n53.0_64.0\n0.012002\n\n\n24\nCity\nMiami\n0.010662\n\n\n23\nCity\nLos Angeles\n-0.004911\n\n\n32\nNumber_Children\n2\n0.002104\n\n\n\n\n\n\n\n\n# Interactive\ndf_correlated.plot_correlation_funnel(\n engine='plotly', \n height=400\n)\n\n\n \n\n\n\n# Static\nfig = df_correlated.plot_correlation_funnel(\n engine ='plotnine', \n height = 600\n)\nfig\n\n\n\n\n<Figure Size: (700 x 600)>" + "objectID": "reference/TimeSeriesCVSplitter.html#methods", + "href": "reference/TimeSeriesCVSplitter.html#methods", + "title": "TimeSeriesCVSplitter", + "section": "Methods", + "text": "Methods\n\n\n\nName\nDescription\n\n\n\n\nget_n_splits\nReturns the number of splits.\n\n\nsplit\nGenerates train and test indices for cross-validation.\n\n\n\n\nget_n_splits\nTimeSeriesCVSplitter.get_n_splits(X=None, y=None, groups=None)\nReturns the number of splits.\n\n\nsplit\nTimeSeriesCVSplitter.split(X=None, y=None, groups=None)\nGenerates train and test indices for cross-validation.\n\nParameters:\nX: Optional input features (ignored, for compatibility with scikit-learn). y: Optional target variable (ignored, for compatibility with scikit-learn). groups: Optional group labels (ignored, for compatibility with scikit-learn).\n\n\nYields:\nTuple[np.ndarray, np.ndarray]: Tuples of train and test indices." 
}, { "objectID": "reference/week_of_month.html", @@ -2412,7 +2461,7 @@ "href": "reference/TimeSeriesCV.html#raises", "title": "TimeSeriesCV", "section": "Raises:", - "text": "Raises:\nValueError: - If frequency is not one of “days”, “seconds”, “microseconds”, “milliseconds”, “minutes”, “hours”, “weeks”. - If window is not one of “rolling” or “expanding”. - If mode is not one of “forward” or “backward” - If train_size, forecast_horizon, gap or stride are not strictly positive.\nTypeError: If train_size, forecast_horizon, gap or stride are not of type int." + "text": "Raises:\nValueError:\n\nIf frequency is not one of “days”, “seconds”, “microseconds”, “milliseconds”, “minutes”, “hours”, “weeks”.\nIf window is not one of “rolling” or “expanding”.\nIf mode is not one of “forward” or “backward”\nIf train_size, forecast_horizon, gap or stride are not strictly positive.\n\nTypeError:\nIf train_size, forecast_horizon, gap or stride are not of type int." }, { "objectID": "reference/TimeSeriesCV.html#examples", diff --git a/docs/_site/sitemap.xml b/docs/_site/sitemap.xml index 222d9525..a62a7dea 100644 --- a/docs/_site/sitemap.xml +++ b/docs/_site/sitemap.xml @@ -2,346 +2,350 @@ https://business-science.github.io/pytimetk/reference/ts_features.html - 2024-11-06T01:33:22.832Z + 2024-11-06T02:17:26.592Z https://business-science.github.io/pytimetk/reference/floor_date.html - 2024-11-06T01:33:19.952Z + 2024-11-06T02:17:23.752Z https://business-science.github.io/pytimetk/reference/ts_summary.html - 2024-11-06T01:33:17.551Z + 2024-11-06T02:17:21.443Z https://business-science.github.io/pytimetk/reference/timeseries_unit_frequency_table.html - 2024-11-06T01:33:14.655Z + 2024-11-06T02:17:18.802Z https://business-science.github.io/pytimetk/reference/transform_columns.html - 2024-11-06T01:33:12.909Z + 2024-11-06T02:17:17.103Z https://business-science.github.io/pytimetk/reference/augment_timeseries_signature.html - 2024-11-06T01:33:11.399Z + 2024-11-06T02:17:15.629Z https://business-science.github.io/pytimetk/reference/plot_anomalies.html - 2024-11-06T01:33:09.397Z + 2024-11-06T02:17:13.709Z https://business-science.github.io/pytimetk/reference/augment_ewm.html - 2024-11-06T01:33:06.466Z + 2024-11-06T02:17:11.023Z https://business-science.github.io/pytimetk/reference/make_future_timeseries.html - 2024-11-06T01:33:04.715Z + 2024-11-06T02:17:09.301Z https://business-science.github.io/pytimetk/reference/augment_roc.html - 2024-11-06T01:33:03.239Z + 2024-11-06T02:17:07.942Z https://business-science.github.io/pytimetk/reference/get_frequency_summary.html - 2024-11-06T01:33:01.524Z + 2024-11-06T02:17:06.337Z https://business-science.github.io/pytimetk/reference/augment_qsmomentum.html - 2024-11-06T01:32:59.751Z + 2024-11-06T02:17:04.634Z https://business-science.github.io/pytimetk/reference/augment_hilbert.html - 2024-11-06T01:32:57.565Z + 2024-11-06T02:17:02.496Z https://business-science.github.io/pytimetk/reference/progress_apply.html - 2024-11-06T01:32:54.699Z + 2024-11-06T02:16:59.750Z https://business-science.github.io/pytimetk/reference/flatten_multiindex_column_names.html - 2024-11-06T01:32:52.948Z + 2024-11-06T02:16:58.156Z https://business-science.github.io/pytimetk/reference/make_weekday_sequence.html - 2024-11-06T01:32:39.651Z + 2024-11-06T02:16:45.978Z https://business-science.github.io/pytimetk/reference/augment_pct_change.html - 2024-11-06T01:32:38.220Z + 2024-11-06T02:16:44.613Z https://business-science.github.io/pytimetk/reference/get_trend_frequency.html - 2024-11-06T01:32:35.293Z + 
2024-11-06T02:16:41.896Z https://business-science.github.io/pytimetk/reference/augment_fourier.html - 2024-11-06T01:32:33.736Z + 2024-11-06T02:16:40.338Z - https://business-science.github.io/pytimetk/reference/augment_expanding_apply.html - 2024-11-06T01:32:31.447Z + https://business-science.github.io/pytimetk/reference/correlate.html + 2024-11-06T02:16:16.482Z - https://business-science.github.io/pytimetk/reference/parallel_apply.html - 2024-11-06T01:32:28.851Z + https://business-science.github.io/pytimetk/reference/filter_by_time.html + 2024-11-06T02:16:14.373Z - https://business-science.github.io/pytimetk/reference/plot_anomalies_decomp.html - 2024-11-06T01:32:25.774Z + https://business-science.github.io/pytimetk/reference/summarize_by_time.html + 2024-11-06T02:16:11.976Z - https://business-science.github.io/pytimetk/reference/apply_by_time.html - 2024-11-06T01:32:24.071Z + https://business-science.github.io/pytimetk/reference/drop_zero_variance.html + 2024-11-06T02:16:08.761Z - https://business-science.github.io/pytimetk/reference/augment_bbands.html - 2024-11-06T01:32:21.656Z + https://business-science.github.io/pytimetk/reference/index.html + 2024-11-06T02:16:07.024Z - https://business-science.github.io/pytimetk/reference/augment_ppo.html - 2024-11-06T01:32:20.060Z + https://business-science.github.io/pytimetk/reference/get_pandas_frequency.html + 2024-11-06T02:16:05.021Z - https://business-science.github.io/pytimetk/reference/glimpse.html - 2024-11-06T01:32:18.300Z + https://business-science.github.io/pytimetk/reference/palette_timetk.html + 2024-11-06T02:16:03.568Z - https://business-science.github.io/pytimetk/reference/plot_correlation_funnel.html - 2024-11-06T01:32:16.699Z + https://business-science.github.io/pytimetk/reference/get_date_summary.html + 2024-11-06T02:16:02.070Z - https://business-science.github.io/pytimetk/reference/get_seasonal_frequency.html - 2024-11-06T01:32:14.725Z + https://business-science.github.io/pytimetk/reference/is_holiday.html + 2024-11-06T02:16:00.398Z - https://business-science.github.io/pytimetk/reference/plot_anomaly_decomp.html - 2024-11-06T01:32:13.100Z + https://business-science.github.io/pytimetk/reference/get_available_datasets.html + 2024-11-06T02:15:58.808Z - https://business-science.github.io/pytimetk/reference/plot_anomalies_cleaned.html - 2024-11-06T01:32:10.749Z + https://business-science.github.io/pytimetk/reference/augment_wavelet.html + 2024-11-06T02:15:57.063Z - https://business-science.github.io/pytimetk/reference/pad_by_time.html - 2024-11-06T01:32:07.944Z + https://business-science.github.io/pytimetk/reference/augment_cmo.html + 2024-11-06T02:15:54.877Z - https://business-science.github.io/pytimetk/reference/time_scale_template.html - 2024-11-06T01:32:05.013Z + https://business-science.github.io/pytimetk/reference/augment_expanding.html + 2024-11-06T02:15:51.885Z - https://business-science.github.io/pytimetk/reference/augment_diffs.html - 2024-11-06T01:32:03.294Z + https://business-science.github.io/pytimetk/reference/ceil_date.html + 2024-11-06T02:15:49.821Z - https://business-science.github.io/pytimetk/reference/plot_timeseries.html - 2024-11-06T01:32:01.115Z + https://business-science.github.io/pytimetk/reference/augment_rolling_apply.html + 2024-11-06T02:15:47.928Z - https://business-science.github.io/pytimetk/changelog-news.html - 2024-11-06T01:31:58.106Z + https://business-science.github.io/pytimetk/reference/augment_rolling.html + 2024-11-06T02:15:45.551Z - 
https://business-science.github.io/pytimetk/tutorials/02_finance.html - 2024-11-06T01:31:55.733Z + https://business-science.github.io/pytimetk/performance/01_speed_comparisons.html + 2024-11-06T02:15:43.076Z - https://business-science.github.io/pytimetk/tutorials/03_demand_forecasting.html - 2024-11-06T01:31:43.321Z + https://business-science.github.io/pytimetk/tutorials/01_sales_crm.html + 2024-11-06T02:15:33.832Z - https://business-science.github.io/pytimetk/tutorials/06_correlationfunnel.html - 2024-11-06T01:31:39.100Z + https://business-science.github.io/pytimetk/tutorials/05_clustering.html + 2024-11-06T02:15:25.874Z - https://business-science.github.io/pytimetk/guides/05_augmenting.html - 2024-11-06T01:31:33.740Z + https://business-science.github.io/pytimetk/tutorials/04_anomaly_detection.html + 2024-11-06T02:15:24.081Z - https://business-science.github.io/pytimetk/guides/06_anomalize.html - 2024-11-06T01:31:28.933Z + https://business-science.github.io/pytimetk/guides/04_wrangling.html + 2024-11-06T02:15:22.036Z - https://business-science.github.io/pytimetk/guides/03_pandas_frequency.html - 2024-11-06T01:31:23.641Z + https://business-science.github.io/pytimetk/guides/01_visualization.html + 2024-11-06T02:15:16.637Z - https://business-science.github.io/pytimetk/getting-started/01_installation.html - 2024-11-06T01:31:21.211Z + https://business-science.github.io/pytimetk/guides/02_timetk_concepts.html + 2024-11-06T02:15:14.326Z - https://business-science.github.io/pytimetk/contributing.html - 2024-11-06T01:31:18.806Z + https://business-science.github.io/pytimetk/getting-started/02_quick_start.html + 2024-11-06T02:15:09.441Z https://business-science.github.io/pytimetk/index.html - 2024-11-06T01:31:20.264Z + 2024-11-06T02:15:07.644Z - https://business-science.github.io/pytimetk/getting-started/02_quick_start.html - 2024-11-06T01:31:22.583Z + https://business-science.github.io/pytimetk/contributing.html + 2024-11-06T02:15:06.410Z - https://business-science.github.io/pytimetk/guides/02_timetk_concepts.html - 2024-11-06T01:31:27.797Z + https://business-science.github.io/pytimetk/getting-started/01_installation.html + 2024-11-06T02:15:08.343Z - https://business-science.github.io/pytimetk/guides/01_visualization.html - 2024-11-06T01:31:30.310Z + https://business-science.github.io/pytimetk/guides/03_pandas_frequency.html + 2024-11-06T02:15:10.423Z - https://business-science.github.io/pytimetk/guides/04_wrangling.html - 2024-11-06T01:31:35.826Z + https://business-science.github.io/pytimetk/guides/06_anomalize.html + 2024-11-06T02:15:15.418Z - https://business-science.github.io/pytimetk/tutorials/04_anomaly_detection.html - 2024-11-06T01:31:38.024Z + https://business-science.github.io/pytimetk/guides/05_augmenting.html + 2024-11-06T02:15:20.041Z - https://business-science.github.io/pytimetk/tutorials/05_clustering.html - 2024-11-06T01:31:39.931Z + https://business-science.github.io/pytimetk/tutorials/06_correlationfunnel.html + 2024-11-06T02:15:25.125Z - https://business-science.github.io/pytimetk/tutorials/01_sales_crm.html - 2024-11-06T01:31:47.723Z + https://business-science.github.io/pytimetk/tutorials/03_demand_forecasting.html + 2024-11-06T02:15:29.467Z - https://business-science.github.io/pytimetk/performance/01_speed_comparisons.html - 2024-11-06T01:31:56.945Z + https://business-science.github.io/pytimetk/tutorials/02_finance.html + 2024-11-06T02:15:41.916Z - https://business-science.github.io/pytimetk/reference/augment_rolling.html - 2024-11-06T01:31:59.633Z + 
https://business-science.github.io/pytimetk/changelog-news.html + 2024-11-06T02:15:44.115Z - https://business-science.github.io/pytimetk/reference/augment_rolling_apply.html - 2024-11-06T01:32:02.142Z + https://business-science.github.io/pytimetk/reference/plot_timeseries.html + 2024-11-06T02:15:46.943Z - https://business-science.github.io/pytimetk/reference/ceil_date.html - 2024-11-06T01:32:04.070Z + https://business-science.github.io/pytimetk/reference/augment_diffs.html + 2024-11-06T02:15:49.038Z - https://business-science.github.io/pytimetk/reference/augment_expanding.html - 2024-11-06T01:32:06.347Z + https://business-science.github.io/pytimetk/reference/time_scale_template.html + 2024-11-06T02:15:50.577Z - https://business-science.github.io/pytimetk/reference/augment_cmo.html - 2024-11-06T01:32:09.605Z + https://business-science.github.io/pytimetk/reference/pad_by_time.html + 2024-11-06T02:15:53.354Z - https://business-science.github.io/pytimetk/reference/augment_wavelet.html - 2024-11-06T01:32:12.042Z + https://business-science.github.io/pytimetk/reference/plot_anomalies_cleaned.html + 2024-11-06T02:15:55.902Z - https://business-science.github.io/pytimetk/reference/get_available_datasets.html - 2024-11-06T01:32:13.854Z + https://business-science.github.io/pytimetk/reference/plot_anomaly_decomp.html + 2024-11-06T02:15:58.035Z - https://business-science.github.io/pytimetk/reference/is_holiday.html - 2024-11-06T01:32:15.652Z + https://business-science.github.io/pytimetk/reference/get_seasonal_frequency.html + 2024-11-06T02:15:59.583Z - https://business-science.github.io/pytimetk/reference/get_date_summary.html - 2024-11-06T01:32:17.512Z + https://business-science.github.io/pytimetk/reference/plot_correlation_funnel.html + 2024-11-06T02:16:01.332Z - https://business-science.github.io/pytimetk/reference/palette_timetk.html - 2024-11-06T01:32:19.060Z + https://business-science.github.io/pytimetk/reference/glimpse.html + 2024-11-06T02:16:02.842Z - https://business-science.github.io/pytimetk/reference/get_pandas_frequency.html - 2024-11-06T01:32:20.566Z + https://business-science.github.io/pytimetk/reference/augment_ppo.html + 2024-11-06T02:16:04.533Z - https://business-science.github.io/pytimetk/reference/index.html - 2024-11-06T01:32:22.708Z + https://business-science.github.io/pytimetk/reference/augment_bbands.html + 2024-11-06T02:16:06.008Z - https://business-science.github.io/pytimetk/reference/drop_zero_variance.html - 2024-11-06T01:32:24.626Z + https://business-science.github.io/pytimetk/reference/apply_by_time.html + 2024-11-06T02:16:08.268Z - https://business-science.github.io/pytimetk/reference/summarize_by_time.html - 2024-11-06T01:32:27.832Z + https://business-science.github.io/pytimetk/reference/plot_anomalies_decomp.html + 2024-11-06T02:16:09.793Z - https://business-science.github.io/pytimetk/reference/filter_by_time.html - 2024-11-06T01:32:30.358Z + https://business-science.github.io/pytimetk/reference/parallel_apply.html + 2024-11-06T02:16:13.030Z - https://business-science.github.io/pytimetk/reference/correlate.html - 2024-11-06T01:32:32.591Z + https://business-science.github.io/pytimetk/reference/augment_expanding_apply.html + 2024-11-06T02:16:15.411Z + + + https://business-science.github.io/pytimetk/reference/TimeSeriesCVSplitter.html + 2024-11-06T02:16:39.207Z https://business-science.github.io/pytimetk/reference/week_of_month.html - 2024-11-06T01:32:34.514Z + 2024-11-06T02:16:41.132Z https://business-science.github.io/pytimetk/reference/future_frame.html - 
2024-11-06T01:32:37.006Z + 2024-11-06T02:16:43.539Z https://business-science.github.io/pytimetk/reference/get_frequency.html - 2024-11-06T01:32:38.833Z + 2024-11-06T02:16:45.155Z https://business-science.github.io/pytimetk/reference/TimeSeriesCV.html - 2024-11-06T01:32:52.140Z + 2024-11-06T02:16:57.347Z https://business-science.github.io/pytimetk/reference/theme_timetk.html - 2024-11-06T01:32:53.905Z + 2024-11-06T02:16:58.969Z https://business-science.github.io/pytimetk/reference/augment_rsi.html - 2024-11-06T01:32:56.246Z + 2024-11-06T02:17:01.235Z https://business-science.github.io/pytimetk/reference/augment_leads.html - 2024-11-06T01:32:58.905Z + 2024-11-06T02:17:03.796Z https://business-science.github.io/pytimetk/reference/binarize.html - 2024-11-06T01:33:00.716Z + 2024-11-06T02:17:05.527Z https://business-science.github.io/pytimetk/reference/load_dataset.html - 2024-11-06T01:33:02.076Z + 2024-11-06T02:17:06.885Z https://business-science.github.io/pytimetk/reference/get_diff_summary.html - 2024-11-06T01:33:03.887Z + 2024-11-06T02:17:08.489Z https://business-science.github.io/pytimetk/reference/make_weekend_sequence.html - 2024-11-06T01:33:05.533Z + 2024-11-06T02:17:10.120Z https://business-science.github.io/pytimetk/reference/get_holiday_signature.html - 2024-11-06T01:33:07.940Z + 2024-11-06T02:17:12.423Z https://business-science.github.io/pytimetk/reference/augment_atr.html - 2024-11-06T01:33:10.458Z + 2024-11-06T02:17:14.767Z https://business-science.github.io/pytimetk/reference/augment_macd.html - 2024-11-06T01:33:12.404Z + 2024-11-06T02:17:16.615Z https://business-science.github.io/pytimetk/reference/get_timeseries_signature.html - 2024-11-06T01:33:13.817Z + 2024-11-06T02:17:17.999Z https://business-science.github.io/pytimetk/reference/augment_lags.html - 2024-11-06T01:33:16.096Z + 2024-11-06T02:17:20.071Z https://business-science.github.io/pytimetk/reference/augment_holiday_signature.html - 2024-11-06T01:33:19.139Z + 2024-11-06T02:17:22.983Z https://business-science.github.io/pytimetk/reference/anomalize.html - 2024-11-06T01:33:21.866Z + 2024-11-06T02:17:25.624Z diff --git a/docs/objects.json b/docs/objects.json index 1cf23e59..795c3645 100644 --- a/docs/objects.json +++ b/docs/objects.json @@ -1 +1 @@ -{"project": "pytimetk", "version": "0.0.9999", "count": 136, "items": [{"name": "pytimetk.plot_timeseries", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_timeseries.html#pytimetk.plot_timeseries", "dispname": "-"}, {"name": "pytimetk.plot.plot_timeseries.plot_timeseries", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_timeseries.html#pytimetk.plot_timeseries", "dispname": "pytimetk.plot_timeseries"}, {"name": "pytimetk.summarize_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/summarize_by_time.html#pytimetk.summarize_by_time", "dispname": "-"}, {"name": "pytimetk.core.summarize_by_time.summarize_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/summarize_by_time.html#pytimetk.summarize_by_time", "dispname": "pytimetk.summarize_by_time"}, {"name": "pytimetk.apply_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/apply_by_time.html#pytimetk.apply_by_time", "dispname": "-"}, {"name": "pytimetk.core.apply_by_time.apply_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/apply_by_time.html#pytimetk.apply_by_time", "dispname": "pytimetk.apply_by_time"}, {"name": "pytimetk.pad_by_time", "domain": "py", 
"role": "function", "priority": "1", "uri": "reference/pad_by_time.html#pytimetk.pad_by_time", "dispname": "-"}, {"name": "pytimetk.core.pad.pad_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/pad_by_time.html#pytimetk.pad_by_time", "dispname": "pytimetk.pad_by_time"}, {"name": "pytimetk.filter_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/filter_by_time.html#pytimetk.filter_by_time", "dispname": "-"}, {"name": "pytimetk.core.filter_by_time.filter_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/filter_by_time.html#pytimetk.filter_by_time", "dispname": "pytimetk.filter_by_time"}, {"name": "pytimetk.future_frame", "domain": "py", "role": "function", "priority": "1", "uri": "reference/future_frame.html#pytimetk.future_frame", "dispname": "-"}, {"name": "pytimetk.core.future.future_frame", "domain": "py", "role": "function", "priority": "1", "uri": "reference/future_frame.html#pytimetk.future_frame", "dispname": "pytimetk.future_frame"}, {"name": "pytimetk.anomalize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/anomalize.html#pytimetk.anomalize", "dispname": "-"}, {"name": "pytimetk.core.anomalize.anomalize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/anomalize.html#pytimetk.anomalize", "dispname": "pytimetk.anomalize"}, {"name": "pytimetk.plot_anomalies", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies.html#pytimetk.plot_anomalies", "dispname": "-"}, {"name": "pytimetk.plot.plot_anomalies.plot_anomalies", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies.html#pytimetk.plot_anomalies", "dispname": "pytimetk.plot_anomalies"}, {"name": "pytimetk.plot_anomalies_decomp", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies_decomp.html#pytimetk.plot_anomalies_decomp", "dispname": "-"}, {"name": "pytimetk.plot.plot_anomalies_decomp.plot_anomalies_decomp", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies_decomp.html#pytimetk.plot_anomalies_decomp", "dispname": "pytimetk.plot_anomalies_decomp"}, {"name": "pytimetk.plot_anomalies_cleaned", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies_cleaned.html#pytimetk.plot_anomalies_cleaned", "dispname": "-"}, {"name": "pytimetk.plot.plot_anomalies_cleaned.plot_anomalies_cleaned", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies_cleaned.html#pytimetk.plot_anomalies_cleaned", "dispname": "pytimetk.plot_anomalies_cleaned"}, {"name": "pytimetk.binarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/binarize.html#pytimetk.binarize", "dispname": "-"}, {"name": "pytimetk.core.correlationfunnel.binarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/binarize.html#pytimetk.binarize", "dispname": "pytimetk.binarize"}, {"name": "pytimetk.correlate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/correlate.html#pytimetk.correlate", "dispname": "-"}, {"name": "pytimetk.core.correlationfunnel.correlate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/correlate.html#pytimetk.correlate", "dispname": "pytimetk.correlate"}, {"name": "pytimetk.plot_correlation_funnel", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_correlation_funnel.html#pytimetk.plot_correlation_funnel", "dispname": 
"-"}, {"name": "pytimetk.plot.plot_correlation_funnel.plot_correlation_funnel", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_correlation_funnel.html#pytimetk.plot_correlation_funnel", "dispname": "pytimetk.plot_correlation_funnel"}, {"name": "pytimetk.augment_timeseries_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_timeseries_signature.html#pytimetk.augment_timeseries_signature", "dispname": "-"}, {"name": "pytimetk.feature_engineering.timeseries_signature.augment_timeseries_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_timeseries_signature.html#pytimetk.augment_timeseries_signature", "dispname": "pytimetk.augment_timeseries_signature"}, {"name": "pytimetk.augment_holiday_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_holiday_signature.html#pytimetk.augment_holiday_signature", "dispname": "-"}, {"name": "pytimetk.feature_engineering.holiday_signature.augment_holiday_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_holiday_signature.html#pytimetk.augment_holiday_signature", "dispname": "pytimetk.augment_holiday_signature"}, {"name": "pytimetk.augment_lags", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_lags.html#pytimetk.augment_lags", "dispname": "-"}, {"name": "pytimetk.feature_engineering.lags.augment_lags", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_lags.html#pytimetk.augment_lags", "dispname": "pytimetk.augment_lags"}, {"name": "pytimetk.augment_leads", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_leads.html#pytimetk.augment_leads", "dispname": "-"}, {"name": "pytimetk.feature_engineering.leads.augment_leads", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_leads.html#pytimetk.augment_leads", "dispname": "pytimetk.augment_leads"}, {"name": "pytimetk.augment_diffs", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_diffs.html#pytimetk.augment_diffs", "dispname": "-"}, {"name": "pytimetk.feature_engineering.diffs.augment_diffs", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_diffs.html#pytimetk.augment_diffs", "dispname": "pytimetk.augment_diffs"}, {"name": "pytimetk.augment_pct_change", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_pct_change.html#pytimetk.augment_pct_change", "dispname": "-"}, {"name": "pytimetk.feature_engineering.pct_change.augment_pct_change", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_pct_change.html#pytimetk.augment_pct_change", "dispname": "pytimetk.augment_pct_change"}, {"name": "pytimetk.augment_rolling", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rolling.html#pytimetk.augment_rolling", "dispname": "-"}, {"name": "pytimetk.feature_engineering.rolling.augment_rolling", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rolling.html#pytimetk.augment_rolling", "dispname": "pytimetk.augment_rolling"}, {"name": "pytimetk.augment_rolling_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rolling_apply.html#pytimetk.augment_rolling_apply", "dispname": "-"}, {"name": "pytimetk.feature_engineering.rolling_apply.augment_rolling_apply", "domain": "py", "role": "function", "priority": "1", "uri": 
"reference/augment_rolling_apply.html#pytimetk.augment_rolling_apply", "dispname": "pytimetk.augment_rolling_apply"}, {"name": "pytimetk.augment_expanding", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_expanding.html#pytimetk.augment_expanding", "dispname": "-"}, {"name": "pytimetk.feature_engineering.expanding.augment_expanding", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_expanding.html#pytimetk.augment_expanding", "dispname": "pytimetk.augment_expanding"}, {"name": "pytimetk.augment_expanding_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_expanding_apply.html#pytimetk.augment_expanding_apply", "dispname": "-"}, {"name": "pytimetk.feature_engineering.expanding_apply.augment_expanding_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_expanding_apply.html#pytimetk.augment_expanding_apply", "dispname": "pytimetk.augment_expanding_apply"}, {"name": "pytimetk.augment_ewm", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_ewm.html#pytimetk.augment_ewm", "dispname": "-"}, {"name": "pytimetk.feature_engineering.ewm.augment_ewm", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_ewm.html#pytimetk.augment_ewm", "dispname": "pytimetk.augment_ewm"}, {"name": "pytimetk.augment_fourier", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_fourier.html#pytimetk.augment_fourier", "dispname": "-"}, {"name": "pytimetk.feature_engineering.fourier.augment_fourier", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_fourier.html#pytimetk.augment_fourier", "dispname": "pytimetk.augment_fourier"}, {"name": "pytimetk.augment_hilbert", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_hilbert.html#pytimetk.augment_hilbert", "dispname": "-"}, {"name": "pytimetk.feature_engineering.hilbert.augment_hilbert", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_hilbert.html#pytimetk.augment_hilbert", "dispname": "pytimetk.augment_hilbert"}, {"name": "pytimetk.augment_wavelet", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_wavelet.html#pytimetk.augment_wavelet", "dispname": "-"}, {"name": "pytimetk.feature_engineering.wavelet.augment_wavelet", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_wavelet.html#pytimetk.augment_wavelet", "dispname": "pytimetk.augment_wavelet"}, {"name": "pytimetk.ts_features", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ts_features.html#pytimetk.ts_features", "dispname": "-"}, {"name": "pytimetk.core.ts_features.ts_features", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ts_features.html#pytimetk.ts_features", "dispname": "pytimetk.ts_features"}, {"name": "pytimetk.ts_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ts_summary.html#pytimetk.ts_summary", "dispname": "-"}, {"name": "pytimetk.core.ts_summary.ts_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ts_summary.html#pytimetk.ts_summary", "dispname": "pytimetk.ts_summary"}, {"name": "pytimetk.TimeSeriesCV.glimpse", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.glimpse", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCV.glimpse", "domain": "py", "role": 
"function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.glimpse", "dispname": "pytimetk.TimeSeriesCV.glimpse"}, {"name": "pytimetk.TimeSeriesCV.plot", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.plot", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCV.plot", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.plot", "dispname": "pytimetk.TimeSeriesCV.plot"}, {"name": "pytimetk.TimeSeriesCV.split", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.split", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCV.split", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.split", "dispname": "pytimetk.TimeSeriesCV.split"}, {"name": "pytimetk.TimeSeriesCV", "domain": "py", "role": "class", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCV", "domain": "py", "role": "class", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV", "dispname": "pytimetk.TimeSeriesCV"}, {"name": "pytimetk.augment_macd", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_macd.html#pytimetk.augment_macd", "dispname": "-"}, {"name": "pytimetk.finance.macd.augment_macd", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_macd.html#pytimetk.augment_macd", "dispname": "pytimetk.augment_macd"}, {"name": "pytimetk.augment_ppo", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_ppo.html#pytimetk.augment_ppo", "dispname": "-"}, {"name": "pytimetk.finance.ppo.augment_ppo", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_ppo.html#pytimetk.augment_ppo", "dispname": "pytimetk.augment_ppo"}, {"name": "pytimetk.augment_rsi", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rsi.html#pytimetk.augment_rsi", "dispname": "-"}, {"name": "pytimetk.finance.rsi.augment_rsi", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rsi.html#pytimetk.augment_rsi", "dispname": "pytimetk.augment_rsi"}, {"name": "pytimetk.augment_cmo", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_cmo.html#pytimetk.augment_cmo", "dispname": "-"}, {"name": "pytimetk.finance.cmo.augment_cmo", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_cmo.html#pytimetk.augment_cmo", "dispname": "pytimetk.augment_cmo"}, {"name": "pytimetk.augment_roc", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_roc.html#pytimetk.augment_roc", "dispname": "-"}, {"name": "pytimetk.finance.roc.augment_roc", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_roc.html#pytimetk.augment_roc", "dispname": "pytimetk.augment_roc"}, {"name": "pytimetk.augment_qsmomentum", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_qsmomentum.html#pytimetk.augment_qsmomentum", "dispname": "-"}, {"name": "pytimetk.finance.qsmomentum.augment_qsmomentum", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_qsmomentum.html#pytimetk.augment_qsmomentum", "dispname": "pytimetk.augment_qsmomentum"}, {"name": 
"pytimetk.augment_bbands", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_bbands.html#pytimetk.augment_bbands", "dispname": "-"}, {"name": "pytimetk.finance.bbands.augment_bbands", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_bbands.html#pytimetk.augment_bbands", "dispname": "pytimetk.augment_bbands"}, {"name": "pytimetk.augment_atr", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_atr.html#pytimetk.augment_atr", "dispname": "-"}, {"name": "pytimetk.finance.atr.augment_atr", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_atr.html#pytimetk.augment_atr", "dispname": "pytimetk.augment_atr"}, {"name": "pytimetk.make_future_timeseries", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_future_timeseries.html#pytimetk.make_future_timeseries", "dispname": "-"}, {"name": "pytimetk.core.make_future_timeseries.make_future_timeseries", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_future_timeseries.html#pytimetk.make_future_timeseries", "dispname": "pytimetk.make_future_timeseries"}, {"name": "pytimetk.make_weekday_sequence", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_weekday_sequence.html#pytimetk.make_weekday_sequence", "dispname": "-"}, {"name": "pytimetk.core.make_timeseries_sequence.make_weekday_sequence", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_weekday_sequence.html#pytimetk.make_weekday_sequence", "dispname": "pytimetk.make_weekday_sequence"}, {"name": "pytimetk.make_weekend_sequence", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_weekend_sequence.html#pytimetk.make_weekend_sequence", "dispname": "-"}, {"name": "pytimetk.core.make_timeseries_sequence.make_weekend_sequence", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_weekend_sequence.html#pytimetk.make_weekend_sequence", "dispname": "pytimetk.make_weekend_sequence"}, {"name": "pytimetk.get_date_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_date_summary.html#pytimetk.get_date_summary", "dispname": "-"}, {"name": "pytimetk.core.ts_summary.get_date_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_date_summary.html#pytimetk.get_date_summary", "dispname": "pytimetk.get_date_summary"}, {"name": "pytimetk.get_frequency_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_frequency_summary.html#pytimetk.get_frequency_summary", "dispname": "-"}, {"name": "pytimetk.core.frequency.get_frequency_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_frequency_summary.html#pytimetk.get_frequency_summary", "dispname": "pytimetk.get_frequency_summary"}, {"name": "pytimetk.get_diff_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_diff_summary.html#pytimetk.get_diff_summary", "dispname": "-"}, {"name": "pytimetk.core.ts_summary.get_diff_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_diff_summary.html#pytimetk.get_diff_summary", "dispname": "pytimetk.get_diff_summary"}, {"name": "pytimetk.get_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_frequency.html#pytimetk.get_frequency", "dispname": "-"}, {"name": "pytimetk.core.frequency.get_frequency", "domain": "py", "role": "function", "priority": "1", "uri": 
"reference/get_frequency.html#pytimetk.get_frequency", "dispname": "pytimetk.get_frequency"}, {"name": "pytimetk.get_seasonal_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_seasonal_frequency.html#pytimetk.get_seasonal_frequency", "dispname": "-"}, {"name": "pytimetk.core.frequency.get_seasonal_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_seasonal_frequency.html#pytimetk.get_seasonal_frequency", "dispname": "pytimetk.get_seasonal_frequency"}, {"name": "pytimetk.get_trend_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_trend_frequency.html#pytimetk.get_trend_frequency", "dispname": "-"}, {"name": "pytimetk.core.frequency.get_trend_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_trend_frequency.html#pytimetk.get_trend_frequency", "dispname": "pytimetk.get_trend_frequency"}, {"name": "pytimetk.get_timeseries_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_timeseries_signature.html#pytimetk.get_timeseries_signature", "dispname": "-"}, {"name": "pytimetk.feature_engineering.timeseries_signature.get_timeseries_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_timeseries_signature.html#pytimetk.get_timeseries_signature", "dispname": "pytimetk.get_timeseries_signature"}, {"name": "pytimetk.get_holiday_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_holiday_signature.html#pytimetk.get_holiday_signature", "dispname": "-"}, {"name": "pytimetk.feature_engineering.holiday_signature.get_holiday_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_holiday_signature.html#pytimetk.get_holiday_signature", "dispname": "pytimetk.get_holiday_signature"}, {"name": "pytimetk.floor_date", "domain": "py", "role": "function", "priority": "1", "uri": "reference/floor_date.html#pytimetk.floor_date", "dispname": "-"}, {"name": "pytimetk.utils.datetime_helpers.floor_date", "domain": "py", "role": "function", "priority": "1", "uri": "reference/floor_date.html#pytimetk.floor_date", "dispname": "pytimetk.floor_date"}, {"name": "pytimetk.ceil_date", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ceil_date.html#pytimetk.ceil_date", "dispname": "-"}, {"name": "pytimetk.utils.datetime_helpers.ceil_date", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ceil_date.html#pytimetk.ceil_date", "dispname": "pytimetk.ceil_date"}, {"name": "pytimetk.is_holiday", "domain": "py", "role": "function", "priority": "1", "uri": "reference/is_holiday.html#pytimetk.is_holiday", "dispname": "-"}, {"name": "pytimetk.utils.datetime_helpers.is_holiday", "domain": "py", "role": "function", "priority": "1", "uri": "reference/is_holiday.html#pytimetk.is_holiday", "dispname": "pytimetk.is_holiday"}, {"name": "pytimetk.week_of_month", "domain": "py", "role": "function", "priority": "1", "uri": "reference/week_of_month.html#pytimetk.week_of_month", "dispname": "-"}, {"name": "pytimetk.utils.datetime_helpers.week_of_month", "domain": "py", "role": "function", "priority": "1", "uri": "reference/week_of_month.html#pytimetk.week_of_month", "dispname": "pytimetk.week_of_month"}, {"name": "pytimetk.timeseries_unit_frequency_table", "domain": "py", "role": "function", "priority": "1", "uri": "reference/timeseries_unit_frequency_table.html#pytimetk.timeseries_unit_frequency_table", "dispname": "-"}, {"name": 
"pytimetk.core.frequency.timeseries_unit_frequency_table", "domain": "py", "role": "function", "priority": "1", "uri": "reference/timeseries_unit_frequency_table.html#pytimetk.timeseries_unit_frequency_table", "dispname": "pytimetk.timeseries_unit_frequency_table"}, {"name": "pytimetk.time_scale_template", "domain": "py", "role": "function", "priority": "1", "uri": "reference/time_scale_template.html#pytimetk.time_scale_template", "dispname": "-"}, {"name": "pytimetk.core.frequency.time_scale_template", "domain": "py", "role": "function", "priority": "1", "uri": "reference/time_scale_template.html#pytimetk.time_scale_template", "dispname": "pytimetk.time_scale_template"}, {"name": "pytimetk.theme_timetk", "domain": "py", "role": "function", "priority": "1", "uri": "reference/theme_timetk.html#pytimetk.theme_timetk", "dispname": "-"}, {"name": "pytimetk.plot.theme.theme_timetk", "domain": "py", "role": "function", "priority": "1", "uri": "reference/theme_timetk.html#pytimetk.theme_timetk", "dispname": "pytimetk.theme_timetk"}, {"name": "pytimetk.palette_timetk", "domain": "py", "role": "function", "priority": "1", "uri": "reference/palette_timetk.html#pytimetk.palette_timetk", "dispname": "-"}, {"name": "pytimetk.plot.theme.palette_timetk", "domain": "py", "role": "function", "priority": "1", "uri": "reference/palette_timetk.html#pytimetk.palette_timetk", "dispname": "pytimetk.palette_timetk"}, {"name": "pytimetk.glimpse", "domain": "py", "role": "function", "priority": "1", "uri": "reference/glimpse.html#pytimetk.glimpse", "dispname": "-"}, {"name": "pytimetk.utils.pandas_helpers.glimpse", "domain": "py", "role": "function", "priority": "1", "uri": "reference/glimpse.html#pytimetk.glimpse", "dispname": "pytimetk.glimpse"}, {"name": "pytimetk.parallel_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/parallel_apply.html#pytimetk.parallel_apply", "dispname": "-"}, {"name": "pytimetk.utils.parallel_helpers.parallel_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/parallel_apply.html#pytimetk.parallel_apply", "dispname": "pytimetk.parallel_apply"}, {"name": "pytimetk.progress_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/progress_apply.html#pytimetk.progress_apply", "dispname": "-"}, {"name": "pytimetk.utils.parallel_helpers.progress_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/progress_apply.html#pytimetk.progress_apply", "dispname": "pytimetk.progress_apply"}, {"name": "pytimetk.drop_zero_variance", "domain": "py", "role": "function", "priority": "1", "uri": "reference/drop_zero_variance.html#pytimetk.drop_zero_variance", "dispname": "-"}, {"name": "pytimetk.utils.pandas_helpers.drop_zero_variance", "domain": "py", "role": "function", "priority": "1", "uri": "reference/drop_zero_variance.html#pytimetk.drop_zero_variance", "dispname": "pytimetk.drop_zero_variance"}, {"name": "pytimetk.transform_columns", "domain": "py", "role": "function", "priority": "1", "uri": "reference/transform_columns.html#pytimetk.transform_columns", "dispname": "-"}, {"name": "pytimetk.utils.pandas_helpers.transform_columns", "domain": "py", "role": "function", "priority": "1", "uri": "reference/transform_columns.html#pytimetk.transform_columns", "dispname": "pytimetk.transform_columns"}, {"name": "pytimetk.flatten_multiindex_column_names", "domain": "py", "role": "function", "priority": "1", "uri": "reference/flatten_multiindex_column_names.html#pytimetk.flatten_multiindex_column_names", 
"dispname": "-"}, {"name": "pytimetk.utils.pandas_helpers.flatten_multiindex_column_names", "domain": "py", "role": "function", "priority": "1", "uri": "reference/flatten_multiindex_column_names.html#pytimetk.flatten_multiindex_column_names", "dispname": "pytimetk.flatten_multiindex_column_names"}, {"name": "pytimetk.get_available_datasets", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_available_datasets.html#pytimetk.get_available_datasets", "dispname": "-"}, {"name": "pytimetk.datasets.get_datasets.get_available_datasets", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_available_datasets.html#pytimetk.get_available_datasets", "dispname": "pytimetk.get_available_datasets"}, {"name": "pytimetk.load_dataset", "domain": "py", "role": "function", "priority": "1", "uri": "reference/load_dataset.html#pytimetk.load_dataset", "dispname": "-"}, {"name": "pytimetk.datasets.get_datasets.load_dataset", "domain": "py", "role": "function", "priority": "1", "uri": "reference/load_dataset.html#pytimetk.load_dataset", "dispname": "pytimetk.load_dataset"}]} \ No newline at end of file +{"project": "pytimetk", "version": "0.0.9999", "count": 142, "items": [{"name": "pytimetk.plot_timeseries", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_timeseries.html#pytimetk.plot_timeseries", "dispname": "-"}, {"name": "pytimetk.plot.plot_timeseries.plot_timeseries", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_timeseries.html#pytimetk.plot_timeseries", "dispname": "pytimetk.plot_timeseries"}, {"name": "pytimetk.summarize_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/summarize_by_time.html#pytimetk.summarize_by_time", "dispname": "-"}, {"name": "pytimetk.core.summarize_by_time.summarize_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/summarize_by_time.html#pytimetk.summarize_by_time", "dispname": "pytimetk.summarize_by_time"}, {"name": "pytimetk.apply_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/apply_by_time.html#pytimetk.apply_by_time", "dispname": "-"}, {"name": "pytimetk.core.apply_by_time.apply_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/apply_by_time.html#pytimetk.apply_by_time", "dispname": "pytimetk.apply_by_time"}, {"name": "pytimetk.pad_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/pad_by_time.html#pytimetk.pad_by_time", "dispname": "-"}, {"name": "pytimetk.core.pad.pad_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/pad_by_time.html#pytimetk.pad_by_time", "dispname": "pytimetk.pad_by_time"}, {"name": "pytimetk.filter_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/filter_by_time.html#pytimetk.filter_by_time", "dispname": "-"}, {"name": "pytimetk.core.filter_by_time.filter_by_time", "domain": "py", "role": "function", "priority": "1", "uri": "reference/filter_by_time.html#pytimetk.filter_by_time", "dispname": "pytimetk.filter_by_time"}, {"name": "pytimetk.future_frame", "domain": "py", "role": "function", "priority": "1", "uri": "reference/future_frame.html#pytimetk.future_frame", "dispname": "-"}, {"name": "pytimetk.core.future.future_frame", "domain": "py", "role": "function", "priority": "1", "uri": "reference/future_frame.html#pytimetk.future_frame", "dispname": "pytimetk.future_frame"}, {"name": "pytimetk.anomalize", "domain": "py", "role": 
"function", "priority": "1", "uri": "reference/anomalize.html#pytimetk.anomalize", "dispname": "-"}, {"name": "pytimetk.core.anomalize.anomalize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/anomalize.html#pytimetk.anomalize", "dispname": "pytimetk.anomalize"}, {"name": "pytimetk.plot_anomalies", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies.html#pytimetk.plot_anomalies", "dispname": "-"}, {"name": "pytimetk.plot.plot_anomalies.plot_anomalies", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies.html#pytimetk.plot_anomalies", "dispname": "pytimetk.plot_anomalies"}, {"name": "pytimetk.plot_anomalies_decomp", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies_decomp.html#pytimetk.plot_anomalies_decomp", "dispname": "-"}, {"name": "pytimetk.plot.plot_anomalies_decomp.plot_anomalies_decomp", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies_decomp.html#pytimetk.plot_anomalies_decomp", "dispname": "pytimetk.plot_anomalies_decomp"}, {"name": "pytimetk.plot_anomalies_cleaned", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies_cleaned.html#pytimetk.plot_anomalies_cleaned", "dispname": "-"}, {"name": "pytimetk.plot.plot_anomalies_cleaned.plot_anomalies_cleaned", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_anomalies_cleaned.html#pytimetk.plot_anomalies_cleaned", "dispname": "pytimetk.plot_anomalies_cleaned"}, {"name": "pytimetk.binarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/binarize.html#pytimetk.binarize", "dispname": "-"}, {"name": "pytimetk.core.correlationfunnel.binarize", "domain": "py", "role": "function", "priority": "1", "uri": "reference/binarize.html#pytimetk.binarize", "dispname": "pytimetk.binarize"}, {"name": "pytimetk.correlate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/correlate.html#pytimetk.correlate", "dispname": "-"}, {"name": "pytimetk.core.correlationfunnel.correlate", "domain": "py", "role": "function", "priority": "1", "uri": "reference/correlate.html#pytimetk.correlate", "dispname": "pytimetk.correlate"}, {"name": "pytimetk.plot_correlation_funnel", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_correlation_funnel.html#pytimetk.plot_correlation_funnel", "dispname": "-"}, {"name": "pytimetk.plot.plot_correlation_funnel.plot_correlation_funnel", "domain": "py", "role": "function", "priority": "1", "uri": "reference/plot_correlation_funnel.html#pytimetk.plot_correlation_funnel", "dispname": "pytimetk.plot_correlation_funnel"}, {"name": "pytimetk.augment_timeseries_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_timeseries_signature.html#pytimetk.augment_timeseries_signature", "dispname": "-"}, {"name": "pytimetk.feature_engineering.timeseries_signature.augment_timeseries_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_timeseries_signature.html#pytimetk.augment_timeseries_signature", "dispname": "pytimetk.augment_timeseries_signature"}, {"name": "pytimetk.augment_holiday_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_holiday_signature.html#pytimetk.augment_holiday_signature", "dispname": "-"}, {"name": "pytimetk.feature_engineering.holiday_signature.augment_holiday_signature", "domain": "py", "role": "function", "priority": 
"1", "uri": "reference/augment_holiday_signature.html#pytimetk.augment_holiday_signature", "dispname": "pytimetk.augment_holiday_signature"}, {"name": "pytimetk.augment_lags", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_lags.html#pytimetk.augment_lags", "dispname": "-"}, {"name": "pytimetk.feature_engineering.lags.augment_lags", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_lags.html#pytimetk.augment_lags", "dispname": "pytimetk.augment_lags"}, {"name": "pytimetk.augment_leads", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_leads.html#pytimetk.augment_leads", "dispname": "-"}, {"name": "pytimetk.feature_engineering.leads.augment_leads", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_leads.html#pytimetk.augment_leads", "dispname": "pytimetk.augment_leads"}, {"name": "pytimetk.augment_diffs", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_diffs.html#pytimetk.augment_diffs", "dispname": "-"}, {"name": "pytimetk.feature_engineering.diffs.augment_diffs", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_diffs.html#pytimetk.augment_diffs", "dispname": "pytimetk.augment_diffs"}, {"name": "pytimetk.augment_pct_change", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_pct_change.html#pytimetk.augment_pct_change", "dispname": "-"}, {"name": "pytimetk.feature_engineering.pct_change.augment_pct_change", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_pct_change.html#pytimetk.augment_pct_change", "dispname": "pytimetk.augment_pct_change"}, {"name": "pytimetk.augment_rolling", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rolling.html#pytimetk.augment_rolling", "dispname": "-"}, {"name": "pytimetk.feature_engineering.rolling.augment_rolling", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rolling.html#pytimetk.augment_rolling", "dispname": "pytimetk.augment_rolling"}, {"name": "pytimetk.augment_rolling_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rolling_apply.html#pytimetk.augment_rolling_apply", "dispname": "-"}, {"name": "pytimetk.feature_engineering.rolling_apply.augment_rolling_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rolling_apply.html#pytimetk.augment_rolling_apply", "dispname": "pytimetk.augment_rolling_apply"}, {"name": "pytimetk.augment_expanding", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_expanding.html#pytimetk.augment_expanding", "dispname": "-"}, {"name": "pytimetk.feature_engineering.expanding.augment_expanding", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_expanding.html#pytimetk.augment_expanding", "dispname": "pytimetk.augment_expanding"}, {"name": "pytimetk.augment_expanding_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_expanding_apply.html#pytimetk.augment_expanding_apply", "dispname": "-"}, {"name": "pytimetk.feature_engineering.expanding_apply.augment_expanding_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_expanding_apply.html#pytimetk.augment_expanding_apply", "dispname": "pytimetk.augment_expanding_apply"}, {"name": "pytimetk.augment_ewm", "domain": "py", "role": "function", "priority": "1", "uri": 
"reference/augment_ewm.html#pytimetk.augment_ewm", "dispname": "-"}, {"name": "pytimetk.feature_engineering.ewm.augment_ewm", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_ewm.html#pytimetk.augment_ewm", "dispname": "pytimetk.augment_ewm"}, {"name": "pytimetk.augment_fourier", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_fourier.html#pytimetk.augment_fourier", "dispname": "-"}, {"name": "pytimetk.feature_engineering.fourier.augment_fourier", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_fourier.html#pytimetk.augment_fourier", "dispname": "pytimetk.augment_fourier"}, {"name": "pytimetk.augment_hilbert", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_hilbert.html#pytimetk.augment_hilbert", "dispname": "-"}, {"name": "pytimetk.feature_engineering.hilbert.augment_hilbert", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_hilbert.html#pytimetk.augment_hilbert", "dispname": "pytimetk.augment_hilbert"}, {"name": "pytimetk.augment_wavelet", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_wavelet.html#pytimetk.augment_wavelet", "dispname": "-"}, {"name": "pytimetk.feature_engineering.wavelet.augment_wavelet", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_wavelet.html#pytimetk.augment_wavelet", "dispname": "pytimetk.augment_wavelet"}, {"name": "pytimetk.ts_features", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ts_features.html#pytimetk.ts_features", "dispname": "-"}, {"name": "pytimetk.core.ts_features.ts_features", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ts_features.html#pytimetk.ts_features", "dispname": "pytimetk.ts_features"}, {"name": "pytimetk.ts_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ts_summary.html#pytimetk.ts_summary", "dispname": "-"}, {"name": "pytimetk.core.ts_summary.ts_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ts_summary.html#pytimetk.ts_summary", "dispname": "pytimetk.ts_summary"}, {"name": "pytimetk.TimeSeriesCV.glimpse", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.glimpse", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCV.glimpse", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.glimpse", "dispname": "pytimetk.TimeSeriesCV.glimpse"}, {"name": "pytimetk.TimeSeriesCV.plot", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.plot", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCV.plot", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.plot", "dispname": "pytimetk.TimeSeriesCV.plot"}, {"name": "pytimetk.TimeSeriesCV.split", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.split", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCV.split", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV.split", "dispname": "pytimetk.TimeSeriesCV.split"}, {"name": "pytimetk.TimeSeriesCV", "domain": "py", "role": "class", "priority": "1", "uri": 
"reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCV", "domain": "py", "role": "class", "priority": "1", "uri": "reference/TimeSeriesCV.html#pytimetk.TimeSeriesCV", "dispname": "pytimetk.TimeSeriesCV"}, {"name": "pytimetk.TimeSeriesCVSplitter.get_n_splits", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCVSplitter.html#pytimetk.TimeSeriesCVSplitter.get_n_splits", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCVSplitter.get_n_splits", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCVSplitter.html#pytimetk.TimeSeriesCVSplitter.get_n_splits", "dispname": "pytimetk.TimeSeriesCVSplitter.get_n_splits"}, {"name": "pytimetk.TimeSeriesCVSplitter.split", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCVSplitter.html#pytimetk.TimeSeriesCVSplitter.split", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCVSplitter.split", "domain": "py", "role": "function", "priority": "1", "uri": "reference/TimeSeriesCVSplitter.html#pytimetk.TimeSeriesCVSplitter.split", "dispname": "pytimetk.TimeSeriesCVSplitter.split"}, {"name": "pytimetk.TimeSeriesCVSplitter", "domain": "py", "role": "class", "priority": "1", "uri": "reference/TimeSeriesCVSplitter.html#pytimetk.TimeSeriesCVSplitter", "dispname": "-"}, {"name": "pytimetk.crossvalidation.time_series_cv.TimeSeriesCVSplitter", "domain": "py", "role": "class", "priority": "1", "uri": "reference/TimeSeriesCVSplitter.html#pytimetk.TimeSeriesCVSplitter", "dispname": "pytimetk.TimeSeriesCVSplitter"}, {"name": "pytimetk.augment_macd", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_macd.html#pytimetk.augment_macd", "dispname": "-"}, {"name": "pytimetk.finance.macd.augment_macd", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_macd.html#pytimetk.augment_macd", "dispname": "pytimetk.augment_macd"}, {"name": "pytimetk.augment_ppo", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_ppo.html#pytimetk.augment_ppo", "dispname": "-"}, {"name": "pytimetk.finance.ppo.augment_ppo", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_ppo.html#pytimetk.augment_ppo", "dispname": "pytimetk.augment_ppo"}, {"name": "pytimetk.augment_rsi", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rsi.html#pytimetk.augment_rsi", "dispname": "-"}, {"name": "pytimetk.finance.rsi.augment_rsi", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_rsi.html#pytimetk.augment_rsi", "dispname": "pytimetk.augment_rsi"}, {"name": "pytimetk.augment_cmo", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_cmo.html#pytimetk.augment_cmo", "dispname": "-"}, {"name": "pytimetk.finance.cmo.augment_cmo", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_cmo.html#pytimetk.augment_cmo", "dispname": "pytimetk.augment_cmo"}, {"name": "pytimetk.augment_roc", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_roc.html#pytimetk.augment_roc", "dispname": "-"}, {"name": "pytimetk.finance.roc.augment_roc", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_roc.html#pytimetk.augment_roc", "dispname": "pytimetk.augment_roc"}, {"name": "pytimetk.augment_qsmomentum", "domain": "py", "role": "function", "priority": 
"1", "uri": "reference/augment_qsmomentum.html#pytimetk.augment_qsmomentum", "dispname": "-"}, {"name": "pytimetk.finance.qsmomentum.augment_qsmomentum", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_qsmomentum.html#pytimetk.augment_qsmomentum", "dispname": "pytimetk.augment_qsmomentum"}, {"name": "pytimetk.augment_bbands", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_bbands.html#pytimetk.augment_bbands", "dispname": "-"}, {"name": "pytimetk.finance.bbands.augment_bbands", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_bbands.html#pytimetk.augment_bbands", "dispname": "pytimetk.augment_bbands"}, {"name": "pytimetk.augment_atr", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_atr.html#pytimetk.augment_atr", "dispname": "-"}, {"name": "pytimetk.finance.atr.augment_atr", "domain": "py", "role": "function", "priority": "1", "uri": "reference/augment_atr.html#pytimetk.augment_atr", "dispname": "pytimetk.augment_atr"}, {"name": "pytimetk.make_future_timeseries", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_future_timeseries.html#pytimetk.make_future_timeseries", "dispname": "-"}, {"name": "pytimetk.core.make_future_timeseries.make_future_timeseries", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_future_timeseries.html#pytimetk.make_future_timeseries", "dispname": "pytimetk.make_future_timeseries"}, {"name": "pytimetk.make_weekday_sequence", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_weekday_sequence.html#pytimetk.make_weekday_sequence", "dispname": "-"}, {"name": "pytimetk.core.make_timeseries_sequence.make_weekday_sequence", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_weekday_sequence.html#pytimetk.make_weekday_sequence", "dispname": "pytimetk.make_weekday_sequence"}, {"name": "pytimetk.make_weekend_sequence", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_weekend_sequence.html#pytimetk.make_weekend_sequence", "dispname": "-"}, {"name": "pytimetk.core.make_timeseries_sequence.make_weekend_sequence", "domain": "py", "role": "function", "priority": "1", "uri": "reference/make_weekend_sequence.html#pytimetk.make_weekend_sequence", "dispname": "pytimetk.make_weekend_sequence"}, {"name": "pytimetk.get_date_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_date_summary.html#pytimetk.get_date_summary", "dispname": "-"}, {"name": "pytimetk.core.ts_summary.get_date_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_date_summary.html#pytimetk.get_date_summary", "dispname": "pytimetk.get_date_summary"}, {"name": "pytimetk.get_frequency_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_frequency_summary.html#pytimetk.get_frequency_summary", "dispname": "-"}, {"name": "pytimetk.core.frequency.get_frequency_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_frequency_summary.html#pytimetk.get_frequency_summary", "dispname": "pytimetk.get_frequency_summary"}, {"name": "pytimetk.get_diff_summary", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_diff_summary.html#pytimetk.get_diff_summary", "dispname": "-"}, {"name": "pytimetk.core.ts_summary.get_diff_summary", "domain": "py", "role": "function", "priority": "1", "uri": 
"reference/get_diff_summary.html#pytimetk.get_diff_summary", "dispname": "pytimetk.get_diff_summary"}, {"name": "pytimetk.get_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_frequency.html#pytimetk.get_frequency", "dispname": "-"}, {"name": "pytimetk.core.frequency.get_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_frequency.html#pytimetk.get_frequency", "dispname": "pytimetk.get_frequency"}, {"name": "pytimetk.get_seasonal_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_seasonal_frequency.html#pytimetk.get_seasonal_frequency", "dispname": "-"}, {"name": "pytimetk.core.frequency.get_seasonal_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_seasonal_frequency.html#pytimetk.get_seasonal_frequency", "dispname": "pytimetk.get_seasonal_frequency"}, {"name": "pytimetk.get_trend_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_trend_frequency.html#pytimetk.get_trend_frequency", "dispname": "-"}, {"name": "pytimetk.core.frequency.get_trend_frequency", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_trend_frequency.html#pytimetk.get_trend_frequency", "dispname": "pytimetk.get_trend_frequency"}, {"name": "pytimetk.get_timeseries_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_timeseries_signature.html#pytimetk.get_timeseries_signature", "dispname": "-"}, {"name": "pytimetk.feature_engineering.timeseries_signature.get_timeseries_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_timeseries_signature.html#pytimetk.get_timeseries_signature", "dispname": "pytimetk.get_timeseries_signature"}, {"name": "pytimetk.get_holiday_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_holiday_signature.html#pytimetk.get_holiday_signature", "dispname": "-"}, {"name": "pytimetk.feature_engineering.holiday_signature.get_holiday_signature", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_holiday_signature.html#pytimetk.get_holiday_signature", "dispname": "pytimetk.get_holiday_signature"}, {"name": "pytimetk.floor_date", "domain": "py", "role": "function", "priority": "1", "uri": "reference/floor_date.html#pytimetk.floor_date", "dispname": "-"}, {"name": "pytimetk.utils.datetime_helpers.floor_date", "domain": "py", "role": "function", "priority": "1", "uri": "reference/floor_date.html#pytimetk.floor_date", "dispname": "pytimetk.floor_date"}, {"name": "pytimetk.ceil_date", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ceil_date.html#pytimetk.ceil_date", "dispname": "-"}, {"name": "pytimetk.utils.datetime_helpers.ceil_date", "domain": "py", "role": "function", "priority": "1", "uri": "reference/ceil_date.html#pytimetk.ceil_date", "dispname": "pytimetk.ceil_date"}, {"name": "pytimetk.is_holiday", "domain": "py", "role": "function", "priority": "1", "uri": "reference/is_holiday.html#pytimetk.is_holiday", "dispname": "-"}, {"name": "pytimetk.utils.datetime_helpers.is_holiday", "domain": "py", "role": "function", "priority": "1", "uri": "reference/is_holiday.html#pytimetk.is_holiday", "dispname": "pytimetk.is_holiday"}, {"name": "pytimetk.week_of_month", "domain": "py", "role": "function", "priority": "1", "uri": "reference/week_of_month.html#pytimetk.week_of_month", "dispname": "-"}, {"name": "pytimetk.utils.datetime_helpers.week_of_month", 
"domain": "py", "role": "function", "priority": "1", "uri": "reference/week_of_month.html#pytimetk.week_of_month", "dispname": "pytimetk.week_of_month"}, {"name": "pytimetk.timeseries_unit_frequency_table", "domain": "py", "role": "function", "priority": "1", "uri": "reference/timeseries_unit_frequency_table.html#pytimetk.timeseries_unit_frequency_table", "dispname": "-"}, {"name": "pytimetk.core.frequency.timeseries_unit_frequency_table", "domain": "py", "role": "function", "priority": "1", "uri": "reference/timeseries_unit_frequency_table.html#pytimetk.timeseries_unit_frequency_table", "dispname": "pytimetk.timeseries_unit_frequency_table"}, {"name": "pytimetk.time_scale_template", "domain": "py", "role": "function", "priority": "1", "uri": "reference/time_scale_template.html#pytimetk.time_scale_template", "dispname": "-"}, {"name": "pytimetk.core.frequency.time_scale_template", "domain": "py", "role": "function", "priority": "1", "uri": "reference/time_scale_template.html#pytimetk.time_scale_template", "dispname": "pytimetk.time_scale_template"}, {"name": "pytimetk.theme_timetk", "domain": "py", "role": "function", "priority": "1", "uri": "reference/theme_timetk.html#pytimetk.theme_timetk", "dispname": "-"}, {"name": "pytimetk.plot.theme.theme_timetk", "domain": "py", "role": "function", "priority": "1", "uri": "reference/theme_timetk.html#pytimetk.theme_timetk", "dispname": "pytimetk.theme_timetk"}, {"name": "pytimetk.palette_timetk", "domain": "py", "role": "function", "priority": "1", "uri": "reference/palette_timetk.html#pytimetk.palette_timetk", "dispname": "-"}, {"name": "pytimetk.plot.theme.palette_timetk", "domain": "py", "role": "function", "priority": "1", "uri": "reference/palette_timetk.html#pytimetk.palette_timetk", "dispname": "pytimetk.palette_timetk"}, {"name": "pytimetk.glimpse", "domain": "py", "role": "function", "priority": "1", "uri": "reference/glimpse.html#pytimetk.glimpse", "dispname": "-"}, {"name": "pytimetk.utils.pandas_helpers.glimpse", "domain": "py", "role": "function", "priority": "1", "uri": "reference/glimpse.html#pytimetk.glimpse", "dispname": "pytimetk.glimpse"}, {"name": "pytimetk.parallel_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/parallel_apply.html#pytimetk.parallel_apply", "dispname": "-"}, {"name": "pytimetk.utils.parallel_helpers.parallel_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/parallel_apply.html#pytimetk.parallel_apply", "dispname": "pytimetk.parallel_apply"}, {"name": "pytimetk.progress_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/progress_apply.html#pytimetk.progress_apply", "dispname": "-"}, {"name": "pytimetk.utils.parallel_helpers.progress_apply", "domain": "py", "role": "function", "priority": "1", "uri": "reference/progress_apply.html#pytimetk.progress_apply", "dispname": "pytimetk.progress_apply"}, {"name": "pytimetk.drop_zero_variance", "domain": "py", "role": "function", "priority": "1", "uri": "reference/drop_zero_variance.html#pytimetk.drop_zero_variance", "dispname": "-"}, {"name": "pytimetk.utils.pandas_helpers.drop_zero_variance", "domain": "py", "role": "function", "priority": "1", "uri": "reference/drop_zero_variance.html#pytimetk.drop_zero_variance", "dispname": "pytimetk.drop_zero_variance"}, {"name": "pytimetk.transform_columns", "domain": "py", "role": "function", "priority": "1", "uri": "reference/transform_columns.html#pytimetk.transform_columns", "dispname": "-"}, {"name": 
"pytimetk.utils.pandas_helpers.transform_columns", "domain": "py", "role": "function", "priority": "1", "uri": "reference/transform_columns.html#pytimetk.transform_columns", "dispname": "pytimetk.transform_columns"}, {"name": "pytimetk.flatten_multiindex_column_names", "domain": "py", "role": "function", "priority": "1", "uri": "reference/flatten_multiindex_column_names.html#pytimetk.flatten_multiindex_column_names", "dispname": "-"}, {"name": "pytimetk.utils.pandas_helpers.flatten_multiindex_column_names", "domain": "py", "role": "function", "priority": "1", "uri": "reference/flatten_multiindex_column_names.html#pytimetk.flatten_multiindex_column_names", "dispname": "pytimetk.flatten_multiindex_column_names"}, {"name": "pytimetk.get_available_datasets", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_available_datasets.html#pytimetk.get_available_datasets", "dispname": "-"}, {"name": "pytimetk.datasets.get_datasets.get_available_datasets", "domain": "py", "role": "function", "priority": "1", "uri": "reference/get_available_datasets.html#pytimetk.get_available_datasets", "dispname": "pytimetk.get_available_datasets"}, {"name": "pytimetk.load_dataset", "domain": "py", "role": "function", "priority": "1", "uri": "reference/load_dataset.html#pytimetk.load_dataset", "dispname": "-"}, {"name": "pytimetk.datasets.get_datasets.load_dataset", "domain": "py", "role": "function", "priority": "1", "uri": "reference/load_dataset.html#pytimetk.load_dataset", "dispname": "pytimetk.load_dataset"}]} \ No newline at end of file diff --git a/docs/reference/TimeSeriesCV.qmd b/docs/reference/TimeSeriesCV.qmd index 02417aca..6c71ef8f 100644 --- a/docs/reference/TimeSeriesCV.qmd +++ b/docs/reference/TimeSeriesCV.qmd @@ -21,14 +21,16 @@ and an optional `split_limit` to return the first `n` slices of time series cros ## Raises: ValueError: - - If `frequency` is not one of "days", "seconds", "microseconds", "milliseconds", "minutes", "hours", - "weeks". - - If `window` is not one of "rolling" or "expanding". - - If `mode` is not one of "forward" or "backward" - - If `train_size`, `forecast_horizon`, `gap` or `stride` are not strictly positive. + +- If `frequency` is not one of "days", "seconds", "microseconds", "milliseconds", "minutes", "hours", +"weeks". +- If `window` is not one of "rolling" or "expanding". +- If `mode` is not one of "forward" or "backward" +- If `train_size`, `forecast_horizon`, `gap` or `stride` are not strictly positive. TypeError: - If `train_size`, `forecast_horizon`, `gap` or `stride` are not of type `int`. + +If `train_size`, `forecast_horizon`, `gap` or `stride` are not of type `int`. ## Examples: diff --git a/docs/reference/TimeSeriesCVSplitter.qmd b/docs/reference/TimeSeriesCVSplitter.qmd new file mode 100644 index 00000000..6e2b05ac --- /dev/null +++ b/docs/reference/TimeSeriesCVSplitter.qmd @@ -0,0 +1,137 @@ +# TimeSeriesCVSplitter { #pytimetk.TimeSeriesCVSplitter } + +`TimeSeriesCVSplitter(self, *, frequency, train_size, forecast_horizon, time_series, gap=0, stride=None, window='rolling', mode='backward', start_dt=None, end_dt=None, split_limit=None)` + +The `TimeSeriesCVSplitter` is a scikit-learn compatible cross-validator using `TimeSeriesCV`. + +This cross-validator generates splits based on time values, making it suitable for time series data. + +## Parameters: + +frequency: str + The frequency of the time series (e.g., "days", "hours"). +train_size: int + Minimum number of time units in the training set. 
diff --git a/docs/reference/TimeSeriesCVSplitter.qmd b/docs/reference/TimeSeriesCVSplitter.qmd
new file mode 100644
index 00000000..6e2b05ac
--- /dev/null
+++ b/docs/reference/TimeSeriesCVSplitter.qmd
@@ -0,0 +1,137 @@
+# TimeSeriesCVSplitter { #pytimetk.TimeSeriesCVSplitter }
+
+`TimeSeriesCVSplitter(self, *, frequency, train_size, forecast_horizon, time_series, gap=0, stride=None, window='rolling', mode='backward', start_dt=None, end_dt=None, split_limit=None)`
+
+The `TimeSeriesCVSplitter` is a scikit-learn compatible cross-validator using `TimeSeriesCV`.
+
+This cross-validator generates splits based on time values, making it suitable for time series data.
+
+## Parameters:
+
+frequency: str
+    The frequency of the time series (e.g., "days", "hours").
+train_size: int
+    Minimum number of time units in the training set.
+forecast_horizon: int
+    Number of time units to forecast in each split.
+time_series: pd.Series
+    A pandas Series or Index representing the time values.
+gap: int
+    Number of time units to skip between training and testing sets.
+stride: int
+    Number of time units to move forward after each split.
+window: str
+    Type of window, either "rolling" or "expanding".
+mode: str
+    Order of split generation, "forward" or "backward".
+start_dt: pd.Timestamp
+    Start date for the time period.
+end_dt: pd.Timestamp
+    End date for the time period.
+split_limit: int
+    Maximum number of splits to generate. If None, all possible splits will be generated.
+
+## Raises:
+
+ValueError:
+    If the input arrays are incompatible in length with the time series.
+
+## Returns:
+
+A generator of tuples of arrays containing the training and forecast data.
+
+## See Also:
+
+TimeSeriesCV
+
+## Examples
+
+``` {python}
+import pandas as pd
+import numpy as np
+
+from pytimetk import TimeSeriesCVSplitter
+
+start_dt = pd.Timestamp(2023, 1, 1)
+end_dt = pd.Timestamp(2023, 1, 31)
+
+time_series = pd.Series(pd.date_range(start_dt, end_dt, freq="D"))
+size = len(time_series)
+
+df = pd.DataFrame(data=np.random.randn(size, 2), columns=["a", "b"])
+
+X, y = df[["a", "b"]], df[["a", "b"]].sum(axis=1)
+
+cv = TimeSeriesCVSplitter(
+    time_series=time_series,
+    frequency="days",
+    train_size=14,
+    forecast_horizon=7,
+    gap=0,
+    stride=1,
+    window="rolling",
+)
+
+cv
+```
+
+``` {python}
+# Inspect the cross-validation splits
+cv.splitter.plot(y, time_series = time_series)
+```
+
+``` {python}
+# Using the TimeSeriesCVSplitter in a scikit-learn CV model
+
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import RandomizedSearchCV
+
+# Fit and get best estimator
+param_grid = {
+    "alpha": np.linspace(0.1, 2, 10),
+    "fit_intercept": [True, False],
+    "positive": [True, False],
+}
+
+random_search_cv = RandomizedSearchCV(
+    estimator=Ridge(),
+    param_distributions=param_grid,
+    cv=cv,
+    n_jobs=-1,
+).fit(X, y)
+
+random_search_cv.best_estimator_
+```
+
+## Methods
+
+| Name | Description |
+| --- | --- |
+| [get_n_splits](#pytimetk.TimeSeriesCVSplitter.get_n_splits) | Returns the number of splits. |
+| [split](#pytimetk.TimeSeriesCVSplitter.split) | Generates train and test indices for cross-validation. |
+
+### get_n_splits { #pytimetk.TimeSeriesCVSplitter.get_n_splits }
+
+`TimeSeriesCVSplitter.get_n_splits(X=None, y=None, groups=None)`
+
+Returns the number of splits.
+
+### split { #pytimetk.TimeSeriesCVSplitter.split }
+
+`TimeSeriesCVSplitter.split(X=None, y=None, groups=None)`
+
+Generates train and test indices for cross-validation.
+
+#### Parameters:
+
+X:
+    Optional input features (ignored, for compatibility with scikit-learn).
+y:
+    Optional target variable (ignored, for compatibility with scikit-learn).
+groups:
+    Optional group labels (ignored, for compatibility with scikit-learn).
+
+#### Yields:
+
+Tuple[np.ndarray, np.ndarray]:
+    Tuples of train and test indices.
\ No newline at end of file
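Because `TimeSeriesCVSplitter` implements the scikit-learn splitter interface, it also works with `cross_val_score` and related helpers. A minimal sketch, reusing the `X`, `y`, and `cv` objects built in the example blocks above:

```python
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Number of time-based folds implied by train_size, forecast_horizon, and stride
print(cv.get_n_splits())

# Score a simple model on each fold; values closer to zero are better for this metric
scores = cross_val_score(Ridge(), X, y, cv=cv, scoring="neg_mean_absolute_error")
print(scores)
```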
diff --git a/docs/reference/index.qmd b/docs/reference/index.qmd
index beb6bd22..954f2d4a 100644
--- a/docs/reference/index.qmd
+++ b/docs/reference/index.qmd
@@ -78,6 +78,7 @@ Time series cross validation.
 | | |
 | --- | --- |
 | [TimeSeriesCV](TimeSeriesCV.qmd#pytimetk.TimeSeriesCV) | `TimeSeriesCV` is a subclass of `TimeBasedSplit` with default mode set to 'backward' |
+| [TimeSeriesCVSplitter](TimeSeriesCVSplitter.qmd#pytimetk.TimeSeriesCVSplitter) | The `TimeSeriesCVSplitter` is a scikit-learn compatible cross-validator using `TimeSeriesCV`. |
 
 ## 💹 Finance Module (Momentum Indicators)
 
diff --git a/src/pytimetk/crossvalidation/time_series_cv.py b/src/pytimetk/crossvalidation/time_series_cv.py
index 5f5ed77b..46065473 100644
--- a/src/pytimetk/crossvalidation/time_series_cv.py
+++ b/src/pytimetk/crossvalidation/time_series_cv.py
@@ -475,6 +475,8 @@ class TimeSeriesCVSplitter(BaseCrossValidator):
         Start date for the time period.
     end_dt: pd.Timestamp
         End date for the time period.
+    split_limit: int
+        Maximum number of splits to generate. If None, all possible splits will be generated.
 
     Raises:
     -------
@@ -510,8 +512,8 @@
     cv = TimeSeriesCVSplitter(
         time_series=time_series,
         frequency="days",
-        train_size=7,
-        forecast_horizon=11,
+        train_size=14,
+        forecast_horizon=7,
         gap=0,
         stride=1,
         window="rolling",
@@ -520,7 +522,12 @@
     cv
     ```
 
-    ``` python
+    ``` {python}
+    # Inspect the cross-validation splits
+    cv.splitter.plot(y, time_series = time_series)
+    ```
+
+    ``` {python}
     # Using the TimeSeriesCVSplitter in a scikit-learn CV model
     from sklearn.linear_model import Ridge
     from sklearn.model_selection import RandomizedSearchCV
@@ -557,6 +564,7 @@
         mode: str = "backward",
         start_dt: pd.Timestamp = None,
         end_dt: pd.Timestamp = None,
+        split_limit: int = None,
     ):
         self.splitter = TimeSeriesCV(
             frequency=frequency,
@@ -566,6 +574,7 @@
             stride=stride,
             window=window,
             mode=mode,
+            split_limit=split_limit
         )
         self.time_series_ = time_series
         self.start_dt_ = start_dt
@@ -591,6 +600,8 @@
             Optional group labels (ignored, for compatibility with scikit-learn).
 
         Yields:
+        -------
+        Tuple[np.ndarray, np.ndarray]:
             Tuples of train and test indices.
         """
         self._validate_split_args(self.size_, X, y, groups)
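The `Yields` docstring added above describes index arrays rather than data slices; consuming them manually makes the fold boundaries easy to audit. A minimal sketch, again assuming the `cv` and `X` objects from the docstring example:

```python
# Each yielded pair is (train_indices, test_indices) as integer numpy arrays
for fold, (train_idx, test_idx) in enumerate(cv.split(X)):
    print(
        f"fold {fold}: "
        f"{len(train_idx)} train rows ({train_idx.min()}..{train_idx.max()}), "
        f"{len(test_idx)} test rows ({test_idx.min()}..{test_idx.max()})"
    )
```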
@@ -639,76 +650,5 @@ def _validate_split_args(
 
 
 
 
 
-
-
-
-
-
-
-# class TimeSeriesCV:
-#     """Generates tuples of train_idx, test_idx pairs
-#     Assumes the MultiIndex contains levels 'symbol' and 'date'
-#     purges overlapping outcomes. Includes a shift for each test set."""
-
-#     def __init__(
-#         self,
-#         n_splits=3,
-#         train_period_length=126,
-#         test_period_length=21,
-#         lookahead=None,
-#         shift_length=0,  # New parameter to specify the shift length
-#         date_idx='date',
-#         shuffle=False,
-#         seed=None,
-#     ):
-#         self.n_splits = n_splits
-#         self.lookahead = lookahead
-#         self.test_length = test_period_length
-#         self.train_length = train_period_length
-#         self.shift_length = shift_length  # Store the shift length
-#         self.shuffle = shuffle
-#         self.seed = seed
-#         self.date_idx = date_idx
-
-#     def split(self, X, y=None, groups=None):
-#         unique_dates = X.index.get_level_values(self.date_idx).unique()
-#         days = sorted(unique_dates, reverse=True)
-
-#         splits = []
-#         for i in range(self.n_splits):
-#             # Adjust the end index for the test set to include the shift for subsequent splits
-#             test_end_idx = i * self.test_length + i * self.shift_length
-#             test_start_idx = test_end_idx + self.test_length
-#             train_end_idx = test_start_idx + self.lookahead - 1
-#             train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
-
-#             if train_start_idx >= len(days):
-#                 break  # Break if the start index goes beyond the available data
-
-#             dates = X.reset_index()[[self.date_idx]]
-#             train_idx = dates[(dates[self.date_idx] > days[min(train_start_idx, len(days)-1)])
-#                               & (dates[self.date_idx] <= days[min(train_end_idx, len(days)-1)])].index
-#             test_idx = dates[(dates[self.date_idx] > days[min(test_start_idx, len(days)-1)])
-#                              & (dates[self.date_idx] <= days[min(test_end_idx, len(days)-1)])].index
-
-#             if self.shuffle:
-#                 if self.seed is not None:
-#                     np.random.seed(self.seed)
-
-#                 train_idx_list = list(train_idx)
-#                 np.random.shuffle(train_idx_list)
-#                 train_idx = np.array(train_idx_list)
-#             else:
-#                 train_idx = train_idx.to_numpy()
-
-#             test_idx = test_idx.to_numpy()
-
-#             splits.append((train_idx, test_idx))
-
-#         return splits
-
-#     def get_n_splits(self, X=None, y=None, groups=None):
-#         """Adjusts the number of splits if there's not enough data for the desired configuration."""
-#         return self.n_splits
\ No newline at end of file
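Finally, the new `split_limit` argument threaded through to `TimeSeriesCV` caps how many slices are produced. A minimal, self-contained sketch of the intended effect (the exact fold counts depend on the date range chosen here):

```python
import pandas as pd
from pytimetk import TimeSeriesCVSplitter

# Three months of daily timestamps to split over
time_series = pd.Series(pd.date_range("2023-01-01", "2023-03-31", freq="D"))

common = dict(
    time_series=time_series,
    frequency="days",
    train_size=14,
    forecast_horizon=7,
    gap=0,
    stride=1,
    window="rolling",
)

cv_all = TimeSeriesCVSplitter(**common)
cv_capped = TimeSeriesCVSplitter(**common, split_limit=5)

# X/y are optional for split(), so the folds can be counted directly
n_all = sum(1 for _ in cv_all.split())
n_capped = sum(1 for _ in cv_capped.split())
print(n_all, n_capped)  # the capped splitter should yield at most 5 folds
```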