diff --git a/.Rbuildignore b/.Rbuildignore index 1aa19d3a..1d244201 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,8 @@ ^docs$ ^pkgdown$ ^\.github$ +CODE_OF_CONDUCT.md +SECURITY.md +SUPPORT.md +cran-comments.md +NEWS.md \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 51a2f60c..077c6bb6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,11 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.0.0.9000 +Version: 0.1.0 Authors@R: c(person(given = "Mike", family = "Tokic", role = c("aut", "cre"), - email = "mitokic@microsoft.com", + email = "mftokic@gmail.com", comment = c(ORCID = "0000-0002-7630-7055")), person(given = "Aadharsh", family = "Kannan", @@ -34,15 +34,17 @@ Imports: doParallel, dplyr, earth, + foreach, generics, glmnet, gtools, hts, kernlab, lightgbm, + lubridate, magrittr, matrixcalc, - methods, + methods, modeltime.ensemble, modeltime.gluonts, modeltime.resample, @@ -74,8 +76,6 @@ Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 Depends: - R (>= 3.6.0), - lubridate, - foreach, + R (>= 3.6.0), modeltime VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index d2007865..46b1f09a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export("%>%") export(arima) export(arima_boost) export(croston) @@ -26,6 +25,10 @@ export(tabnet) export(tbats) export(theta) export(xgboost) +import(modeltime) +importFrom(foreach,"%do%") +importFrom(foreach,"%dopar%") +importFrom(lubridate,"%m+%") importFrom(magrittr,"%>%") importFrom(methods,formalArgs) importFrom(stats,sd) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 00000000..8b530d86 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,4 @@ + +# finnts 0.1.0 + +* Initial CRAN Release \ No newline at end of file diff --git a/R/azure_batch_parallel.R b/R/azure_batch_parallel.R index 43a55704..b6315c85 100644 --- a/R/azure_batch_parallel.R +++ b/R/azure_batch_parallel.R @@ -22,7 +22,7 @@ 
get_fcast_parallel_azure <- function(combo_list, cli::cli_h2("Submitting Tasks to Azure Batch") - fcst <- foreach(i = combo_list, .combine = 'rbind', + fcst <- foreach::foreach(i = combo_list, .combine = 'rbind', .packages = get_export_packages(), .export = get_transfer_functions(), .options.azure = list(maxTaskRetryCount = 0, diff --git a/R/forecast_models.R b/R/forecast_models.R index 465a1b1d..84cf35d4 100644 --- a/R/forecast_models.R +++ b/R/forecast_models.R @@ -332,7 +332,7 @@ construct_forecast_models <- function(full_data_tbl, combined_models_recipe_2 <- modeltime::modeltime_table() # parallel processing - if(run_model_parallel==TRUE & sum(parallel_processing!="local_machine") == 1) { + if(run_model_parallel == TRUE & sum(parallel_processing == "local_machine") == 0) { parallel_args <- init_parallel_within(parallel_processing, num_cores) } @@ -390,7 +390,7 @@ construct_forecast_models <- function(full_data_tbl, try(combined_models_recipe_1 <- modeltime::add_modeltime_model(combined_models_recipe_1, mdl_called, location = "top") %>% - update_model_description(1, model_name), + modeltime::update_model_description(1, model_name), silent = TRUE) }else{ @@ -422,7 +422,7 @@ construct_forecast_models <- function(full_data_tbl, try(combined_models_recipe_1 <- modeltime::add_modeltime_model(combined_models_recipe_1, mdl_called, location = "top") %>% - update_model_description(1, add_name), + modeltime::update_model_description(1, add_name), silent = TRUE) } @@ -446,7 +446,7 @@ construct_forecast_models <- function(full_data_tbl, try(combined_models_recipe_2 <- modeltime::add_modeltime_model(combined_models_recipe_2, mdl_called, location = "top") %>% - update_model_description(1, add_name), + modeltime::update_model_description(1, add_name), silent = TRUE) } @@ -651,7 +651,7 @@ construct_forecast_models <- function(full_data_tbl, try(combined_ensemble_models <- modeltime::add_modeltime_model(combined_ensemble_models, mdl_ensemble, location = "top") %>% - 
update_model_description(1, add_name), + modeltime::update_model_description(1, add_name), silent = TRUE) } diff --git a/R/forecast_time_series.R b/R/forecast_time_series.R index fe5667bd..7b0f6361 100644 --- a/R/forecast_time_series.R +++ b/R/forecast_time_series.R @@ -19,10 +19,10 @@ #' that specified date. Default of NULL is to not remove any time series and attempt to forecast all of them. #' @param fiscal_year_start Month number of start of fiscal year of input data, aids in building out date features. #' Formatted as a numeric value. Default of 1 assumes fiscal year starts in January. -#' @param clean_missing_values Should missing values be inputted? Only inputes values for missing data within an +#' @param clean_missing_values If TRUE, cleans missing values. Only imputes values for missing data within an #' existing series, and does not add new values onto the beginning or end, but does provide a value of 0 for said #' values. -#' @param clean_outliers Should outliers be cleaned and inputted with values more in line with historical data? +#' @param clean_outliers If TRUE, outliers are cleaned and imputed with values more in line with historical data. #' @param back_test_scenarios Number of specific back test folds to run when determining the best model. #' Default of NULL will automatically choose the number of back tests to run based on historical data size, #' which tries to always use a minimum of 80% of the data when training a model. @@ -36,16 +36,16 @@ #' @param parallel_processing Default of NULL runs no parallel processing and forecasts each individual time series #' one after another. 'local_machine' leverages all cores on current machine Finn is running on. 'azure_batch' #' runs time series in parallel on a remote compute cluster in Azure Batch. 
-#' @param run_model_parallel Run model training in parallel, only works when parallel_processing is set to +#' @param run_model_parallel If TRUE, runs model training in parallel, only works when parallel_processing is set to #' 'local_machine' or 'azure_batch'. #' @param num_cores Number of cores to run when parallel processing is set up. Used when running parallel computations #' on local machine or within Azure. Default of NULL uses total amount of cores on machine minus one. Can't be greater #' than number of cores on machine minus 1. #' @param azure_batch_credentials Credentials to run parallel_processing in Azure Batch. #' @param azure_batch_cluster_config Compute cluster specification to run parallel_processing in Azure Batch. -#' @param azure_batch_cluster_delete Delete the Azure Batch compute cluster after Finn finished running. -#' @param target_log_transformation Log transform target variable before training models. -#' @param negative_fcst Allow forecasts to dip below zero. +#' @param azure_batch_cluster_delete If TRUE, deletes the Azure Batch compute cluster after Finn finished running. +#' @param target_log_transformation If TRUE, log transform target variable before training models. +#' @param negative_fcst If TRUE, allow forecasts to dip below zero. #' @param fourier_periods List of values to use in creating fourier series as features. Default of NULL automatically chooses #' these values based on the date_type. #' @param lag_periods List of values to use in creating lag features. Default of NULL automatically chooses these values @@ -55,7 +55,7 @@ #' @param recipes_to_run List of recipes to run on multivariate models that can run different recipes. A value of NULL runs #' all recipes, but only runs the R1 recipe for weekly and daily date types. A value of "all" runs all recipes, regardless #' of date type. A list like c("R1") or c("R2") would only run models with the R1 or R2 recipe. 
-#' @param pca Run principle component analysis on any lagged features to speed up model run time. Default of NULL runs +#' @param pca If TRUE, run principal component analysis on any lagged features to speed up model run time. Default of NULL runs #' PCA on day and week date types across all local multivariate models, and also for global models across all date types. #' @param reticulate_environment File path to python environment to use when training gluonts deep learning models. #' Only important when parallel_processing is not set to 'azure_batch'. Azure Batch should use its own docker image @@ -63,16 +63,17 @@ #' @param models_to_run List of models to run. Default of NULL runs all models. #' @param models_not_to_run List of models not to run, overrides values in models_to_run. Default of NULL doesn't turn off #' any model. -#' @param run_deep_learning Run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and +#' @param run_deep_learning If TRUE, run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and #' models_not_to_run. -#' @param run_global_models Run multivariate models on the entire data set (across all time series) as a global model. +#' @param run_global_models If TRUE, run multivariate models on the entire data set (across all time series) as a global model. #' Can be override by models_not_to_run. Default of NULL runs global models for all date types except week and day. -#' @param run_local_models Run models by individual time series as local models. -#' @param run_ensemble_models Run ensemble models -#' @param average_models Create simple averages of individual models. +#' @param run_local_models If TRUE, run models by individual time series as local models. +#' @param run_ensemble_models If TRUE, run ensemble models. Default of NULL runs ensemble models only for quarter and month +#' date types. +#' @param average_models If TRUE, create simple averages of individual models. 
#' @param max_model_average Max number of models to average together. Will create model averages for 2 models up until input value #' or max number of models ran. -#' @param weekly_to_daily Convert a week forecast down to day by evenly splitting across each day of week. Helps when aggregating +#' @param weekly_to_daily If TRUE, convert a week forecast down to day by evenly splitting across each day of week. Helps when aggregating #' up to higher temporal levels like month or quarter. #' #' @return A list of three separate data sets: the future forecast, the back test results, and the best model per time series. @@ -125,7 +126,7 @@ forecast_time_series <- function(input_data, run_deep_learning = FALSE, run_global_models = NULL, run_local_models = TRUE, - run_ensemble_models = TRUE, + run_ensemble_models = NULL, average_models = TRUE, max_model_average = 3, weekly_to_daily = TRUE @@ -195,12 +196,20 @@ forecast_time_series <- function(input_data, back_test_spacing <- get_back_test_spacing(back_test_spacing, date_type) - # * Yearly Forecast Adjustment ---- - if(date_type =="year") { + # * Ensemble Models Adjustment ---- + if(is.null(run_ensemble_models) & date_type %in% c("quarter", "month")) { + run_ensemble_models <- TRUE + } else if(is.null(run_ensemble_models) & date_type %in% c("week", "day")) { + run_ensemble_models <- FALSE + } else if(sum(run_ensemble_models == TRUE) == 1 & date_type %in% c("quarter", "month", "week", "day")) { + run_ensemble_models <- TRUE + } else if(sum(run_ensemble_models == TRUE) == 1 & date_type =="year") { run_ensemble_models = FALSE warning("ensemble models have been turned off for yearly forecasts") + } else { + run_ensemble_models = FALSE } - + # 4. 
Prep Data ---- cli::cli_h1("Prepping Data") @@ -354,17 +363,16 @@ forecast_time_series <- function(input_data, model_combinations$All <- model_combinations %>% tidyr::unite(All, colnames(model_combinations)) model_combinations <- model_combinations$All - #parallel processing if(run_model_parallel==TRUE & sum(parallel_processing == "local_machine") == 0) { - + cores <- get_cores(num_cores) cl <- parallel::makeCluster(cores) doParallel::registerDoParallel(cl) #point to the correct libraries within Azure Batch if(sum(parallel_processing=="azure_batch") == 1) { - clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) + parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) } combinations_tbl <- foreach::foreach(i = model_combinations[[1]], .combine = 'rbind', @@ -422,7 +430,7 @@ forecast_time_series <- function(input_data, cl <- parallel::makeCluster(cores) doParallel::registerDoParallel(cl) - combinations_tbl_final <- foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', + combinations_tbl_final <- foreach::foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', .packages = get_export_packages(), .export = c("fcst_prep", "get_cores")) %dopar% {create_model_averages(i)} @@ -434,7 +442,7 @@ forecast_time_series <- function(input_data, if(sum(parallel_processing=="azure_batch") == 1) { - combinations_tbl_final <- foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', + combinations_tbl_final <- foreach::foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', .packages = get_export_packages(), .export = c("fcst_prep", "get_cores"), .options.azure = list(maxTaskRetryCount = 0, autoDeleteJob = TRUE, @@ -445,7 +453,7 @@ forecast_time_series <- function(input_data, } if(sum(parallel_processing == 'azure_batch') == 1 & azure_batch_cluster_delete == TRUE) { - stopCluster(cluster) + parallel::stopCluster(cluster) } # combine with individual model data @@ 
-653,7 +661,8 @@ forecast_time_series <- function(input_data, dplyr::group_by(Combo, .id, Model) %>% dplyr::mutate(Horizon = dplyr::row_number()) %>% dplyr::ungroup() %>% - dplyr::select(Combo, .id, Date, Model, Horizon, FCST, Target, MAPE) %>% + dplyr::mutate(Best_Model = ifelse(Model == "Best-Model", "Yes", "No")) %>% + dplyr::select(Combo, .id, Date, Model, Horizon, FCST, Target, MAPE, Best_Model) %>% tidyr::separate(Combo, into = combo_variables, sep = '--', remove = FALSE) %>% dplyr::rename(Back_Test_Scenario = .id) diff --git a/R/general_parallel.R b/R/general_parallel.R index 65174b94..ea27573a 100644 --- a/R/general_parallel.R +++ b/R/general_parallel.R @@ -28,7 +28,7 @@ init_parallel_within <-function(type, num_cores){ doParallel::registerDoParallel(cl) #point to the correct libraries within Azure Batch - if(type == "azure_batch") { + if(sum(type == "azure_batch") == 1) { parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) } @@ -72,7 +72,7 @@ get_fcast_parallel<- function(combo_list, cli::cli_alert_info("Running across {cores} cores") - fcst <- foreach(i = combo_list, + fcst <- foreach::foreach(i = combo_list, .combine = 'rbind', .packages = get_export_packages(), .export = get_transfer_functions() diff --git a/R/utility.R b/R/utility.R index 6a57b066..c61a019d 100644 --- a/R/utility.R +++ b/R/utility.R @@ -1,6 +1,14 @@ +# define global variables to prevent notes in R CMD Check +utils::globalVariables(c(".id", ".key", ".model_desc", ".pred", ".resample_id", "All", "Best_Model", "Combo", + "Combo_Test_Date", "Combo_Total", "Count", "Date", "Date_Adj", "Date_Adj_half", + "Date_Adj_index.num", "Date_Adj_quarter", "Date_Adj_year", "Date_Day", "FCST", + "Horizon", "MAPE", "Model", "Number", "Number_Char", "Origin", "Residual", + "Residual_Std_Dev", "Rolling_MAPE", "Slice", "Sum", "Target", "Type", "Variable", + "cluster", "frequency", "gluon_ts_frequency", "hi.80", "hi.95", "i", "lo.80", "lo.95", + "weighted_MAPE", "where", "as2")) + #' 
@importFrom magrittr %>% -#' @export -magrittr::`%>%` +NULL #' @importFrom methods formalArgs NULL @@ -8,6 +16,15 @@ NULL #' @importFrom stats sd NULL +#' @importFrom foreach %do% %dopar% +NULL + +#' @importFrom lubridate %m+% +NULL + +#' @import modeltime +NULL + # * cbind.fill custom function ---- #create function to cbind dataframes that contain different amounts of rows #https://github.com/cvarrichio/rowr/blob/master/R/rowr.R diff --git a/_pkgdown.yml b/_pkgdown.yml index 42657634..59e0a626 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -8,3 +8,7 @@ authors: href: https://aadharshkannan.com/ Mike Tokic: href: https://www.linkedin.com/in/michaeltokic/ + +template: + params: + ganalytics: G-6X0DS5856B diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 00000000..4dac61b2 --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,19 @@ +## R CMD check results +There were no ERRORs or WARNINGs. + +There was 1 NOTE: + +* checking dependencies in R code ... NOTE + Imports includes 38 non-default packages. + Importing from so many packages makes the package vulnerable to any of + them becoming unavailable. Move as many as possible to Suggests and + use conditionally. + + This package does leverage many outside packages. The main feature of this package is + that it consolidates a lot of different models into one package to run them automatically. + So having many required packages is important to the package. + + Also, this is my first CRAN submission. 
+ +## Downstream dependencies +There are currently no downstream dependencies for this package. \ No newline at end of file diff --git a/man/forecast_time_series.Rd b/man/forecast_time_series.Rd index 08ae1498..a31bf6eb 100644 --- a/man/forecast_time_series.Rd +++ b/man/forecast_time_series.Rd @@ -41,7 +41,7 @@ forecast_time_series( run_deep_learning = FALSE, run_global_models = NULL, run_local_models = TRUE, - run_ensemble_models = TRUE, + run_ensemble_models = NULL, average_models = TRUE, max_model_average = 3, weekly_to_daily = TRUE @@ -76,11 +76,11 @@ that specified date. Default of NULL is to not remove any time series and attemp \item{fiscal_year_start}{Month number of start of fiscal year of input data, aids in building out date features. Formatted as a numeric value. Default of 1 assumes fiscal year starts in January.} -\item{clean_missing_values}{Should missing values be inputted? Only inputes values for missing data within an +\item{clean_missing_values}{If TRUE, cleans missing values. Only imputes values for missing data within an existing series, and does not add new values onto the beginning or end, but does provide a value of 0 for said values.} -\item{clean_outliers}{Should outliers be cleaned and inputted with values more in line with historical data?} +\item{clean_outliers}{If TRUE, outliers are cleaned and imputed with values more in line with historical data.} \item{back_test_scenarios}{Number of specific back test folds to run when determining the best model. Default of NULL will automatically choose the number of back tests to run based on historical data size, @@ -100,7 +100,7 @@ a more traditional hierarchical time series to forecast, both based on the hts p one after another. 'local_machine' leverages all cores on current machine Finn is running on. 
'azure_batch' runs time series in parallel on a remote compute cluster in Azure Batch.} -\item{run_model_parallel}{Run model training in parallel, only works when parallel_processing is set to +\item{run_model_parallel}{If TRUE, runs model training in parallel, only works when parallel_processing is set to 'local_machine' or 'azure_batch'.} \item{num_cores}{Number of cores to run when parallel processing is set up. Used when running parallel computations @@ -111,11 +111,11 @@ than number of cores on machine minus 1.} \item{azure_batch_cluster_config}{Compute cluster specification to run parallel_processing in Azure Batch.} -\item{azure_batch_cluster_delete}{Delete the Azure Batch compute cluster after Finn finished running.} +\item{azure_batch_cluster_delete}{If TRUE, deletes the Azure Batch compute cluster after Finn finished running.} -\item{target_log_transformation}{Log transform target variable before training models.} +\item{target_log_transformation}{If TRUE, log transform target variable before training models.} -\item{negative_fcst}{Allow forecasts to dip below zero.} +\item{negative_fcst}{If TRUE, allow forecasts to dip below zero.} \item{fourier_periods}{List of values to use in creating fourier series as features. Default of NULL automatically chooses these values based on the date_type.} @@ -130,7 +130,7 @@ chooses these values based on date type.} all recipes, but only runs the R1 recipe for weekly and daily date types. A value of "all" runs all recipes, regardless of date type. A list like c("R1") or c("R2") would only run models with the R1 or R2 recipe.} -\item{pca}{Run principle component analysis on any lagged features to speed up model run time. Default of NULL runs +\item{pca}{If TRUE, run principal component analysis on any lagged features to speed up model run time. 
Default of NULL runs PCA on day and week date types across all local multivariate models, and also for global models across all date types.} \item{reticulate_environment}{File path to python environment to use when training gluonts deep learning models. @@ -142,22 +142,23 @@ that has python environment already installed.} \item{models_not_to_run}{List of models not to run, overrides values in models_to_run. Default of NULL doesn't turn off any model.} -\item{run_deep_learning}{Run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and +\item{run_deep_learning}{If TRUE, run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and models_not_to_run.} -\item{run_global_models}{Run multivariate models on the entire data set (across all time series) as a global model. +\item{run_global_models}{If TRUE, run multivariate models on the entire data set (across all time series) as a global model. Can be override by models_not_to_run. Default of NULL runs global models for all date types except week and day.} -\item{run_local_models}{Run models by individual time series as local models.} +\item{run_local_models}{If TRUE, run models by individual time series as local models.} -\item{run_ensemble_models}{Run ensemble models} +\item{run_ensemble_models}{If TRUE, run ensemble models. Default of NULL runs ensemble models only for quarter and month +date types.} -\item{average_models}{Create simple averages of individual models.} +\item{average_models}{If TRUE, create simple averages of individual models.} \item{max_model_average}{Max number of models to average together. Will create model averages for 2 models up until input value or max number of models ran.} -\item{weekly_to_daily}{Convert a week forecast down to day by evenly splitting across each day of week. Helps when aggregating +\item{weekly_to_daily}{If TRUE, convert a week forecast down to day by evenly splitting across each day of week. 
Helps when aggregating up to higher temporal levels like month or quarter.} } \value{ diff --git a/man/reexports.Rd b/man/reexports.Rd deleted file mode 100644 index 788fc1ee..00000000 --- a/man/reexports.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utility.R -\docType{import} -\name{reexports} -\alias{reexports} -\alias{\%>\%} -\title{Objects exported from other packages} -\keyword{internal} -\description{ -These objects are imported from other packages. Follow the links -below to see their documentation. - -\describe{ - \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} -}} - diff --git a/tests/testthat/test-forecast_time_series.R b/tests/testthat/test-forecast_time_series.R index d6e1b94e..272ce970 100644 --- a/tests/testthat/test-forecast_time_series.R +++ b/tests/testthat/test-forecast_time_series.R @@ -14,7 +14,7 @@ forecast_horizon <- 3 target_variable <- "value" combo_variables <- c("id") models_to_run <- c("arima", "ets") -inp_data <- m750 %>% dplyr::rename(Date = date) %>% dplyr::mutate(id = as.character(id)) +inp_data <- modeltime::m750 %>% dplyr::rename(Date = date) %>% dplyr::mutate(id = as.character(id)) dt_type <- "month" finn_forecast <- forecast_time_series(