
final push before PR
mitokic committed Nov 18, 2021
1 parent 0c7a314 commit 9fb7c2e
Showing 14 changed files with 118 additions and 72 deletions.
5 changes: 5 additions & 0 deletions .Rbuildignore
@@ -5,3 +5,8 @@
^docs$
^pkgdown$
^\.github$
+ CODE_OF_CONDUCT.md
+ SECURITY.md
+ SUPPORT.md
+ cran-comments.md
+ NEWS.md
12 changes: 6 additions & 6 deletions DESCRIPTION
@@ -1,11 +1,11 @@
Package: finnts
Title: Microsoft Finance Time Series Forecasting Framework
- Version: 0.0.0.9000
+ Version: 0.1.0
Authors@R:
c(person(given = "Mike",
family = "Tokic",
role = c("aut", "cre"),
- email = "mitokic@microsoft.com",
+ email = "mftokic@gmail.com",
comment = c(ORCID = "0000-0002-7630-7055")),
person(given = "Aadharsh",
family = "Kannan",
@@ -34,15 +34,17 @@ Imports:
doParallel,
dplyr,
earth,
+ foreach,
generics,
glmnet,
gtools,
hts,
kernlab,
lightgbm,
+ lubridate,
magrittr,
matrixcalc,
methods,
modeltime.ensemble,
modeltime.gluonts,
modeltime.resample,
@@ -74,8 +76,6 @@ Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
Depends:
    R (>= 3.6.0),
- lubridate,
- foreach,
    modeltime
VignetteBuilder: knitr
5 changes: 4 additions & 1 deletion NAMESPACE
@@ -1,6 +1,5 @@
# Generated by roxygen2: do not edit by hand

- export("%>%")
export(arima)
export(arima_boost)
export(croston)
@@ -26,6 +25,10 @@ export(tabnet)
export(tbats)
export(theta)
export(xgboost)
+ import(modeltime)
+ importFrom(foreach,"%do%")
+ importFrom(foreach,"%dopar%")
+ importFrom(lubridate,"%m+%")
importFrom(magrittr,"%>%")
importFrom(methods,formalArgs)
importFrom(stats,sd)
4 changes: 4 additions & 0 deletions NEWS.md
@@ -0,0 +1,4 @@

# finnts 0.1.0

* Initial CRAN Release
2 changes: 1 addition & 1 deletion R/azure_batch_parallel.R
@@ -22,7 +22,7 @@ get_fcast_parallel_azure <- function(combo_list,

cli::cli_h2("Submitting Tasks to Azure Batch")

- fcst <- foreach(i = combo_list, .combine = 'rbind',
+ fcst <- foreach::foreach(i = combo_list, .combine = 'rbind',
.packages = get_export_packages(),
.export = get_transfer_functions(),
.options.azure = list(maxTaskRetryCount = 0,
10 changes: 5 additions & 5 deletions R/forecast_models.R
@@ -332,7 +332,7 @@ construct_forecast_models <- function(full_data_tbl,
combined_models_recipe_2 <- modeltime::modeltime_table()

# parallel processing
- if(run_model_parallel==TRUE & sum(parallel_processing!="local_machine") == 1) {
+ if(run_model_parallel == TRUE & sum(parallel_processing == "local_machine") == 0) {
parallel_args <- init_parallel_within(parallel_processing, num_cores)
}

@@ -390,7 +390,7 @@ construct_forecast_models <- function(full_data_tbl,
try(combined_models_recipe_1 <- modeltime::add_modeltime_model(combined_models_recipe_1,
mdl_called,
location = "top") %>%
- update_model_description(1, model_name),
+ modeltime::update_model_description(1, model_name),
silent = TRUE)

}else{
@@ -422,7 +422,7 @@ construct_forecast_models <- function(full_data_tbl,
try(combined_models_recipe_1 <- modeltime::add_modeltime_model(combined_models_recipe_1,
mdl_called,
location = "top") %>%
- update_model_description(1, add_name),
+ modeltime::update_model_description(1, add_name),
silent = TRUE)

}
@@ -446,7 +446,7 @@ construct_forecast_models <- function(full_data_tbl,
try(combined_models_recipe_2 <- modeltime::add_modeltime_model(combined_models_recipe_2,
mdl_called,
location = "top") %>%
- update_model_description(1, add_name),
+ modeltime::update_model_description(1, add_name),
silent = TRUE)
}

@@ -651,7 +651,7 @@ construct_forecast_models <- function(full_data_tbl,
try(combined_ensemble_models <- modeltime::add_modeltime_model(combined_ensemble_models,
mdl_ensemble,
location = "top") %>%
- update_model_description(1, add_name),
+ modeltime::update_model_description(1, add_name),
silent = TRUE)
}

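For context, a minimal sketch of the modeltime-table pattern these hunks touch, with every call namespace-qualified — package code cannot assume modeltime is attached, which is the motivation for the fully qualified update_model_description() calls above. The data and ARIMA fit follow modeltime's getting-started example; none of this is finnts code.

library(magrittr)

# Illustrative data and model, borrowed from modeltime's getting-started docs
m750 <- timetk::m4_monthly %>% dplyr::filter(id == "M750")

fit_arima <- modeltime::arima_reg() %>%
  parsnip::set_engine("auto_arima") %>%
  parsnip::fit(value ~ date, data = m750)

# Start an empty table, add the model at the top, then relabel model id 1,
# mirroring the construct_forecast_models() flow patched above
models_tbl <- modeltime::modeltime_table()

models_tbl <- modeltime::add_modeltime_model(models_tbl, fit_arima,
                                             location = "top") %>%
  modeltime::update_model_description(1, "arima--custom-label")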
57 changes: 33 additions & 24 deletions R/forecast_time_series.R
@@ -19,10 +19,10 @@
#' that specified date. Default of NULL is to not remove any time series and attempt to forecast all of them.
#' @param fiscal_year_start Month number of start of fiscal year of input data, aids in building out date features.
#' Formatted as a numeric value. Default of 1 assumes fiscal year starts in January.
- #' @param clean_missing_values Should missing values be inputted? Only inputes values for missing data within an
+ #' @param clean_missing_values If TRUE, cleans missing values. Only imputes values for missing data within an
#' existing series, and does not add new values onto the beginning or end, but does provide a value of 0 for said
#' values.
- #' @param clean_outliers Should outliers be cleaned and inputted with values more in line with historical data?
+ #' @param clean_outliers If TRUE, outliers are cleaned and imputed with values more in line with historical data.
#' @param back_test_scenarios Number of specific back test folds to run when determining the best model.
#' Default of NULL will automatically choose the number of back tests to run based on historical data size,
#' which tries to always use a minimum of 80% of the data when training a model.
@@ -36,16 +36,16 @@
#' @param parallel_processing Default of NULL runs no parallel processing and forecasts each individual time series
#' one after another. 'local_machine' leverages all cores on current machine Finn is running on. 'azure_batch'
#' runs time series in parallel on a remote compute cluster in Azure Batch.
- #' @param run_model_parallel Run model training in parallel, only works when parallel_processing is set to
+ #' @param run_model_parallel If TRUE, runs model training in parallel; only works when parallel_processing is set to
#' 'local_machine' or 'azure_batch'.
#' @param num_cores Number of cores to run when parallel processing is set up. Used when running parallel computations
#' on local machine or within Azure. Default of NULL uses total amount of cores on machine minus one. Can't be greater
#' than number of cores on machine minus 1.
#' @param azure_batch_credentials Credentials to run parallel_processing in Azure Batch.
#' @param azure_batch_cluster_config Compute cluster specification to run parallel_processing in Azure Batch.
- #' @param azure_batch_cluster_delete Delete the Azure Batch compute cluster after Finn finished running.
- #' @param target_log_transformation Log transform target variable before training models.
- #' @param negative_fcst Allow forecasts to dip below zero.
+ #' @param azure_batch_cluster_delete If TRUE, deletes the Azure Batch compute cluster after Finn finishes running.
+ #' @param target_log_transformation If TRUE, log transform the target variable before training models.
+ #' @param negative_fcst If TRUE, allow forecasts to dip below zero.
#' @param fourier_periods List of values to use in creating fourier series as features. Default of NULL automatically chooses
#' these values based on the date_type.
#' @param lag_periods List of values to use in creating lag features. Default of NULL automatically chooses these values
@@ -55,24 +55,25 @@
#' @param recipes_to_run List of recipes to run on multivariate models that can run different recipes. A value of NULL runs
#' all recipes, but only runs the R1 recipe for weekly and daily date types. A value of "all" runs all recipes, regardless
#' of date type. A list like c("R1") or c("R2") would only run models with the R1 or R2 recipe.
- #' @param pca Run principle component analysis on any lagged features to speed up model run time. Default of NULL runs
+ #' @param pca If TRUE, run principal component analysis on any lagged features to speed up model run time. Default of NULL runs
#' PCA on day and week date types across all local multivariate models, and also for global models across all date types.
#' @param reticulate_environment File path to python environment to use when training gluonts deep learning models.
#' Only important when parallel_processing is not set to 'azure_batch'. Azure Batch should use its own docker image
#' that has python environment already installed.
#' @param models_to_run List of models to run. Default of NULL runs all models.
#' @param models_not_to_run List of models not to run, overrides values in models_to_run. Default of NULL doesn't turn off
#' any model.
- #' @param run_deep_learning Run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and
+ #' @param run_deep_learning If TRUE, run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and
#' models_not_to_run.
- #' @param run_global_models Run multivariate models on the entire data set (across all time series) as a global model.
+ #' @param run_global_models If TRUE, run multivariate models on the entire data set (across all time series) as a global model.
#' Can be overridden by models_not_to_run. Default of NULL runs global models for all date types except week and day.
- #' @param run_local_models Run models by individual time series as local models.
- #' @param run_ensemble_models Run ensemble models
- #' @param average_models Create simple averages of individual models.
+ #' @param run_local_models If TRUE, run models by individual time series as local models.
+ #' @param run_ensemble_models If TRUE, run ensemble models. Default of NULL runs ensemble models only for quarter and month
+ #' date types.
+ #' @param average_models If TRUE, create simple averages of individual models.
#' @param max_model_average Max number of models to average together. Will create model averages for 2 models up until input value
#' or max number of models ran.
- #' @param weekly_to_daily Convert a week forecast down to day by evenly splitting across each day of week. Helps when aggregating
+ #' @param weekly_to_daily If TRUE, convert a weekly forecast down to daily by evenly splitting across each day of the week. Helps when aggregating
#' up to higher temporal levels like month or quarter.
#'
#' @return A list of three separate data sets: the future forecast, the back test results, and the best model per time series.
@@ -125,7 +126,7 @@ forecast_time_series <- function(input_data,
run_deep_learning = FALSE,
run_global_models = NULL,
run_local_models = TRUE,
- run_ensemble_models = TRUE,
+ run_ensemble_models = NULL,
average_models = TRUE,
max_model_average = 3,
weekly_to_daily = TRUE
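A hypothetical call showing the updated defaults in action — the input tibble and column names are illustrative, and any parameter not visible in this diff is an assumption from the package's documented API, not something this commit confirms:

library(finnts)

finn_output <- forecast_time_series(
  input_data          = sales_tbl,               # hypothetical historical-data tibble
  combo_variables     = c("Segment", "Country"), # assumed parameter name
  target_variable     = "Target",                # assumed parameter name
  date_type           = "month",
  forecast_horizon    = 12,                      # assumed parameter name
  run_ensemble_models = NULL,  # new default: auto-enabled for month/quarter only
  weekly_to_daily     = TRUE
)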
@@ -195,12 +196,20 @@
back_test_spacing <- get_back_test_spacing(back_test_spacing,
date_type)

- # * Yearly Forecast Adjustment ----
- if(date_type =="year") {
+ # * Ensemble Models Adjustment ----
+ if(is.null(run_ensemble_models) & date_type %in% c("quarter", "month")) {
+   run_ensemble_models <- TRUE
+ } else if(is.null(run_ensemble_models) & date_type %in% c("week", "day")) {
+   run_ensemble_models <- FALSE
+ } else if(sum(run_ensemble_models == TRUE) == 1 & date_type %in% c("quarter", "month", "week", "day")) {
+   run_ensemble_models <- TRUE
+ } else if(sum(run_ensemble_models == TRUE) == 1 & date_type =="year") {
    run_ensemble_models = FALSE
    warning("ensemble models have been turned off for yearly forecasts")
+ } else {
+   run_ensemble_models = FALSE
  }

# 4. Prep Data ----

cli::cli_h1("Prepping Data")
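Pulled out as a standalone function, the new default-resolution logic above behaves like this sketch (illustration only, not package code):

resolve_run_ensemble_models <- function(run_ensemble_models, date_type) {
  if (is.null(run_ensemble_models)) {
    # New default: ensembles on only for the longer date types
    return(date_type %in% c("quarter", "month"))
  }
  if (isTRUE(run_ensemble_models) && date_type == "year") {
    warning("ensemble models have been turned off for yearly forecasts")
    return(FALSE)
  }
  isTRUE(run_ensemble_models)
}

resolve_run_ensemble_models(NULL, "month") # TRUE
resolve_run_ensemble_models(NULL, "day")   # FALSE
resolve_run_ensemble_models(TRUE, "year")  # FALSE, with a warning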
@@ -354,17 +363,16 @@
model_combinations$All <- model_combinations %>% tidyr::unite(All, colnames(model_combinations))
model_combinations <- model_combinations$All


#parallel processing
if(run_model_parallel==TRUE & sum(parallel_processing == "local_machine") == 0) {

cores <- get_cores(num_cores)
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)

#point to the correct libraries within Azure Batch
if(sum(parallel_processing=="azure_batch") == 1) {
- clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages"))
+ parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages"))
}

combinations_tbl <- foreach::foreach(i = model_combinations[[1]], .combine = 'rbind',
@@ -422,7 +430,7 @@ forecast_time_series <- function(input_data,
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)

- combinations_tbl_final <- foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind',
+ combinations_tbl_final <- foreach::foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind',
.packages = get_export_packages(),
.export = c("fcst_prep", "get_cores")) %dopar% {create_model_averages(i)}

@@ -434,7 +442,7 @@
if(sum(parallel_processing=="azure_batch") == 1) {


- combinations_tbl_final <- foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind',
+ combinations_tbl_final <- foreach::foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind',
.packages = get_export_packages(),
.export = c("fcst_prep", "get_cores"),
.options.azure = list(maxTaskRetryCount = 0, autoDeleteJob = TRUE,
@@ -445,7 +453,7 @@
}

if(sum(parallel_processing == 'azure_batch') == 1 & azure_batch_cluster_delete == TRUE) {
- stopCluster(cluster)
+ parallel::stopCluster(cluster)
}

# combine with individual model data
@@ -653,7 +661,7 @@ forecast_time_series <- function(input_data,
dplyr::group_by(Combo, .id, Model) %>%
dplyr::mutate(Horizon = dplyr::row_number()) %>%
dplyr::ungroup() %>%
- dplyr::select(Combo, .id, Date, Model, Horizon, FCST, Target, MAPE) %>%
+ dplyr::mutate(Best_Model = ifelse(Model == "Best-Model", "Yes", "No")) %>%
+ dplyr::select(Combo, .id, Date, Model, Horizon, FCST, Target, MAPE, Best_Model) %>%
tidyr::separate(Combo, into = combo_variables, sep = '--', remove = FALSE) %>%
dplyr::rename(Back_Test_Scenario = .id)
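A hypothetical downstream use of the new Best_Model flag; the back_test_data element name is an assumption, not something this diff confirms:

library(magrittr)

finn_output$back_test_data %>%  # assumed element name
  dplyr::filter(Best_Model == "Yes") %>%
  dplyr::group_by(Combo) %>%
  dplyr::summarise(Avg_MAPE = mean(MAPE, na.rm = TRUE), .groups = "drop")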

4 changes: 2 additions & 2 deletions R/general_parallel.R
@@ -28,7 +28,7 @@ init_parallel_within <-function(type, num_cores){
doParallel::registerDoParallel(cl)

#point to the correct libraries within Azure Batch
- if(type == "azure_batch") {
+ if(sum(type == "azure_batch") == 1) {
parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages"))
}

@@ -72,7 +72,7 @@ get_fcast_parallel<- function(combo_list,

cli::cli_alert_info("Running across {cores} cores")

- fcst <- foreach(i = combo_list,
+ fcst <- foreach::foreach(i = combo_list,
.combine = 'rbind',
.packages = get_export_packages(),
.export = get_transfer_functions()
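The pattern being namespace-qualified in these hunks, reduced to a self-contained sketch with illustrative values (not finnts code):

library(foreach) # attaches %dopar%; finnts gets it via @importFrom instead

cl <- parallel::makeCluster(2)
doParallel::registerDoParallel(cl)

# Each iteration returns a one-row data frame; 'rbind' stacks the results
fcst <- foreach::foreach(i = 1:4, .combine = "rbind") %dopar% {
  data.frame(combo = i, fcst = sqrt(i))
}

parallel::stopCluster(cl)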
21 changes: 19 additions & 2 deletions R/utility.R
@@ -1,13 +1,30 @@
+ # define global variables to prevent notes in R CMD Check
+ utils::globalVariables(c(".id", ".key", ".model_desc", ".pred", ".resample_id", "All", "Best_Model", "Combo",
+                          "Combo_Test_Date", "Combo_Total", "Count", "Date", "Date_Adj", "Date_Adj_half",
+                          "Date_Adj_index.num", "Date_Adj_quarter", "Date_Adj_year", "Date_Day", "FCST",
+                          "Horizon", "MAPE", "Model", "Number", "Number_Char", "Origin", "Residual",
+                          "Residual_Std_Dev", "Rolling_MAPE", "Slice", "Sum", "Target", "Type", "Variable",
+                          "cluster", "frequency", "gluon_ts_frequency", "hi.80", "hi.95", "i", "lo.80", "lo.95",
+                          "weighted_MAPE", "where", "as2"))

#' @importFrom magrittr %>%
- #' @export
- magrittr::`%>%`
+ NULL

#' @importFrom methods formalArgs
NULL

#' @importFrom stats sd
NULL

+ #' @importFrom foreach %do% %dopar%
+ NULL

+ #' @importFrom lubridate %m+%
+ NULL

+ #' @import modeltime
+ NULL

# * cbind.fill custom function ----
#create function to cbind dataframes that contain different amounts of rows
#https://github.com/cvarrichio/rowr/blob/master/R/rowr.R
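The rowr-style helper referenced above pads shorter data frames with NA rows so they can be column-bound; a minimal sketch of the idea (not the exact rowr implementation):

cbind_fill <- function(...) {
  dfs <- list(...)
  n <- max(vapply(dfs, nrow, integer(1)))
  padded <- lapply(dfs, function(df) {
    if (nrow(df) < n) {
      # Indexing with NA yields all-NA rows that keep the original columns
      rbind(df, df[rep(NA_integer_, n - nrow(df)), , drop = FALSE])
    } else {
      df
    }
  })
  out <- do.call(cbind, padded)
  rownames(out) <- NULL
  out
}

cbind_fill(data.frame(a = 1:3), data.frame(b = 1:2))
#>   a  b
#> 1 1  1
#> 2 2  2
#> 3 3 NA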
4 changes: 4 additions & 0 deletions _pkgdown.yml
@@ -8,3 +8,7 @@ authors:
href: https://aadharshkannan.com/
Mike Tokic:
href: https://www.linkedin.com/in/michaeltokic/

+ template:
+   params:
+     ganalytics: G-6X0DS5856B
19 changes: 19 additions & 0 deletions cran-comments.md
@@ -0,0 +1,19 @@
## R CMD check results
There were no ERRORs or WARNINGs.

There was 1 NOTE:

* checking dependencies in R code ... NOTE
Imports includes 38 non-default packages.
Importing from so many packages makes the package vulnerable to any of
them becoming unavailable. Move as many as possible to Suggests and
use conditionally.

This package does leverage many outside packages. Its main feature is consolidating many different
models into a single package and running them automatically, so a large set of required packages
is inherent to the design.
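For reference, the conditional-use pattern the NOTE alludes to looks like this sketch ('lightgbm' is just an illustrative package name); finnts keeps such packages in Imports because every model is meant to run out of the box:

if (requireNamespace("lightgbm", quietly = TRUE)) {
  # train the lightgbm-based model
} else {
  message("Package 'lightgbm' not installed; skipping that model.")
}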

Also, this is my first CRAN submission.

## Downstream dependencies
There are currently no downstream dependencies for this package.