diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..5a66ef8 Binary files /dev/null and b/.DS_Store differ diff --git a/build_package.R b/build_package.R index 4da39cf..627f028 100644 --- a/build_package.R +++ b/build_package.R @@ -6,7 +6,7 @@ xfun::gsub_dir(pattern = "valiData", replacement = "validata") library(pacman) p_load(rstudioapi, devtools, roxygen2, usethis, pkgdown, ymlthis, magrittr, fs, covr, gitcreds, credentials, - badger, hexSticker, gh) + badger, hexSticker, gh, framecleaner, presenter, autostats, validata, tidybins) p_load(TidyConsultant) # add this file to .Rbuildignore ------------------------------------------ @@ -129,6 +129,9 @@ import_tibble("insurance.csv") -> insurance usethis::use_data(insurance) usethis::use_vignette("TidyConsultant") usethis::use_r("data.csv") + +usethis::use_version(which = "patch") +devtools::submit_cran() # build and check --------------------------------------------------------- devtools::document() devtools::build_readme() @@ -141,3 +144,5 @@ devtools::load_all() usethis::use_article(name = "Inference with xgboost") pacman::p_unload("all") + +install.packages("EIX", "DiagrammeR") diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 0000000..a7fb7e0 Binary files /dev/null and b/docs/.DS_Store differ diff --git a/docs/404.html b/docs/404.html index 6d7f057..7141802 100644 --- a/docs/404.html +++ b/docs/404.html @@ -6,7 +6,7 @@ Page not found (404) • TidyConsultant - + @@ -38,7 +38,7 @@ TidyConsultant - 0.1.0 + 0.1.1 @@ -112,7 +112,7 @@

Page not found (404)

-

Site built with pkgdown 2.0.6.

+

Site built with pkgdown 2.0.9.

diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 9ff0500..71cf768 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -1,5 +1,5 @@ -License • TidyConsultantLicense • TidyConsultantMIT License • TidyConsultant + @@ -39,7 +39,7 @@ TidyConsultant - 0.1.0 + 0.1.1 @@ -87,8 +87,9 @@ - -
+ + +
+#> variables zeros minus infs min mean max `|x|<1 (ratio)` integer_ratio +#> <chr> <int> <int> <int> <dbl> <dbl> <int> <dbl> <dbl> +#> 1 satisfact… 0 0 0 0.09 6.13e-1 1 0.993 0.00740 +#> 2 last_eval… 0 0 0 0.36 7.16e-1 1 0.981 0.0189 +#> 3 number_pr… 0 0 0 2 3.80e+0 7 0 1 +#> 4 average_m… 0 0 0 96 2.01e+2 310 0 1 +#> 5 time_spen… 0 0 0 2 3.50e+0 10 0 1 +#> 6 Work_acci… 12830 0 0 0 1.45e-1 1 0.855 1 +#> 7 left 11428 0 0 0 2.38e-1 1 0.762 1 +#> 8 promotion… 14680 0 0 0 2.13e-2 1 0.979 1 +#> # ℹ 2 more variables: mode <dbl>, mode_ratio <dbl>

xgboost binary classification model @@ -217,21 +220,21 @@

xgboost binary classification model
#> # A tibble: 15 × 3
 #>    .metric              .estimate .formula              
 #>    <chr>                    <dbl> <chr>                 
-#>  1 accuracy                 0.975 TP + TN / total       
-#>  2 kap                      0.930 NA                    
-#>  3 sens                     0.931 TP / actually P       
-#>  4 spec                     0.989 TN / actually N       
-#>  5 ppv                      0.963 TP / predicted P      
-#>  6 npv                      0.978 TN / predicted N      
-#>  7 mcc                      0.930 NA                    
-#>  8 j_index                  0.920 NA                    
-#>  9 bal_accuracy             0.960 sens + spec / 2       
-#> 10 detection_prevalence     0.231 predicted P / total   
-#> 11 precision                0.963 PPV, 1-FDR            
-#> 12 recall                   0.931 sens, TPR             
-#> 13 f_meas                   0.947 HM(ppv, sens)         
-#> 14 baseline_accuracy        0.761 majority class / total
-#> 15 roc_auc                  0.987 NA
+#> 1 accuracy 0.934 TP + TN / total +#> 2 kap 0.806 NA +#> 3 sens 0.780 TP / actually P +#> 4 spec 0.981 TN / actually N +#> 5 ppv 0.928 TP / predicted P +#> 6 npv 0.935 TN / predicted N +#> 7 mcc 0.811 NA +#> 8 j_index 0.762 NA +#> 9 bal_accuracy 0.881 sens + spec / 2 +#> 10 detection_prevalence 0.198 predicted P / total +#> 11 precision 0.928 PPV, 1-FDR +#> 12 recall 0.780 sens, TPR +#> 13 f_meas 0.848 HM(ppv, sens) +#> 14 baseline_accuracy 0.764 majority class / total +#> 15 roc_auc 0.975 NA

This line will save the tree structure of the model as a table.

@@ -239,30 +242,32 @@ 

xgboost binary classification model xgboost::xgb.model.dt.tree(model = .) -> xg_trees xg_trees -#> Tree Node ID Feature Split Yes No Missing Quality -#> 1: 0 0 0-0 satisfaction_level 0.465 0-1 0-2 0-1 3123.25146000 -#> 2: 0 1 0-1 number_project 2.500 0-3 0-4 0-3 892.94714400 -#> 3: 0 2 0-2 time_spend_company 4.500 0-5 0-6 0-5 1284.82617000 -#> 4: 0 3 0-3 Leaf NA <NA> <NA> <NA> 0.45360827 -#> 5: 0 4 0-4 Leaf NA <NA> <NA> <NA> -0.10822086 -#> --- -#> 614: 99 2 99-2 number_project 2.500 99-5 99-6 99-5 28.99409870 -#> 615: 99 3 99-3 Leaf NA <NA> <NA> <NA> 0.07221178 -#> 616: 99 4 99-4 Leaf NA <NA> <NA> <NA> -0.13327244 -#> 617: 99 5 99-5 Leaf NA <NA> <NA> <NA> -0.26394776 -#> 618: 99 6 99-6 Leaf NA <NA> <NA> <NA> 0.05643456 -#> Cover -#> 1: 3749.75000 -#> 2: 1045.75000 -#> 3: 2704.00000 -#> 4: 435.50000 -#> 5: 610.25000 -#> --- -#> 614: 256.00961 -#> 615: 110.06954 -#> 616: 101.18491 -#> 617: 27.59598 -#> 618: 228.41362

+#> Tree Node ID Feature Split Yes No Missing +#> <int> <int> <char> <char> <num> <char> <char> <char> +#> 1: 0 0 0-0 satisfaction_level 0.465 0-1 0-2 0-1 +#> 2: 0 1 0-1 number_project 2.500 0-3 0-4 0-3 +#> 3: 0 2 0-2 last_evaluation 0.825 0-5 0-6 0-5 +#> 4: 0 3 0-3 Leaf NA <NA> <NA> <NA> +#> 5: 0 4 0-4 Leaf NA <NA> <NA> <NA> +#> --- +#> 624: 99 2 99-2 satisfaction_level 0.715 99-5 99-6 99-5 +#> 625: 99 3 99-3 Leaf NA <NA> <NA> <NA> +#> 626: 99 4 99-4 Leaf NA <NA> <NA> <NA> +#> 627: 99 5 99-5 Leaf NA <NA> <NA> <NA> +#> 628: 99 6 99-6 Leaf NA <NA> <NA> <NA> +#> Quality Cover +#> <num> <num> +#> 1: 2292.12305000 2818.5000 +#> 2: 680.09655800 784.0000 +#> 3: 348.69824200 2034.5000 +#> 4: 0.07575618 323.5000 +#> 5: -0.01875271 460.5000 +#> --- +#> 624: 127.58088700 334.8139 +#> 625: 0.01482613 329.4183 +#> 626: -0.02640105 456.9404 +#> 627: -0.01077674 181.7939 +#> 628: 0.05104293 153.0200

Let’s plot the first tree and interpret the table output. For tree=0, the root feature (node=0) is satisfaction level, which is split at value .465. Is satisfaction_level < .465? If Yes, observations go left to @@ -271,13 +276,13 @@

xgboost binary classification model 3123, the improvement in training loss.

 xgboost::xgb.plot.tree(model = xg1, trees = 0)
-
-

The quality in the leaves is the prediction for observations in those +

+

The quality in the leaves is the prediction for observations in those leaves represented by log odds. To interpret them as probabilities, use the function below. Importantly, a log odds of 0 is a 0.5 probability.

-
-

sigmoid curve: logit function

+
+sigmoid curve: logit function
sigmoid curve: logit function
@@ -293,12 +298,12 @@

Analyze interactionst1 %>% group_by(Tree) %>% slice(which(Node == 0)) %>% ungroup %>% select(Tree, Root_Feature = Feature) %>% - bind_cols( + bind_cols( t1 %>% group_by(Tree) %>% slice(which(Node == 1)) %>% ungroup %>% select(Child1 = Feature) ) %>% - bind_cols( + bind_cols( t1 %>% group_by(Tree) %>% slice(which(Node == 2)) %>% ungroup %>% select(Child2 = Feature) @@ -318,23 +323,27 @@

Analyze interactions imps <- EIX::importance(xg1, hr1, option = "interactions") as_tibble(imps) %>% - set_int(where(is.numeric)) -#> # A tibble: 30 × 7 -#> Feature sumGain sumCo…¹ meanG…² meanC…³ frequ…⁴ mean5…⁵ -#> <chr> <int> <int> <int> <int> <int> <int> -#> 1 time_spend_company:satisfact… 1941 4734 485 1184 4 485 -#> 2 average_montly_hours:number_… 1666 5943 333 1189 5 333 -#> 3 last_evaluation:number_proje… 923 3498 461 1749 2 461 -#> 4 satisfaction_level:last_eval… 865 3240 288 1080 3 288 -#> 5 last_evaluation:average_mont… 784 1692 392 846 2 392 -#> 6 satisfaction_level:time_spen… 645 2703 107 450 6 128 -#> 7 average_montly_hours:time_sp… 332 2170 55 361 6 64 -#> 8 last_evaluation:time_spend_c… 262 494 262 494 1 262 -#> 9 average_montly_hours:last_ev… 259 1294 64 323 4 64 -#> 10 average_montly_hours:satisfa… 245 1457 61 364 4 61 -#> # … with 20 more rows, and abbreviated variable names ¹​sumCover, ²​meanGain, -#> # ³​meanCover, ⁴​frequency, ⁵​mean5Gain -#> # ℹ Use `print(n = ...)` to see more rows

+ set_int(where(is.numeric)) +#> # A tibble: 17 × 7 +#> Feature sumGain sumCover meanGain meanCover frequency mean5Gain +#> <chr> <int> <int> <int> <int> <int> <int> +#> 1 satisfaction_level:n… 5232 22360 402 1720 13 611 +#> 2 number_project:satis… 4141 8347 690 1391 6 802 +#> 3 time_spend_company:s… 3134 5716 783 1429 4 783 +#> 4 average_montly_hours… 2407 6255 343 893 7 411 +#> 5 last_evaluation:time… 2333 3709 388 618 6 448 +#> 6 satisfaction_level:t… 2213 6753 442 1351 5 442 +#> 7 number_project:time_… 2173 5587 724 1862 3 724 +#> 8 average_montly_hours… 1434 1780 478 593 3 478 +#> 9 last_evaluation:sati… 1162 5995 193 999 6 213 +#> 10 last_evaluation:numb… 1018 5364 203 1073 5 203 +#> 11 number_project:avera… 864 1579 288 526 3 288 +#> 12 average_montly_hours… 444 620 444 620 1 444 +#> 13 number_project:last_… 213 371 213 371 1 213 +#> 14 time_spend_company:l… 211 1352 211 1352 1 211 +#> 15 satisfaction_level:W… 99 1203 99 1203 1 99 +#> 16 Work_accident:time_s… 92 1077 92 1077 1 92 +#> 17 average_montly_hours… 88 1172 88 1172 1 88

We can extract all the trees that contain the specified interaction.

@@ -348,23 +357,37 @@ 

Analyze interactionsdistinct -> top_interaction_trees top_interaction_trees -#> # A tibble: 6 × 2 -#> Tree interactions -#> <int> <chr> -#> 1 2 time_spend_company:satisfaction_level -#> 2 7 time_spend_company:satisfaction_level -#> 3 15 time_spend_company:satisfaction_level -#> 4 28 time_spend_company:satisfaction_level -#> 5 30 time_spend_company:satisfaction_level -#> 6 99 time_spend_company:satisfaction_level

+#> # A tibble: 20 × 2 +#> Tree interactions +#> <int> <chr> +#> 1 0 satisfaction_level:number_project +#> 2 3 satisfaction_level:number_project +#> 3 4 satisfaction_level:number_project +#> 4 6 satisfaction_level:number_project +#> 5 8 satisfaction_level:number_project +#> 6 14 satisfaction_level:number_project +#> 7 17 satisfaction_level:number_project +#> 8 18 satisfaction_level:number_project +#> 9 19 satisfaction_level:number_project +#> 10 23 satisfaction_level:number_project +#> 11 24 satisfaction_level:number_project +#> 12 26 satisfaction_level:number_project +#> 13 27 satisfaction_level:number_project +#> 14 29 satisfaction_level:number_project +#> 15 30 satisfaction_level:number_project +#> 16 31 satisfaction_level:number_project +#> 17 34 satisfaction_level:number_project +#> 18 37 satisfaction_level:number_project +#> 19 46 satisfaction_level:number_project +#> 20 63 satisfaction_level:number_project

Then extract the first 3 (most important) trees and print them.

 top_interaction_trees$Tree %>% unique %>% head(3) -> trees_index
 
 
 xgboost::xgb.plot.tree(model = xg1, trees = trees_index)
-
-

We can confirm they are interactions because the child leaf in the +

+

We can confirm they are interactions because the child leaf in the interaction has higher split gain than the root leaf.

@@ -390,8 +413,8 @@

Analyze single features unlist -> top_trees xgboost::xgb.plot.tree(model = xg1, trees = top_trees)

-
-

By looking at the 3 most important splits for satisfaction_level we +

+

By looking at the 3 most important splits for satisfaction_level we can get a sense of how its splits affect the outcome.

@@ -404,51 +427,50 @@

shapley valueshr_shaps #> $shap_tbl #> # A tibble: 14,999 × 20 -#> satisfactio…¹ last_…² numbe…³ avera…⁴ time_…⁵ Work_…⁶ promo…⁷ sales…⁸ sales…⁹ -#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> -#> 1 1.91 0.420 1.32 0.984 -0.0731 0.113 0 0 0 -#> 2 0.287 0.535 0.183 0.613 1.65 0.102 0 0 0 -#> 3 5.51 0.248 3.76 0.128 0.113 0.0406 0 0 0 -#> 4 0.194 0.535 0.0280 0.349 2.17 0.107 0 0 0 -#> 5 1.91 0.420 1.32 0.984 -0.0731 0.113 0 0 0 -#> 6 1.91 0.420 1.32 0.984 -0.0731 0.113 0 0 0 -#> 7 6.62 -0.703 0.0543 0.0173 0.0558 0.0458 0 0 0 -#> 8 -0.154 0.538 0.146 0.623 2.29 0.107 0 0 0 -#> 9 0.199 2.21 0.0567 0.369 2.26 0.104 0 0 0 -#> 10 1.91 0.420 1.32 0.984 -0.0731 0.113 0 0 0 -#> # … with 14,989 more rows, 11 more variables: sales_IT <dbl>, -#> # sales_management <dbl>, sales_marketing <dbl>, sales_product_mng <dbl>, -#> # sales_RandD <dbl>, sales_sales <dbl>, sales_support <dbl>, -#> # sales_technical <dbl>, salary_high <dbl>, salary_low <dbl>, -#> # salary_medium <dbl>, and abbreviated variable names ¹​satisfaction_level, -#> # ²​last_evaluation, ³​number_project, ⁴​average_montly_hours, -#> # ⁵​time_spend_company, ⁶​Work_accident, ⁷​promotion_last_5years, … -#> # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names +#> satisfaction_level last_evaluation number_project average_montly_hours +#> <dbl> <dbl> <dbl> <dbl> +#> 1 0.870 0.0184 1.70 0.0718 +#> 2 -0.317 0.396 -0.214 0.222 +#> 3 3.66 0.157 -0.254 0.199 +#> 4 -0.316 0.396 -0.246 0.0769 +#> 5 0.870 0.0184 1.70 0.0718 +#> 6 0.870 0.0184 1.70 0.0718 +#> 7 3.73 -0.174 -0.304 0.173 +#> 8 -0.316 0.396 -0.214 0.223 +#> 9 -0.297 0.949 -0.237 0.0880 +#> 10 0.870 0.0184 1.70 0.0718 +#> # ℹ 14,989 more rows +#> # ℹ 16 more variables: time_spend_company <dbl>, Work_accident <dbl>, +#> # promotion_last_5years <dbl>, sales_accounting <dbl>, sales_hr <dbl>, +#> # sales_it <dbl>, sales_management <dbl>, sales_marketing <dbl>, +#> # sales_product_mng <dbl>, sales_rand_d <dbl>, sales_sales <dbl>, +#> # sales_support <dbl>, sales_technical <dbl>, salary_high <dbl>, +#> # salary_low <dbl>, salary_medium <dbl> #> #> $shap_summary #> # A tibble: 20 × 5 -#> name cor var sum sum_abs -#> <chr> <dbl> <dbl> <dbl> <dbl> -#> 1 satisfaction_level -0.668 4.76 -5354. 23838. -#> 2 time_spend_company 0.328 1.22 -7864. 14622. -#> 3 average_montly_hours 0.348 1.52 -183. 10695. -#> 4 number_project 0.0409 0.690 -1867. 9367. -#> 5 last_evaluation 0.391 0.733 -3603. 8745. -#> 6 Work_accident -0.989 0.109 -495. 3090. -#> 7 salary_low 0.995 0.0421 -195. 3064. -#> 8 salary_high -0.987 0.0429 -263. 1468. -#> 9 sales_product_mng -0.963 0.00691 47.3 613. -#> 10 sales_technical 0.894 0.00123 -15.6 372. -#> 11 sales_RandD -0.967 0.00268 -0.303 335. -#> 12 sales_IT -0.992 0.000708 -3.97 214. -#> 13 salary_medium 0.410 0.000412 -12.5 125. -#> 14 promotion_last_5years NA 0 0 0 -#> 15 sales_accounting NA 0 0 0 -#> 16 sales_hr NA 0 0 0 -#> 17 sales_management NA 0 0 0 -#> 18 sales_marketing NA 0 0 0 -#> 19 sales_sales NA 0 0 0 -#> 20 sales_support NA 0 0 0 +#> name cor var sum sum_abs +#> <chr> <dbl> <dbl> <dbl> <dbl> +#> 1 satisfaction_level -0.761 1.13 -854. 11395. +#> 2 number_project -0.580 0.505 -1233. 8515. +#> 3 time_spend_company 0.743 0.383 -2615. 7237. +#> 4 last_evaluation 0.674 0.0610 -288. 3082. +#> 5 average_montly_hours 0.669 0.0285 -69.5 2029. +#> 6 Work_accident -0.995 0.00758 -87.0 852. +#> 7 salary_low 0.981 0.00135 -27.3 541. +#> 8 promotion_last_5years NA 0 0 0 +#> 9 salary_high NA 0 0 0 +#> 10 salary_medium NA 0 0 0 +#> 11 sales_accounting NA 0 0 0 +#> 12 sales_hr NA 0 0 0 +#> 13 sales_it NA 0 0 0 +#> 14 sales_management NA 0 0 0 +#> 15 sales_marketing NA 0 0 0 +#> 16 sales_product_mng NA 0 0 0 +#> 17 sales_rand_d NA 0 0 0 +#> 18 sales_sales NA 0 0 0 +#> 19 sales_support NA 0 0 0 +#> 20 sales_technical NA 0 0 0 #> #> $swarmplot

@@ -485,7 +507,7 @@