Random Forests #44

szilard opened this issue Oct 1, 2020 · 11 comments

szilard commented Oct 1, 2020

c5.9xlarge (18 cores, HT off):

1M:

Lightgbm:

suppressMessages({
library(data.table)
library(ROCR)
library(lightgbm)
library(Matrix)
})

set.seed(123)

d_train <- fread("train-1m.csv", showProgress=FALSE)
d_test <- fread("test.csv", showProgress=FALSE)

d_all <- rbind(d_train, d_test)
d_all$dep_delayed_15min <- ifelse(d_all$dep_delayed_15min=="Y",1,0)

# recode character columns as integer codes; the returned rules hold the mapping
d_all_wrules <- lgb.convert_with_rules(d_all)
d_all <- d_all_wrules$data
cols_cats <- names(d_all_wrules$rules) 

d_train <- d_all[1:nrow(d_train)]
d_test <- d_all[(nrow(d_train)+1):(nrow(d_train)+nrow(d_test))]

p <- ncol(d_all)-1
dlgb_train <- lgb.Dataset(data = as.matrix(d_train[,1:p]), label = d_train$dep_delayed_15min)

auc <- function() {
  phat <- predict(md, data = as.matrix(d_test[,1:p]))
  rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
  cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
}


system.time({
  md <- lgb.train(data = dlgb_train, 
            objective = "binary", 
            nrounds = 100, num_leaves = 512, learning_rate = 0.1, 
            categorical_feature = cols_cats,
            verbose = 2)
})
auc()


system.time({
  md <- lgb.train(data = dlgb_train, 
            objective = "binary", 
            nrounds = 100, max_depth = 10, num_leaves = 2**17, 
            boosting_type = "rf", bagging_freq = 1, bagging_fraction = 0.632, feature_fraction = 1/sqrt(p),
            categorical_feature = cols_cats,
            verbose = 2)
})
auc()
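
As an aside, here is how I read the RF-mode settings above, mapped back to classic random-forest hyperparameters. This is my own annotation, a sketch reusing the objects defined above (p, dlgb_train, cols_cats) and assuming the list-style params interface of lgb.train; it is not part of the original run:

params_rf <- list(
  objective        = "binary",
  boosting_type    = "rf",        # bagged trees that get averaged; no boosting / shrinkage
  bagging_freq     = 1,           # re-draw the row sample before every tree
  bagging_fraction = 0.632,       # ~ expected fraction of unique rows in a bootstrap sample
  feature_fraction = 1/sqrt(p),   # classic mtry = sqrt(p) column subsampling
  num_leaves       = 2^17,        # effectively unbounded leaves...
  max_depth        = 10           # ...so the depth cap is the binding constraint
)
md <- lgb.train(data = dlgb_train, params = params_rf, nrounds = 100,
                categorical_feature = cols_cats, verbose = 2)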

Results:

GBM:

> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, num_leaves = 512, learning_rate = 0.1,
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000006 seconds, init for row-wise cost 0.004295 seconds
[LightGBM] [Debug] col-wise cost 0.006771 seconds, row-wise cost 0.000792 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 16
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 16
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 17
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 16
...
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 23
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 25
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 22
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 24
[LightGBM] [Debug] Trained a tree with leaves = 512 and max_depth = 21
   user  system elapsed
 57.506   0.191   3.258
> auc()
0.7650181
> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 10, num_leaves = 2**17, learning_rate = 0.1,
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000008 seconds, init for row-wise cost 0.004492 seconds
[LightGBM] [Debug] col-wise cost 0.007450 seconds, row-wise cost 0.000537 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 876 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 895 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 910 and max_depth = 10
...
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 650 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 745 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 672 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 759 and max_depth = 10
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 649 and max_depth = 10
   user  system elapsed
 53.896   0.227   3.058
> auc()
0.7614953

RF:

> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 10, num_leaves = 2**17,
+             boosting_type = "rf", bagging_freq = 1, bagging_fraction = 0.632, feature_fraction = 1/sqrt(p),
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000008 seconds, init for row-wise cost 0.004629 seconds
[LightGBM] [Debug] col-wise cost 0.001792 seconds, row-wise cost 0.000410 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Debug] Re-bagging, using 632548 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 405 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 631955 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 331 and max_depth = 10
...
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 677 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 632031 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 736 and max_depth = 10
[LightGBM] [Debug] Re-bagging, using 631444 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 377 and max_depth = 10
   user  system elapsed
 42.253   0.191   2.364
> auc()
0.7314994
> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 15, num_leaves = 2**17,
+             boosting_type = "rf", bagging_freq = 1, bagging_fraction = 0.632, feature_fraction = 1/sqrt(p),
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000005 seconds, init for row-wise cost 0.004689 seconds
[LightGBM] [Debug] col-wise cost 0.001777 seconds, row-wise cost 0.000313 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Debug] Re-bagging, using 632548 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 933 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 631955 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 635 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 632394 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 4024 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 631446 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 767 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 631800 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 3809 and max_depth = 15
...
[LightGBM] [Debug] Re-bagging, using 632325 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 3902 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 632031 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 4475 and max_depth = 15
[LightGBM] [Debug] Re-bagging, using 631444 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 754 and max_depth = 15
   user  system elapsed
217.521   2.950  12.288
> auc()
0.7392125
> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 20, num_leaves = 2**17,
+             boosting_type = "rf", bagging_freq = 1, bagging_fraction = 0.632, feature_fraction = 1/sqrt(p),
+             categorical_feature = cols_cats,
+             verbose = 2)
+ })
[LightGBM] [Info] Number of positive: 192982, number of negative: 807018
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000818
[LightGBM] [Debug] init for col-wise cost 0.000006 seconds, init for row-wise cost 0.004546 seconds
[LightGBM] [Debug] col-wise cost 0.001789 seconds, row-wise cost 0.000315 seconds
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1000000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192982 -> initscore=-1.430749
[LightGBM] [Info] Start training from score -1.430749
[LightGBM] [Debug] Re-bagging, using 632548 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 960 and max_depth = 17
[LightGBM] [Debug] Re-bagging, using 631955 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 675 and max_depth = 18
[LightGBM] [Debug] Re-bagging, using 632394 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 5528 and max_depth = 20
[LightGBM] [Debug] Re-bagging, using 631446 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 842 and max_depth = 19
...
[LightGBM] [Debug] Re-bagging, using 632325 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 7083 and max_depth = 20
[LightGBM] [Debug] Re-bagging, using 632031 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 7377 and max_depth = 20
[LightGBM] [Debug] Re-bagging, using 631444 data to train
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Debug] Trained a tree with leaves = 763 and max_depth = 17
   user  system elapsed
484.497   9.613  27.724
> auc()
0.7415699

GBM deep:

> system.time({
+   md <- lgb.train(data = dlgb_train,
+             objective = "binary",
+             nrounds = 100, max_depth = 20, num_leaves = 2**17, learning_rate = 0.1,
+             categorical_feature = cols_cats,
+             verbose = 0)
+ })
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
...
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
    user   system  elapsed
1846.680   16.217  103.627
> auc()
0.7704145

szilard commented Oct 1, 2020

xgboost:

suppressMessages({
library(data.table)
library(ROCR)
library(xgboost)
library(Matrix)
})

set.seed(123)

d_train <- fread("train-1m.csv", showProgress=FALSE)
d_test <- fread("test.csv", showProgress=FALSE)


X_train_test <- sparse.model.matrix(dep_delayed_15min ~ .-1, data = rbind(d_train, d_test))
n1 <- nrow(d_train)
n2 <- nrow(d_test)
X_train <- X_train_test[1:n1,]
X_test <- X_train_test[(n1+1):(n1+n2),]

dxgb_train <- xgb.DMatrix(data = X_train, label = ifelse(d_train$dep_delayed_15min=='Y',1,0))

auc <- function() {
  phat <- predict(md, newdata = X_test)
  rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
  cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
}

# average number of non-zero (one-hot) columns per row of the sparse matrix;
# used below as the base for the mtry-style colsample_bytree = 1/sqrt(.)
length(X_train@x)/nrow(X_train)



system.time({
  md <- xgb.train(data = dxgb_train, 
            objective = "binary:logistic", 
            nround = 100, max_depth = 10, eta = 0.1, 
            tree_method = "hist")
})
auc()


system.time({
  md <- xgb.train(data = dxgb_train, 
            objective = "binary:logistic", 
            nround = 1, num_parallel_tree = 100, max_depth = 10, 
            subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
            tree_method = "hist")
})
auc()
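# Notes on the RF emulation above (my reading of the xgboost docs, not part of the original):
# - nround = 1 with num_parallel_tree = 100 grows all 100 trees in a single round,
#   so the ensemble is bagged rather than sequentially boosted
# - subsample = 0.632 approximates the unique-row fraction of a bootstrap sample
# - colsample_bytree = 1/sqrt(avg non-zeros per row) mimics the mtry = sqrt(p) rule
#   on the sparse one-hot design matrix (~8 effective features per row here)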

GBM:

> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 100, max_depth = 10, eta = 0.1,
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:25:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1314 extra nodes, 0 pruned nodes, max_depth=10
[07:25:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1324 extra nodes, 0 pruned nodes, max_depth=10
[07:25:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
...
[07:25:53] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:53] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 550 extra nodes, 0 pruned nodes, max_depth=10
[07:25:53] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:25:53] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 762 extra nodes, 0 pruned nodes, max_depth=10
   user  system elapsed
 63.519   0.056   3.833
> auc()
0.7478858

RF:

> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 10,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:28:02] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:28:02] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:28:02] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 572 extra nodes, 0 pruned nodes, max_depth=10
[07:28:02] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1188 extra nodes, 0 pruned nodes, max_depth=10
[07:28:03] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 630 extra nodes, 0 pruned nodes, 
...
[07:28:08] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 388 extra nodes, 0 pruned nodes, max_depth=10
[07:28:08] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 482 extra nodes, 0 pruned nodes, max_depth=10
[07:28:08] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 674 extra nodes, 0 pruned nodes, max_depth=10
[07:28:08] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 766 extra nodes, 0 pruned nodes, max_depth=10
   user  system elapsed
 65.832   0.077   5.856
> auc()
0.730241
> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 15,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:29:39] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:29:39] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:29:39] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2856 extra nodes, 0 pruned nodes, max_depth=15
[07:29:40] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1232 extra nodes, 0 pruned nodes, max_depth=15
...
[07:29:47] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 3292 extra nodes, 0 pruned nodes, max_depth=15
[07:29:47] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1312 extra nodes, 0 pruned nodes, max_depth=15
[07:29:47] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2982 extra nodes, 0 pruned nodes, max_depth=15
[07:29:47] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 6452 extra nodes, 0 pruned nodes, max_depth=15
   user  system elapsed
104.609   0.241   8.579
> auc()
0.7410314
> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 20,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:30:24] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:30:24] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:30:24] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2326 extra nodes, 0 pruned nodes, max_depth=20
[07:30:24] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2184 extra nodes, 0 pruned nodes, max_depth=20
[07:30:25] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 10138 extra nodes, 0 pruned nodes, max_depth=20
...
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 13074 extra nodes, 0 pruned nodes, max_depth=20
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2148 extra nodes, 0 pruned nodes, max_depth=20
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2186 extra nodes, 0 pruned nodes, max_depth=20
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 2664 extra nodes, 0 pruned nodes, max_depth=20
[07:30:35] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 3496 extra nodes, 0 pruned nodes, max_depth=20
   user  system elapsed
156.004   0.675  12.655
> auc()
0.7482527


szilard commented Oct 2, 2020

xgboost with lambda = 0, to better match lightgbm (whose lambda_l2 defaults to 0) and to build deeper trees (as per @Laurae2):
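
For context on why this matters (my reading, not from the thread): xgboost's split gain is roughly

Gain = 1/2 * [ G_L^2/(H_L + lambda) + G_R^2/(H_R + lambda) - (G_L + G_R)^2/(H_L + H_R + lambda) ] - gamma

so with the default lambda = 1 the gain of splits on small leaves is shrunk (and can go negative, pruning the split), while lambda = 0 removes that penalty, keeps the gains non-negative and lets the trees reach full depth more easily.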

> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 10,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             lambda = 0,
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:33:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:33:50] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:33:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 478 extra nodes, 0 pruned nodes, max_depth=10
[07:33:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 778 extra nodes, 0 pruned nodes, max_depth=10
[07:33:50] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 828 extra nodes, 0 pruned nodes, max_depth=10
...
[07:33:56] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1086 extra nodes, 0 pruned nodes, max_depth=10
[07:33:56] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 566 extra nodes, 0 pruned nodes, max_depth=10
[07:33:56] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 474 extra nodes, 0 pruned nodes, max_depth=10
   user  system elapsed
 70.681   0.007   6.126
> auc()
0.7305753
> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 15,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             lambda = 0,
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:35:11] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:35:11] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:35:11] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 1824 extra nodes, 0 pruned nodes, max_depth=15
[07:35:12] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 7730 extra nodes, 0 pruned nodes, max_depth=15
...
[07:35:20] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 5070 extra nodes, 0 pruned nodes, max_depth=15
[07:35:20] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 6534 extra nodes, 0 pruned nodes, max_depth=15
[07:35:20] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 3456 extra nodes, 0 pruned nodes, max_depth=15
[07:35:20] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 5598 extra nodes, 0 pruned nodes, max_depth=15
   user  system elapsed
126.601   0.104   9.989
> auc()
0.7406097
> system.time({
+   md <- xgb.train(data = dxgb_train,
+             objective = "binary:logistic",
+             nround = 1, num_parallel_tree = 100, max_depth = 20,
+             subsample = 0.632, colsample_bytree = 1/sqrt(length(X_train@x)/nrow(X_train)),
+             lambda = 0,
+             verbosity = 2,
+             tree_method = "hist")
+ })
[07:36:17] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:36:17] INFO: ../..//amalgamation/../src/gbm/gbtree.cc:177: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[07:36:17] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 15550 extra nodes, 0 pruned nodes, max_depth=20
[07:36:18] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 4658 extra nodes, 0 pruned nodes, max_depth=20
[07:36:18] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 3644 extra nodes, 0 pruned nodes, max_depth=20
...
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 5448 extra nodes, 0 pruned nodes, max_depth=20
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 5550 extra nodes, 0 pruned nodes, max_depth=20
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 9408 extra nodes, 0 pruned nodes, max_depth=20
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 13178 extra nodes, 0 pruned nodes, max_depth=20
[07:36:31] INFO: ../..//amalgamation/../src/tree/updater_prune.cc:101: tree pruning end, 4156 extra nodes, 0 pruned nodes, max_depth=20
   user  system elapsed
215.592   0.725  16.913
> auc()
0.7503208


szilard commented Oct 2, 2020

Summary:

1M:

Tool              Depth  Time [s]  AUC
lightgbm            10     2.3     0.7315
lightgbm            15    12.3     0.7392
lightgbm            20    27       0.7416
xgboost             10     5.8     0.7302
xgboost             15     8.6     0.7410
xgboost             20    12       0.7482
xgboost lambda=0    10     6.1     0.7306
xgboost lambda=0    15    10       0.7406
xgboost lambda=0    20    17       0.7503


szilard commented Oct 2, 2020

h2o:

library(h2o)

h2o.init()

dx_train <- h2o.importFile("train-1m.csv")
dx_test <- h2o.importFile("test.csv")


Xnames <- names(dx_train)[which(names(dx_train)!="dep_delayed_15min")]


system.time({
  md <- h2o.randomForest(x = Xnames, y = "dep_delayed_15min", training_frame = dx_train, 
          ntrees = 100, max_depth = 10, 
          nbins = 100)
})
cat(h2o.auc(h2o.performance(md, dx_test)),"\n")
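# h2o notes (defaults as I understand them, not set explicitly above): mtries stays at -1,
# which for classification samples sqrt(#predictors) columns per split; nbins = 100 is the
# number of histogram bins used when splitting numeric columns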

Results:

> system.time({
+   md <- h2o.randomForest(x = Xnames, y = "dep_delayed_15min", training_frame = dx_train,
+           ntrees = 100, max_depth = 10,
+           nbins = 100)
+ })
  |======================================================================| 100%
   user  system elapsed
  0.168   0.004   9.215
> cat(h2o.auc(h2o.performance(md, dx_test)),"\n")
0.7372074


> system.time({
+   md <- h2o.randomForest(x = Xnames, y = "dep_delayed_15min", training_frame = dx_train,
+           ntrees = 100, max_depth = 15,
+           nbins = 100)
+ })
  |======================================================================| 100%
   user  system elapsed
  0.279   0.007  33.379
> cat(h2o.auc(h2o.performance(md, dx_test)),"\n")
0.7499753


> system.time({
+   md <- h2o.randomForest(x = Xnames, y = "dep_delayed_15min", training_frame = dx_train,
+           ntrees = 100, max_depth = 20,
+           nbins = 100)
+ })
  |======================================================================| 100%
   user  system elapsed
  0.648   0.048 110.038
> cat(h2o.auc(h2o.performance(md, dx_test)),"\n")
0.7543568


szilard commented Oct 2, 2020

Rborist:

library(data.table)
library(ROCR)
library(Matrix)
library(Rborist)

set.seed(123)

d_train <- fread("train-1m.csv")
d_test <- fread("test.csv")

X_train_test <- sparse.model.matrix(dep_delayed_15min ~ .-1, data = rbind(d_train, d_test))
X_train <- X_train_test[1:nrow(d_train),]
X_test <- X_train_test[(nrow(d_train)+1):(nrow(d_train)+nrow(d_test)),]

auc <- function() {
  phat <- predict(md, newdata = X_test, ctgCensus="prob")$prob[,"Y"]
  rocr_pred <- prediction(phat, d_test$dep_delayed_15min == "Y")
  performance(rocr_pred, "auc")@y.values[[1]]
}


system.time({
    md <- Rborist(X_train, as.factor(d_train$dep_delayed_15min), nLevel=10, nTree=100, predProb = 1/sqrt(length(X_train@x)/nrow(X_train)), thinLeaves=TRUE)
})
auc()
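# Rborist argument notes (my reading of the package docs): nLevel caps the number of tree
# levels (i.e. the depth), predProb is the per-predictor probability of being tried as a
# split candidate (an mtry-style fraction on the sparse one-hot matrix), and
# thinLeaves = TRUE skips storing per-leaf sample information to save memory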

Results:

> system.time({
+     md <- Rborist(X_train, as.factor(d_train$dep_delayed_15min), nLevel=10, nTree=100, predProb = 1/sqrt(length(X_train@x)/nrow(X_train)), thinLeaves=TRUE)
+ })
   user  system elapsed
240.358   9.202  25.243
> auc()
[1] 0.7198579
> system.time({
+     md <- Rborist(X_train, as.factor(d_train$dep_delayed_15min), nLevel=15, nTree=100, predProb = 1/sqrt(length(X_train@x)/nrow(X_train)), thinLeaves=TRUE)
+ })
   user  system elapsed
417.049   8.086  35.107
> auc()
[1] 0.7309561
> system.time({
+     md <- Rborist(X_train, as.factor(d_train$dep_delayed_15min), nLevel=20, nTree=100, predProb = 1/sqrt(length(X_train@x)/nrow(X_train)), thinLeaves=TRUE)
+ })
   user  system elapsed
716.884   7.195  62.628
> auc()
[1] 0.7433575


szilard commented Oct 2, 2020

ranger:

library(data.table)
library(ranger)
library(ROCR)

d_train <- fread("train-1m.csv")
d_test <- fread("test.csv")

d_train$dep_delayed_15min <- as.factor(d_train$dep_delayed_15min)
d_test$dep_delayed_15min  <- as.factor(d_test$dep_delayed_15min)

auc <- function() {
  phat <- predictions(predict(md, data = d_test))[,"Y"]
  rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
  performance(rocr_pred, "auc")@y.values[[1]]
}


system.time({
  md <- ranger(dep_delayed_15min ~ ., d_train, 
          num.trees = 100, max.depth = 10, probability = TRUE, write.forest = TRUE)
})
auc()
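# ranger notes (defaults as I understand them, not set explicitly above): mtry defaults to
# floor(sqrt(#features)), rows are bootstrapped with replacement, and probability = TRUE
# grows probability trees so predictions() returns per-class probabilities for the AUC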

Results:

> system.time({
+   md <- ranger(dep_delayed_15min ~ ., d_train,
+           num.trees = 100, max.depth = 10, probability = TRUE, write.forest = TRUE)
+ })
   user  system elapsed
143.398   0.024  10.850
> auc()
[1] 0.7116554
>
>
> system.time({
+   md <- ranger(dep_delayed_15min ~ ., d_train,
+           num.trees = 100, max.depth = 15, probability = TRUE, write.forest = TRUE)
+ })
   user  system elapsed
216.044   0.080  16.971
> auc()
[1] 0.7191445
>
> system.time({
+   md <- ranger(dep_delayed_15min ~ ., d_train,
+           num.trees = 100, max.depth = 20, probability = TRUE, write.forest = TRUE)
+ })
   user  system elapsed
295.522   0.516  24.133
> auc()
[1] 0.72058


szilard commented Oct 2, 2020

So far (1M rows, c5.9xlarge, 18 cores, HT off):

Time [sec]:

Tool               depth=10  depth=15  depth=20
xgboost               5.8       8.6       12
xgboost lambda=0      6.1       10        17
ranger                11        17        24
lightgbm              2.3       12        27
Rborist               25        35        62
h2o                   9.2       33       110


szilard commented Feb 3, 2021

sklearn RF:


import pandas as pd
import numpy as np
from sklearn import preprocessing 
from scipy import sparse
from sklearn import metrics, ensemble


d_train = pd.read_csv("https://s3.amazonaws.com/benchm-ml--main/train-1m.csv")
d_test = pd.read_csv("https://s3.amazonaws.com/benchm-ml--main/test.csv")


d_all = pd.concat([d_train,d_test])

vars_cat = ["Month","DayofMonth","DayOfWeek","UniqueCarrier", "Origin", "Dest"]
vars_num = ["DepTime","Distance"]
for col in vars_cat:
  d_all[col] = preprocessing.LabelEncoder().fit_transform(d_all[col])
  
X_all_cat = preprocessing.OneHotEncoder(categories="auto").fit_transform(d_all[vars_cat])   
X_all = sparse.hstack((X_all_cat, d_all[vars_num])).tocsr()                               
y_all = np.where(d_all["dep_delayed_15min"]=="Y",1,0)         

X_train = X_all[0:d_train.shape[0],]
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0]),]
y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])]


md = ensemble.RandomForestClassifier(max_depth = 10, n_estimators = 100, n_jobs = -1)
%time md.fit(X_train, y_train)

y_pred = md.predict_proba(X_test)[:,1]

print(metrics.roc_auc_score(y_test, y_pred))
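# sklearn notes (defaults as I understand them, not set explicitly above): bootstrap=True
# resamples rows with replacement and max_features="auto" == sqrt(n_features) for
# classifiers, so only max_depth and n_estimators vary between the runs below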

Results:

md = ensemble.RandomForestClassifier(max_depth = 10, n_estimators = 100, n_jobs = -1)
Wall time: 9.79 s
0.703149121562214

md = ensemble.RandomForestClassifier(max_depth = 15, n_estimators = 100, n_jobs = -1)
Wall time: 20.6 s
0.7085553315997604

md = ensemble.RandomForestClassifier(max_depth = 20, n_estimators = 100, n_jobs = -1)
Wall time: 41.8 s
0.7144237796242365


szilard commented Feb 3, 2021

So far (1M rows, c5.9xlarge, 18 cores, HT off):

Time [sec]:

Tool               depth=10  depth=15  depth=20
xgboost               5.8       8.6       12
xgboost lambda=0      6.1       10        17
ranger                11        17        24
lightgbm              2.3       12        27
sklearn               10        21        42
Rborist               25        35        62
h2o                   9.2       33       110


szilard commented Jun 6, 2021

Rforestry (via @Laurae2):

library(Rforestry)
library(data.table)
library(ROCR)

d_train <- fread("https://s3.amazonaws.com/benchm-ml--main/train-1m.csv", stringsAsFactors=TRUE)
d_test_char <- fread("https://s3.amazonaws.com/benchm-ml--main/test.csv")
p <- 8

d_all <- rbind(d_train, d_test_char)
d_test <- d_all[(nrow(d_train)+1):(nrow(d_train)+nrow(d_test_char))]


system.time({
md <- forestry(x = d_train[,1:p], y = d_train$dep_delayed_15min, ntree = 100, maxDepth = 10)
})


phat <- predict(md, d_test[,1:p]) 
rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")

Run (1M rows, depth=10, c5.9xlarge, 18 cores, HT off):

   user  system elapsed
654.304   8.341  38.285
>

> cat(performance(rocr_pred, "auc")@y.values[[1]],"\n")
0.719672
