From cc1466e793114a897ce0654fb9a92c8ec5ec92b3 Mon Sep 17 00:00:00 2001
From: "jeremie.desgagne.bouchard"
Date: Mon, 30 Oct 2023 23:30:37 -0400
Subject: [PATCH] higgs benchmark

---
 Project.toml                |  12 ++---
 benchmarks/Higgs-logloss.jl | 111 +++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+), 6 deletions(-)
 create mode 100644 benchmarks/Higgs-logloss.jl

diff --git a/Project.toml b/Project.toml
index e00b2c2..1148da9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,6 +16,12 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+EvoTreesCUDAExt = "CUDA"
+
 [compat]
 BSON = "0.3"
 CUDA = "3.0, 4.0, 5.0"
@@ -29,9 +35,6 @@ StatsBase = "0.32, 0.33, 0.34"
 Tables = "1.9"
 julia = "1.6"
 
-[extensions]
-EvoTreesCUDAExt = "CUDA"
-
 [extras]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
@@ -43,6 +46,3 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 [targets]
 docs = ["Documenter"]
 test = ["CUDA", "DataFrames", "Test", "MLJBase", "MLJTestInterface"]
-
-[weakdeps]
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
diff --git a/benchmarks/Higgs-logloss.jl b/benchmarks/Higgs-logloss.jl
new file mode 100644
index 0000000..8b5c61d
--- /dev/null
+++ b/benchmarks/Higgs-logloss.jl
@@ -0,0 +1,111 @@
+# Benchmark: logloss on the HIGGS dataset, EvoTrees vs XGBoost (both on GPU).
+#
+# The last 1M rows are held out: the first half is the eval set (early stopping /
+# watchlist), the second half is the test set used for the reported metrics.
+using Revise
+using Random
+using CSV
+using DataFrames
+using StatsBase
+using Statistics: mean, std
+using EvoTrees
+using Solage: Connectors
+using AWS: AWSCredentials, AWSConfig, @service
+
+@service S3
+aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
+aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1")
+bucket = "jeremiedb"
+
+path = "share/data/higgs/HIGGS.arrow"
+df_tot = Connectors.read_arrow_aws(path; bucket, aws_config)
+
+rename!(df_tot, "Column1" => "y")
+feature_names = setdiff(names(df_tot), ["y"])
+target_name = "y"
+
+"""
+    percent_rank(x::AbstractVector)
+
+Return the tied ranks of `x` divided by `length(x) + 1`.
+"""
+function percent_rank(x::AbstractVector{T}) where {T}
+    return tiedrank(x) / (length(x) + 1)
+end
+
+# Replace every feature column by its percent rank (computed over the full dataset).
+transform!(df_tot, feature_names .=> percent_rank .=> feature_names)
+
+# Train / eval / test must be disjoint: eval drives early stopping, so scoring on the
+# same rows would bias the reported test metrics.
+dtrain = df_tot[1:end-1_000_000, :];
+deval = df_tot[end-1_000_000+1:end-500_000, :];
+dtest = df_tot[end-500_000+1:end, :];
+
+config = EvoTreeRegressor(
+    loss=:logloss,
+    nrounds=5000,
+    eta=0.15,
+    nbins=128,
+    max_depth=9,
+    lambda=1.0,
+    gamma=0.0,
+    rowsample=0.8,
+    colsample=0.8,
+    min_weight=1,
+    rng=123,
+)
+
+device = "gpu"
+metric = "logloss"
+@time m_evo = fit_evotree(config, dtrain; target_name, fnames=feature_names, deval, metric, device, early_stopping_rounds=200, print_every_n=100);
+
+p_test = m_evo(dtest);
+@info extrema(p_test)
+# Binary cross-entropy: -y*log(p) - (1-y)*log(1-p), averaged over the test rows.
+logloss_test = mean(-dtest.y .* log.(p_test) .+ (dtest.y .- 1) .* log.(1 .- p_test))
+@info "LogLoss - dtest" logloss_test
+error_test = 1 - mean(round.(Int, p_test) .== dtest.y)
+@info "ERROR - dtest" error_test
+# NOTE: results below were recorded when deval and dtest were the same 500k rows;
+# re-run to refresh them for the disjoint split.
+# ┌ Info: LogLoss - dtest
+# └ logloss_test = 0.4716574579097044
+# ┌ Info: ERROR - dtest
+# └ error_test = 0.229522
+
+@info "XGBoost"
+@info "train"
+using XGBoost
+params_xgb = Dict(
+    :num_round => 4000,
+    :max_depth => 8,
+    :eta => 0.15,
+    :objective => "reg:logistic",
+    :print_every_n => 5,
+    :gamma => 0,
+    :lambda => 1,
+    :subsample => 0.8,
+    :colsample_bytree => 0.8,
+    :tree_method => "gpu_hist", # hist/gpu_hist
+    :max_bin => 128,
+)
+
+dtrain_xgb = DMatrix(select(dtrain, feature_names), dtrain.y)
+watchlist = Dict("eval" => DMatrix(select(deval, feature_names), deval.y));
+@time m_xgb = xgboost(dtrain_xgb; watchlist, nthread=Threads.nthreads(), verbosity=0, eval_metric="logloss", params_xgb...);
+
+# Predict on dtest so the predictions line up with the dtest.y labels scored below.
+pred_xgb = XGBoost.predict(m_xgb, DMatrix(select(dtest, feature_names)));
+@info extrema(pred_xgb)
+# (1.9394008f-6, 0.9999975f0)
+logloss_test = mean(-dtest.y .* log.(pred_xgb) .+ (dtest.y .- 1) .* log.(1 .- pred_xgb))
+@info "LogLoss - dtest" logloss_test
+error_test = 1 - mean(round.(Int, pred_xgb) .== dtest.y)
+@info "ERROR - xgb test" error_test
+# NOTE: results below were recorded when deval and dtest were the same 500k rows;
+# re-run to refresh them for the disjoint split.
+# ┌ Info: LogLoss - dtest
+# └ logloss_test = 0.4710665675338929
+# ┌ Info: ERROR - xgb test
+# └ error_test = 0.22987999999999997