From a48de3cb0f33725c884a33c3f6dd0010af4db30e Mon Sep 17 00:00:00 2001 From: "jeremie.desgagne.bouchard" Date: Wed, 4 Oct 2023 22:49:25 -0400 Subject: [PATCH] document support/handling of missings + tests --- .github/workflows/CI.yml | 2 +- .github/workflows/CompatHelper.yml | 2 +- .github/workflows/Docs.yml | 2 +- Project.toml | 2 +- README.md | 4 +- docs/src/index.md | 55 +++++++- .../tutorials/logistic-regression-titanic.md | 6 +- src/fit-utils.jl | 5 +- src/gpu/init.jl | 9 +- src/init.jl | 9 +- test/MLJ.jl | 2 - test/missings.jl | 123 ++++++++++++++++++ test/runtests.jl | 1 + 13 files changed, 206 insertions(+), 16 deletions(-) create mode 100644 test/missings.jl diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 40680d19..a056b7c2 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -31,7 +31,7 @@ jobs: version: '1' arch: x64 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v1 with: version: ${{ matrix.version }} diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index 8afcca93..4594d229 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -10,7 +10,7 @@ jobs: steps: - uses: julia-actions/setup-julia@latest with: - version: 1.3 + version: 1.6 - name: Pkg.add("CompatHelper") run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - name: CompatHelper.main diff --git a/.github/workflows/Docs.yml b/.github/workflows/Docs.yml index 6ad069bd..599d907e 100644 --- a/.github/workflows/Docs.yml +++ b/.github/workflows/Docs.yml @@ -10,7 +10,7 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@latest with: version: '1.6' diff --git a/Project.toml b/Project.toml index 5cc8bf4b..784acf1d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "EvoTrees" uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" authors = 
["jeremiedb "] -version = "0.16.1" +version = "0.16.2" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" diff --git a/README.md b/README.md index fcbe6a98..868eb775 100644 --- a/README.md +++ b/README.md @@ -103,9 +103,9 @@ preds = m(x_train) ### DataFrames input -When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used. +When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used to specify the variables to be used as features. -`Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables. +`Categorical` features are treated accordingly by the algorithm: ordered variables are treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels categorical variables. ```julia dtrain = DataFrame(x_train, :auto) diff --git a/docs/src/index.md b/docs/src/index.md index 5a3c8ca0..28eee7ad 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -52,7 +52,7 @@ m = fit_evotree(config; x_train, y_train) preds = m(x_train) ``` -### DataFrames and Tables input +### Tables and DataFrames input When using a Tables compatible input such as DataFrames, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used. 
@@ -75,7 +75,6 @@ m = fit_evotree(config, dtrain; target_name="y", device="gpu"); p = m(dtrain; device="gpu") ``` - ## Reproducibility EvoTrees models trained on cpu can be fully reproducible. @@ -107,6 +106,58 @@ Note that in presence of multiple identical or very highly correlated features, At the moment, there's no reproducibility guarantee on GPU, although this may change in the future. +## Missing values + +### Features + +EvoTrees does not handle features having missing values. Proper preprocessing of the data is therefore needed (and a general good practice regardless of the ML model used). + +This includes situations where all values may be non-missing, but where the `eltype` is of the form `Union{Missing,Float64}`. A conversion of the element type using `identity` is recommended: + +```julia +julia> x = Vector{Union{Missing, Float64}}([1, 2]) +2-element Vector{Union{Missing, Float64}}: + 1.0 + 2.0 + +julia> identity.(x) +2-element Vector{Float64}: + 1.0 + 2.0 +``` + +For dealing with numerical or ordered categorical features containing missing values, a common approach is to first create a `Bool` indicator variable capturing the info on whether a value is missing: + +```julia +transform!(df, :my_feat => ByRow(ismissing) => :my_feat_ismissing) +``` + +Then, the missing values can be imputed (replaced by some default values such as `mean` or `median`, or using a more sophisticated approach such as predictions from another model): + +```julia +transform!(df, :my_feat => (x -> coalesce.(x, median(skipmissing(x)))) => :my_feat); +``` + +For unordered categorical variables, recoding the missing values into a non-missing level is sufficient: +```julia +julia> x = categorical(["a", "b", missing]) +3-element CategoricalArray{Union{Missing, String},1,UInt32}: + "a" + "b" + missing + +julia> x = recode(x, missing => "missing value") +3-element CategoricalArray{String,1,UInt32}: + "a" + "b" + "missing value" +``` + +### Target + +Target variable must have its elements type 
`<:Real`. The only exception is `EvoTreeClassifier`, for which `CategoricalValue`, `String` and `Char` are also supported. + + ## Save/Load ```julia diff --git a/docs/src/tutorials/logistic-regression-titanic.md b/docs/src/tutorials/logistic-regression-titanic.md index 1910c3ec..c7ff636c 100644 --- a/docs/src/tutorials/logistic-regression-titanic.md +++ b/docs/src/tutorials/logistic-regression-titanic.md @@ -21,11 +21,11 @@ df = MLDatasets.Titanic().dataframe A first step in data processing is to prepare the input features in a model compatible format. -EvoTrees' Tables API supports input that are either `Real`, `Bool` or `Categorical`. +EvoTrees' Tables API supports inputs that are either `Real` (incl. `Bool`) or `Categorical`. `Bool` variables are treated as unordered, 2-level categorical variables. A recommended approach for `String` features such as `Sex` is to convert them into an unordered `Categorical`. -For dealing with features withh missing values such as `Age`, a common approach is to first create an `Bool` indicator variable capturing the info on whether a value is missing. -Then, the missing values can be inputed (replaced by some default values such as `mean` or `median`, or more sophisticated approach such as predictions from another model). +For dealing with features with missing values such as `Age`, a common approach is to first create a `Bool` indicator variable capturing the info on whether a value is missing. +Then, the missing values can be imputed (replaced by some default values such as `mean` or `median`, or a more sophisticated approach such as predictions from another model). ```julia # convert string feature to Categorical diff --git a/src/fit-utils.jl b/src/fit-utils.jl index b4bae526..f0fb0b93 100644 --- a/src/fit-utils.jl +++ b/src/fit-utils.jl @@ -2,9 +2,10 @@ get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T} get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG()) -Get the braking points of the feature data. 
+Get the histogram breaking points of the feature data. """ function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.MersenneTwister()) where {T} + @assert T <: Real nobs = min(size(X, 1), 1000 * nbins) idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true) nfeats = size(X, 2) @@ -80,6 +81,8 @@ function binarize(df; fnames, edges) x_bin[:, j] .= levelcode.(col) elseif eltype(col) <: Real x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), col) + else + @error "Invalid feature eltype: $(fnames[j]) is $(eltype(col))" end end return x_bin diff --git a/src/gpu/init.jl b/src/gpu/init.jl index e4661182..3c3d682d 100644 --- a/src/gpu/init.jl +++ b/src/gpu/init.jl @@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o target_levels = nothing if L == Logistic + @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1 K = 1 y = T.(y_train) μ = [logit(mean(y))] !isnothing(offset) && (offset .= logit.(offset)) elseif L in [Poisson, Gamma, Tweedie] + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = fill(log(mean(y)), 1) @@ -21,26 +23,31 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o if eltype(y_train) <: CategoricalValue target_levels = CategoricalArrays.levels(y_train) y = UInt32.(CategoricalArrays.levelcode.(y_train)) - else + elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char target_levels = sort(unique(y_train)) yc = CategoricalVector(y_train, levels=target_levels) y = UInt32.(CategoricalArrays.levelcode.(yc)) + else + @error "Invalid target eltype: $(eltype(y_train))" end K = length(target_levels) μ = T.(log.(proportions(y, UInt32(1):UInt32(K)))) μ .-= maximum(μ) !isnothing(offset) && (offset .= log.(offset)) elseif L == GaussianMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y))] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) 
elseif L == LogisticMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y) * sqrt(3) / π)] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) else + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = [mean(y)] diff --git a/src/init.jl b/src/init.jl index a6c8d299..be421e6b 100644 --- a/src/init.jl +++ b/src/init.jl @@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{CPU}, data, fnames, y_train, w, o target_levels = nothing if L == Logistic + @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1 K = 1 y = T.(y_train) μ = [logit(mean(y))] !isnothing(offset) && (offset .= logit.(offset)) elseif L in [Poisson, Gamma, Tweedie] + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = fill(log(mean(y)), 1) @@ -21,26 +23,31 @@ function init_core(params::EvoTypes{L}, ::Type{CPU}, data, fnames, y_train, w, o if eltype(y_train) <: CategoricalValue target_levels = CategoricalArrays.levels(y_train) y = UInt32.(CategoricalArrays.levelcode.(y_train)) - else + elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char target_levels = sort(unique(y_train)) yc = CategoricalVector(y_train, levels=target_levels) y = UInt32.(CategoricalArrays.levelcode.(yc)) + else + @error "Invalid target eltype: $(eltype(y_train))" end K = length(target_levels) μ = T.(log.(proportions(y, UInt32(1):UInt32(K)))) μ .-= maximum(μ) !isnothing(offset) && (offset .= log.(offset)) elseif L == GaussianMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y))] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) elseif L == LogisticMLE + @assert eltype(y_train) <: Real K = 2 y = T.(y_train) μ = [mean(y), log(std(y) * sqrt(3) / π)] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) else + @assert eltype(y_train) <: Real K = 1 y = T.(y_train) μ = [mean(y)] diff --git a/test/MLJ.jl b/test/MLJ.jl index 740b32c2..906503cf 100644 
--- a/test/MLJ.jl +++ b/test/MLJ.jl @@ -121,12 +121,10 @@ fit!(mach, rows=train, verbosity=1) pred_train = predict(mach, selectrows(X, train)) pred_train_mode = predict_mode(mach, selectrows(X, train)) -cross_entropy(pred_train, selectrows(y, train)) |> mean sum(pred_train_mode .== y[train]) / length(y[train]) pred_test = predict(mach, selectrows(X, test)) pred_test_mode = predict_mode(mach, selectrows(X, test)) -cross_entropy(pred_test, selectrows(y, test)) |> mean sum(pred_test_mode .== y[test]) / length(y[test]) pred_test_mode = predict_mode(mach, selectrows(X, test)) diff --git a/test/missings.jl b/test/missings.jl new file mode 100644 index 00000000..d4edddb8 --- /dev/null +++ b/test/missings.jl @@ -0,0 +1,123 @@ +using Statistics +using StatsBase: sample +using EvoTrees: sigmoid, logit +using EvoTrees: check_args, check_parameter +using CategoricalArrays +using DataFrames +using Random: seed! + +# prepare a dataset +seed!(123) +nobs = 1_000 +x_num = rand(nobs) .* 5 +lvls = ["a", "b", "c"] +x_cat = categorical(rand(lvls, nobs), levels=lvls, ordered=false) +x_bool = rand(Bool, nobs) + +x_num_m1 = Vector{Union{Missing,Float64}}(copy(x_num)) +x_num_m2 = Vector{Any}(copy(x_num)) +lvls_m1 = ["a", "b", "c", missing] +x_cat_m1 = categorical(rand(lvls_m1, nobs), levels=lvls) +x_bool_m1 = Vector{Union{Missing,Bool}}(copy(x_bool)) + +# train-eval split +is = collect(1:nobs) +i_sample = sample(is, nobs, replace=false) +train_size = 0.8 +i_train = i_sample[1:floor(Int, train_size * nobs)] +i_eval = i_sample[floor(Int, train_size * nobs)+1:end] + +# target var +y_tot = sin.(x_num) .* 0.5 .+ 0.5 +y_tot = logit(y_tot) + randn(nobs) +y_tot = sigmoid(y_tot) +target_name = "y" +y_tot = sigmoid(y_tot) +y_tot_m1 = allowmissing(y_tot) +y_tot_m1[1] = missing + +config = EvoTreeRegressor( + loss=:linear, + nrounds=100, + nbins=16, + lambda=0.5, + gamma=0.1, + eta=0.05, + max_depth=3, + min_weight=1.0, + rowsample=0.5, + colsample=1.0, + rng=123, +) + +@testset "DataFrames - missing 
features" begin + + df_tot = DataFrame(x_num=x_num, x_bool=x_bool, x_cat=x_cat, y=y_tot) + dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :] + + model = fit_evotree( + config, + dtrain; + target_name) + + @test model.info[:fnames] == [:x_num, :x_bool, :x_cat] + + # keep only fnames whose eltype is <: Real or Categorical + df_tot = DataFrame(x_num=x_num, x_num_m1=x_num_m1, x_num_m2=x_num_m2, + x_cat_m1=x_cat_m1, x_bool_m1=x_bool_m1, y=y_tot) + dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :] + + model = fit_evotree( + config, + dtrain; + target_name, + deval) + + @test model.info[:fnames] == [:x_num] + + model = fit_evotree( + config, + dtrain; + target_name, + fnames=[:x_num]) + + @test model.info[:fnames] == [:x_num] + + # specifying features with missings should error + @test_throws AssertionError fit_evotree( + config, + dtrain; + deval, + fnames=[:x_num, :x_num_m1, :x_num_m2, :x_cat_m1, :x_bool_m1], + target_name) + +end + +@testset "DataFrames - missing in target errors" begin + + df_tot = DataFrame(x_num=x_num, x_bool=x_bool, x_cat=x_cat, y=y_tot_m1) + dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :] + + @test_throws AssertionError fit_evotree( + config, + dtrain; + target_name) + +end + +@testset "Matrix - missing features" begin + + x_tot = allowmissing(hcat(x_num_m1)) + @test_throws AssertionError fit_evotree( + config; + x_train=x_tot, + y_train=y_tot) + + x_tot = Matrix{Any}(hcat(x_num_m2)) + @test_throws AssertionError fit_evotree( + config; + x_train=x_tot, + y_train=y_tot) + +end + diff --git a/test/runtests.jl b/test/runtests.jl index 891f6d34..a4228a49 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,6 +12,7 @@ using Test include("oblivious.jl") include("tables.jl") include("monotonic.jl") + include("missings.jl") end @testset "MLJ" begin