From a48de3cb0f33725c884a33c3f6dd0010af4db30e Mon Sep 17 00:00:00 2001
From: "jeremie.desgagne.bouchard" <jeremie.desgagne.bouchard@gmail.com>
Date: Wed, 4 Oct 2023 22:49:25 -0400
Subject: [PATCH] document support/handling of missings + tests

---
 .github/workflows/CI.yml                      |   2 +-
 .github/workflows/CompatHelper.yml            |   2 +-
 .github/workflows/Docs.yml                    |   2 +-
 Project.toml                                  |   2 +-
 README.md                                     |   4 +-
 docs/src/index.md                             |  55 +++++++-
 .../tutorials/logistic-regression-titanic.md  |   6 +-
 src/fit-utils.jl                              |   5 +-
 src/gpu/init.jl                               |   9 +-
 src/init.jl                                   |   9 +-
 test/MLJ.jl                                   |   2 -
 test/missings.jl                              | 123 ++++++++++++++++++
 test/runtests.jl                              |   1 +
 13 files changed, 206 insertions(+), 16 deletions(-)
 create mode 100644 test/missings.jl

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 40680d19..a056b7c2 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -31,7 +31,7 @@ jobs:
             version: '1'
             arch: x64
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v1
         with:
           version: ${{ matrix.version }}
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index 8afcca93..4594d229 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -10,7 +10,7 @@ jobs:
     steps:
       - uses: julia-actions/setup-julia@latest
         with:
-          version: 1.3
+          version: 1.6
       - name: Pkg.add("CompatHelper")
         run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
       - name: CompatHelper.main
diff --git a/.github/workflows/Docs.yml b/.github/workflows/Docs.yml
index 6ad069bd..599d907e 100644
--- a/.github/workflows/Docs.yml
+++ b/.github/workflows/Docs.yml
@@ -10,7 +10,7 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@latest
         with:
           version: '1.6'
diff --git a/Project.toml b/Project.toml
index 5cc8bf4b..784acf1d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "EvoTrees"
 uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
 authors = ["jeremiedb <jeremie.db@evovest.com>"]
-version = "0.16.1"
+version = "0.16.2"
 
 [deps]
 BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
diff --git a/README.md b/README.md
index fcbe6a98..868eb775 100644
--- a/README.md
+++ b/README.md
@@ -103,9 +103,9 @@ preds = m(x_train)
 
 ### DataFrames input
 
-When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used. 
+When using a `DataFrame` as input, features with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used to specify the variables to be used as features.
 
-`Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables.
+`Categorical` features are treated accordingly by the algorithm: ordered variables are treated as numerical features, using the `≤` split rule, while unordered variables use `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.
 
 ```julia
 dtrain = DataFrame(x_train, :auto)
diff --git a/docs/src/index.md b/docs/src/index.md
index 5a3c8ca0..28eee7ad 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -52,7 +52,7 @@ m = fit_evotree(config; x_train, y_train)
 preds = m(x_train)
 ```
 
-### DataFrames and Tables input
+### Tables and DataFrames input
 
 When using a Tables compatible input such as DataFrames, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used. 
 
@@ -75,7 +75,6 @@ m = fit_evotree(config, dtrain; target_name="y", device="gpu");
 p = m(dtrain; device="gpu")
 ```
 
-
 ## Reproducibility
 
 EvoTrees models trained on cpu can be fully reproducible.
@@ -107,6 +106,58 @@ Note that in presence of multiple identical or very highly correlated features,
 
 At the moment, there's no reproducibility guarantee on GPU, although this may change in the future. 
 
+## Missing values
+
+### Features
+
+EvoTrees does not handle features having missing values. Proper preprocessing of the data is therefore needed (and a general good practice regardless of the ML model used).
+
+This includes situations where all values may be non-missing, but where the `eltype` is of the form `Union{Missing,Float64}`. Converting the type by broadcasting `identity` is recommended:
+
+```julia
+julia> x = Vector{Union{Missing, Float64}}([1, 2])
+2-element Vector{Union{Missing, Float64}}:
+ 1.0
+ 2.0
+
+julia> identity.(x)
+2-element Vector{Float64}:
+ 1.0
+ 2.0
+```
+
+For dealing with numerical or ordered categorical features containing missing values, a common approach is to first create a `Bool` indicator variable capturing the info on whether a value is missing:
+
+```julia
+transform!(df, :my_feat => ByRow(ismissing) => :my_feat_ismissing)
+```
+
+Then, the missing values can be imputed (replaced by some default values such as `mean` or `median`, or using a more sophisticated approach such as predictions from another model):
+
+```julia
+transform!(df, :my_feat => (x -> coalesce.(x, median(skipmissing(x)))) => :my_feat);
+```
+
+For unordered categorical variables, recoding `missing` into a non-missing level is sufficient:
+```julia
+julia> x = categorical(["a", "b", missing])
+3-element CategoricalArray{Union{Missing, String},1,UInt32}:
+ "a"
+ "b"
+ missing
+
+julia> x = recode(x, missing => "missing value")
+3-element CategoricalArray{String,1,UInt32}:
+ "a"
+ "b"
+ "missing value"
+```
+
+### Target
+
+The target variable must have an element type `<:Real`. The only exception is `EvoTreeClassifier`, for which `CategoricalValue` and `String` are also supported.
+
+
 ## Save/Load
 
 ```julia
diff --git a/docs/src/tutorials/logistic-regression-titanic.md b/docs/src/tutorials/logistic-regression-titanic.md
index 1910c3ec..c7ff636c 100644
--- a/docs/src/tutorials/logistic-regression-titanic.md
+++ b/docs/src/tutorials/logistic-regression-titanic.md
@@ -21,11 +21,11 @@ df = MLDatasets.Titanic().dataframe
 
 A first step in data processing is to prepare the input features in a model compatible format. 
 
-EvoTrees' Tables API supports input that are either `Real`, `Bool` or `Categorical`.
+EvoTrees' Tables API supports input that are either `Real` (incl. `Bool`) or `Categorical`. `Bool` variables are treated as unordered, 2-levels categorical variables.
 A recommended approach for `String` features such as `Sex` is to convert them into an unordered `Categorical`. 
 
-For dealing with features withh missing values such as `Age`, a common approach is to first create an `Bool` indicator variable capturing the info on whether a value is missing.
-Then, the missing values can be inputed (replaced by some default values such as `mean` or `median`, or more sophisticated approach such as predictions from another model).
+For dealing with features with missing values such as `Age`, a common approach is to first create a `Bool` indicator variable capturing the info on whether a value is missing.
+Then, the missing values can be imputed (replaced by some default values such as `mean` or `median`, or by a more sophisticated approach such as predictions from another model).
 
 ```julia
 # convert string feature to Categorical
diff --git a/src/fit-utils.jl b/src/fit-utils.jl
index b4bae526..f0fb0b93 100644
--- a/src/fit-utils.jl
+++ b/src/fit-utils.jl
@@ -2,9 +2,10 @@
     get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T}
     get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG())
 
-Get the braking points of the feature data.
+Get the histogram breaking points of the feature data.
 """
 function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.MersenneTwister()) where {T}
+    @assert T <: Real
     nobs = min(size(X, 1), 1000 * nbins)
     idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true)
     nfeats = size(X, 2)
@@ -80,6 +81,8 @@ function binarize(df; fnames, edges)
             x_bin[:, j] .= levelcode.(col)
         elseif eltype(col) <: Real
             x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), col)
+        else
+            @error "Invalid feature eltype: $(fnames[j]) is $(eltype(col))"
         end
     end
     return x_bin
diff --git a/src/gpu/init.jl b/src/gpu/init.jl
index e4661182..3c3d682d 100644
--- a/src/gpu/init.jl
+++ b/src/gpu/init.jl
@@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o
 
     target_levels = nothing
     if L == Logistic
+        @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1
         K = 1
         y = T.(y_train)
         μ = [logit(mean(y))]
         !isnothing(offset) && (offset .= logit.(offset))
     elseif L in [Poisson, Gamma, Tweedie]
+        @assert eltype(y_train) <: Real
         K = 1
         y = T.(y_train)
         μ = fill(log(mean(y)), 1)
@@ -21,26 +23,31 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o
         if eltype(y_train) <: CategoricalValue
             target_levels = CategoricalArrays.levels(y_train)
             y = UInt32.(CategoricalArrays.levelcode.(y_train))
-        else
+        elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char
             target_levels = sort(unique(y_train))
             yc = CategoricalVector(y_train, levels=target_levels)
             y = UInt32.(CategoricalArrays.levelcode.(yc))
+        else
+            @error "Invalid target eltype: $(eltype(y_train))"
         end
         K = length(target_levels)
         μ = T.(log.(proportions(y, UInt32(1):UInt32(K))))
         μ .-= maximum(μ)
         !isnothing(offset) && (offset .= log.(offset))
     elseif L == GaussianMLE
+        @assert eltype(y_train) <: Real
         K = 2
         y = T.(y_train)
         μ = [mean(y), log(std(y))]
         !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
     elseif L == LogisticMLE
+        @assert eltype(y_train) <: Real
         K = 2
         y = T.(y_train)
         μ = [mean(y), log(std(y) * sqrt(3) / π)]
         !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
     else
+        @assert eltype(y_train) <: Real
         K = 1
         y = T.(y_train)
         μ = [mean(y)]
diff --git a/src/init.jl b/src/init.jl
index a6c8d299..be421e6b 100644
--- a/src/init.jl
+++ b/src/init.jl
@@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{CPU}, data, fnames, y_train, w, o
 
     target_levels = nothing
     if L == Logistic
+        @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1
         K = 1
         y = T.(y_train)
         μ = [logit(mean(y))]
         !isnothing(offset) && (offset .= logit.(offset))
     elseif L in [Poisson, Gamma, Tweedie]
+        @assert eltype(y_train) <: Real
         K = 1
         y = T.(y_train)
         μ = fill(log(mean(y)), 1)
@@ -21,26 +23,31 @@ function init_core(params::EvoTypes{L}, ::Type{CPU}, data, fnames, y_train, w, o
         if eltype(y_train) <: CategoricalValue
             target_levels = CategoricalArrays.levels(y_train)
             y = UInt32.(CategoricalArrays.levelcode.(y_train))
-        else
+        elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char
             target_levels = sort(unique(y_train))
             yc = CategoricalVector(y_train, levels=target_levels)
             y = UInt32.(CategoricalArrays.levelcode.(yc))
+        else
+            @error "Invalid target eltype: $(eltype(y_train))"
         end
         K = length(target_levels)
         μ = T.(log.(proportions(y, UInt32(1):UInt32(K))))
         μ .-= maximum(μ)
         !isnothing(offset) && (offset .= log.(offset))
     elseif L == GaussianMLE
+        @assert eltype(y_train) <: Real
         K = 2
         y = T.(y_train)
         μ = [mean(y), log(std(y))]
         !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
     elseif L == LogisticMLE
+        @assert eltype(y_train) <: Real
         K = 2
         y = T.(y_train)
         μ = [mean(y), log(std(y) * sqrt(3) / π)]
         !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
     else
+        @assert eltype(y_train) <: Real
         K = 1
         y = T.(y_train)
         μ = [mean(y)]
diff --git a/test/MLJ.jl b/test/MLJ.jl
index 740b32c2..906503cf 100644
--- a/test/MLJ.jl
+++ b/test/MLJ.jl
@@ -121,12 +121,10 @@ fit!(mach, rows=train, verbosity=1)
 
 pred_train = predict(mach, selectrows(X, train))
 pred_train_mode = predict_mode(mach, selectrows(X, train))
-cross_entropy(pred_train, selectrows(y, train)) |> mean
 sum(pred_train_mode .== y[train]) / length(y[train])
 
 pred_test = predict(mach, selectrows(X, test))
 pred_test_mode = predict_mode(mach, selectrows(X, test))
-cross_entropy(pred_test, selectrows(y, test)) |> mean
 sum(pred_test_mode .== y[test]) / length(y[test])
 pred_test_mode = predict_mode(mach, selectrows(X, test))
 
diff --git a/test/missings.jl b/test/missings.jl
new file mode 100644
index 00000000..d4edddb8
--- /dev/null
+++ b/test/missings.jl
@@ -0,0 +1,123 @@
+using Statistics
+using StatsBase: sample
+using EvoTrees: sigmoid, logit
+using EvoTrees: check_args, check_parameter
+using CategoricalArrays
+using DataFrames
+using Random: seed!
+
+# prepare a dataset
+seed!(123)
+nobs = 1_000
+x_num = rand(nobs) .* 5
+lvls = ["a", "b", "c"]
+x_cat = categorical(rand(lvls, nobs), levels=lvls, ordered=false)
+x_bool = rand(Bool, nobs)
+
+x_num_m1 = Vector{Union{Missing,Float64}}(copy(x_num))
+x_num_m2 = Vector{Any}(copy(x_num))
+lvls_m1 = ["a", "b", "c", missing]
+x_cat_m1 = categorical(rand(lvls_m1, nobs), levels=lvls)
+x_bool_m1 = Vector{Union{Missing,Bool}}(copy(x_bool))
+
+# train-eval split
+is = collect(1:nobs)
+i_sample = sample(is, nobs, replace=false)
+train_size = 0.8
+i_train = i_sample[1:floor(Int, train_size * nobs)]
+i_eval = i_sample[floor(Int, train_size * nobs)+1:end]
+
+# target var
+y_tot = sin.(x_num) .* 0.5 .+ 0.5
+y_tot = logit(y_tot) + randn(nobs)
+y_tot = sigmoid(y_tot)
+target_name = "y"
+y_tot = sigmoid(y_tot)
+y_tot_m1 = allowmissing(y_tot)
+y_tot_m1[1] = missing
+
+config = EvoTreeRegressor(
+    loss=:linear,
+    nrounds=100,
+    nbins=16,
+    lambda=0.5,
+    gamma=0.1,
+    eta=0.05,
+    max_depth=3,
+    min_weight=1.0,
+    rowsample=0.5,
+    colsample=1.0,
+    rng=123,
+)
+
+@testset "DataFrames - missing features" begin
+
+    df_tot = DataFrame(x_num=x_num, x_bool=x_bool, x_cat=x_cat, y=y_tot)
+    dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :]
+
+    model = fit_evotree(
+        config,
+        dtrain;
+        target_name)
+
+    @test model.info[:fnames] == [:x_num, :x_bool, :x_cat]
+
+    # keep only fnames whose eltype is <: Real or Categorical
+    df_tot = DataFrame(x_num=x_num, x_num_m1=x_num_m1, x_num_m2=x_num_m2,
+        x_cat_m1=x_cat_m1, x_bool_m1=x_bool_m1, y=y_tot)
+    dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :]
+
+    model = fit_evotree(
+        config,
+        dtrain;
+        target_name,
+        deval)
+
+    @test model.info[:fnames] == [:x_num]
+
+    model = fit_evotree(
+        config,
+        dtrain;
+        target_name,
+        fnames=[:x_num])
+
+    @test model.info[:fnames] == [:x_num]
+
+    # specifying features with missings should error
+    @test_throws AssertionError fit_evotree(
+        config,
+        dtrain;
+        deval,
+        fnames=[:x_num, :x_num_m1, :x_num_m2, :x_cat_m1, :x_bool_m1],
+        target_name)
+
+end
+
+@testset "DataFrames - missing in target errors" begin
+
+    df_tot = DataFrame(x_num=x_num, x_bool=x_bool, x_cat=x_cat, y=y_tot_m1)
+    dtrain, deval = df_tot[i_train, :], df_tot[i_eval, :]
+
+    @test_throws AssertionError fit_evotree(
+        config,
+        dtrain;
+        target_name)
+
+end
+
+@testset "Matrix - missing features" begin
+
+    x_tot = allowmissing(hcat(x_num_m1))
+    @test_throws AssertionError fit_evotree(
+        config;
+        x_train=x_tot,
+        y_train=y_tot)
+
+    x_tot = Matrix{Any}(hcat(x_num_m2))
+    @test_throws AssertionError fit_evotree(
+        config;
+        x_train=x_tot,
+        y_train=y_tot)
+
+end
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 891f6d34..a4228a49 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -12,6 +12,7 @@ using Test
         include("oblivious.jl")
         include("tables.jl")
         include("monotonic.jl")
+        include("missings.jl")
     end
 
     @testset "MLJ" begin