Missings #258

Merged: 4 commits (Oct 5, 2023)
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
@@ -31,7 +31,7 @@ jobs:
version: '1'
arch: x64
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.version }}
2 changes: 1 addition & 1 deletion .github/workflows/CompatHelper.yml
@@ -10,7 +10,7 @@ jobs:
steps:
- uses: julia-actions/setup-julia@latest
with:
version: 1.3
version: 1.6
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main
2 changes: 1 addition & 1 deletion .github/workflows/Docs.yml
@@ -10,7 +10,7 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@latest
with:
version: '1.6'
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.16.1"
version = "0.16.2"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
4 changes: 2 additions & 2 deletions README.md
@@ -103,9 +103,9 @@ preds = m(x_train)

### DataFrames input

When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used.
When using a DataFrame as input, features with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used to specify the variables to be used as features.

`Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `≤` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables.
`Categorical` features are handled natively by the algorithm: ordered variables are treated as numerical features, using the `≤` split rule, while unordered variables use the `==` rule. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.
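
As a minimal sketch (the `feat_num` and `feat_cat` columns and the `EvoTreeRegressor` settings here are illustrative, not from the repo's examples):

```julia
using DataFrames, CategoricalArrays, EvoTrees

df = DataFrame(
    feat_num = randn(100),
    # ordered categorical: split with `≤` over the declared level order
    feat_cat = categorical(rand(["low", "mid", "high"], 100); levels=["low", "mid", "high"], ordered=true),
    y = randn(100),
)
config = EvoTreeRegressor(nrounds=10)
m = fit_evotree(config, df; target_name="y", fnames=["feat_num", "feat_cat"])
```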

```julia
dtrain = DataFrame(x_train, :auto)
85 changes: 54 additions & 31 deletions benchmarks/Yahoo-LTRC.jl
@@ -59,10 +59,26 @@ x_train = dtrain[:x][:, .!drop_cols]
x_eval = deval[:x][:, .!drop_cols]
x_test = dtest[:x][:, .!drop_cols]

# x_train_miss = x_train .== 0
# x_eval_miss = x_eval .== 0
# x_test_miss = x_test .== 0

# x_train[x_train.==0] .= 0.5
# x_eval[x_eval.==0] .= 0.5
# x_test[x_test.==0] .= 0.5

# x_train = hcat(x_train, x_train_miss)
# x_eval = hcat(x_eval, x_eval_miss)
# x_test = hcat(x_test, x_test_miss)

q_train = dtrain[:q]
q_eval = deval[:q]
q_test = dtest[:q]

y_train = dtrain[:y];
y_eval = deval[:y];
y_test = dtest[:y];

#####################################
# mse regression
#####################################
@@ -98,12 +114,12 @@ p_test = m_mse(x_test);
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5)
@info "ndcg_test MSE" ndcg_test
@info "MSE - test data - MSE model" mean((p_test .- y_test) .^ 2)
@info "NDCG - test data - MSE model" ndcg_test

#####################################
# logistic regression
#####################################

max_rank = 4
y_train = dtrain[:y] ./ max_rank
y_eval = deval[:y] ./ max_rank
@@ -145,57 +161,67 @@ p_test = m_logloss(x_test);
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5)
@info "ndcg_test LogLoss" ndcg_test

@info "NDCG - test data - LogLoss model" ndcg_test

#####################################
# logistic regression on DataFrame
#####################################
target_name = "y"

df_train = DataFrame(x_train, :auto)
df_train.y = dtrain[:y]
df_train.y = dtrain[:y] ./ 4
df_train.q = dtrain[:q]

df_eval = DataFrame(x_eval, :auto)
df_eval.y = deval[:y]
df_eval.y = deval[:y] ./ 4
df_eval.q = deval[:q]

df_test = DataFrame(x_test, :auto)
df_test.y = dtest[:y]
df_test.y = dtest[:y] ./ 4
df_test.q = dtest[:q]

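# normalize relevance grades to [0, 1] within a query group; constant groups map to 0.5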
function rank_target_norm(y::AbstractVector)
out = similar(y)
if minimum(y) == maximum(y)
# out .= 0.75
out .= 0.75
out .= 0.5
else
# out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y))
out .= 0.5 .* (y .- minimum(y)) ./ (maximum(y) - minimum(y)) .+ 0.5

out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y))
end
return out
end

df_train = transform!(
groupby(df_train, "q"),
"y" => rank_target_norm => "y")
function percent_rank(x::AbstractVector{T}) where {T}
return tiedrank(x) / (length(x) + 1)
end

feature_names_raw = setdiff(names(df_train), ["y", "q"])
feature_names_rel = feature_names_raw .* "_rel"

df_eval = transform!(
groupby(df_eval, "q"),
"y" => rank_target_norm => "y")
transform!(df_train, feature_names_raw .=> percent_rank .=> feature_names_rel)
transform!(df_eval, feature_names_raw .=> percent_rank .=> feature_names_rel)
transform!(df_test, feature_names_raw .=> percent_rank .=> feature_names_rel)

df_test = transform!(
groupby(df_test, "q"),
"y" => rank_target_norm => "y")
feature_names = setdiff(names(df_train), ["y", "q"])

# df_train = transform!(
# groupby(df_train, "q"),
# "y" => rank_target_norm => "y")

# df_eval = transform!(
# groupby(df_eval, "q"),
# "y" => rank_target_norm => "y")

# df_test = transform!(
# groupby(df_test, "q"),
# "y" => rank_target_norm => "y")

minimum(df_eval.y)
maximum(df_eval.y)

config = EvoTreeRegressor(
nrounds=6000,
loss=:logloss,
eta=0.005,
eta=0.01,
nbins=64,
max_depth=11,
rowsample=0.9,
@@ -205,28 +231,25 @@ config = EvoTreeRegressor(
@time m_logloss_df, logger_logloss_df = fit_evotree(
config,
df_train;
target_name="y",
fnames=setdiff(names(df_train), ["y", "q"]),
target_name,
fnames=feature_names_raw,
deval=df_eval,
early_stopping_rounds=200,
print_every_n=50,
metric=:logloss,
return_logger=true
);

# use the original y since NDCG is scale sensitive
y_train = dtrain[:y]
y_eval = deval[:y]
y_test = dtest[:y]

m_logloss_df.info
p_test_df = m_logloss_df(df_test);
p_test_mat = m_logloss_df(x_test);
# p_test_mat = m_logloss_df(x_test);

EvoTrees.importance(m_logloss_df)

p_test = m_logloss_df(df_test);
test_df = DataFrame(p=p_test, y=dtest[:y], q=dtest[:q])
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
@info "ndcg_test LogLoss DF" ndcg_test
# ndcg_test = 0.8022558972243291
# ndcg_test = 0.8020754563069513
@info "NDCG - test data - LogLoss DF model" ndcg_test
56 changes: 53 additions & 3 deletions docs/src/index.md
@@ -52,9 +52,9 @@ m = fit_evotree(config; x_train, y_train)
preds = m(x_train)
```

### DataFrames and Tables input
### Tables and DataFrames input

When using a Tables compatible input such as DataFrames, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used.
When using a `Tables`-compatible input such as a `DataFrame`, features with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used.

`Categorical` features are handled natively by the algorithm: ordered variables are treated as numerical features, using the `≤` split rule, while unordered variables use the `==` rule. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.

@@ -75,7 +75,6 @@ m = fit_evotree(config, dtrain; target_name="y", device="gpu");
p = m(dtrain; device="gpu")
```


## Reproducibility

EvoTrees models trained on cpu can be fully reproducible.
@@ -107,6 +106,57 @@ Note that in presence of multiple identical or very highly correlated features,

At the moment, there's no reproducibility guarantee on GPU, although this may change in the future.
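
As an illustrative sketch of a reproducible CPU run (random data; it assumes the config's `rng` seed drives all row/column sampling, and rebuilds the config so the internal RNG restarts from the same state):

```julia
using EvoTrees

x_train, y_train = randn(1000, 5), randn(1000)

config = EvoTreeRegressor(nrounds=10, rowsample=0.5, rng=123)
m1 = fit_evotree(config; x_train, y_train)

# rebuild the config so its internal RNG restarts from the same seed
config = EvoTreeRegressor(nrounds=10, rowsample=0.5, rng=123)
m2 = fit_evotree(config; x_train, y_train)

@assert m1(x_train) == m2(x_train)
```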

## Missing values

### Features

EvoTrees does not handle features with missing values, so proper preprocessing of the data is needed (and is good general practice regardless of the ML model used).

This includes situations where all values are non-missing but the `eltype` is of the form `Union{Missing,Float64}`. A conversion of the element type using `identity` is recommended:

```julia
julia> x = Vector{Union{Missing, Float64}}([1, 2])
2-element Vector{Union{Missing, Float64}}:
1.0
2.0

julia> identity.(x)
2-element Vector{Float64}:
1.0
2.0
```
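
Alternatively, a sketch assuming the `disallowmissing` helper (exported by Missings.jl and DataFrames.jl) is available:

```julia
julia> using Missings

julia> disallowmissing(x)
2-element Vector{Float64}:
 1.0
 2.0
```

Note that `disallowmissing` throws an error if any `missing` value is actually present.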

For dealing with numerical or ordered categorical features containing missing values, a common approach is to first create a `Bool` indicator variable capturing whether a value is missing:

```julia
transform!(df, :my_feat => ByRow(ismissing) => :my_feat_ismissing)
```

Then, the missing values can be imputed (replaced by a default value such as the `mean` or `median`, or using a more sophisticated approach such as predictions from another model):

```julia
transform!(df, :my_feat => (x -> coalesce.(x, median(skipmissing(x)))) => :my_feat);
```

For unordered categorical variables, recoding `missing` into a non-missing level is sufficient:
```julia
julia> x = categorical(["a", "b", missing])
3-element CategoricalArray{Union{Missing, String},1,UInt32}:
"a"
"b"
missing

julia> x = recode(x, missing => "missing value")
3-element CategoricalArray{String,1,UInt32}:
"a"
"b"
"missing value"
```

### Target

The target variable must have an element type `<:Real`. The only exception is `EvoTreeClassifier`, for which `CategoricalValue`, `Integer`, `String`, and `Char` element types are supported.
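
As an illustrative sketch (the data and settings are made up), a `String` target can be passed directly to `EvoTreeClassifier`:

```julia
using DataFrames, EvoTrees

df = DataFrame(x1=randn(200), x2=randn(200), y=rand(["no", "yes"], 200))
config = EvoTreeClassifier(nrounds=10)
m = fit_evotree(config, df; target_name="y")
p = m(df) # n × 2 matrix of class probabilities
```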

## Save/Load

```julia
6 changes: 3 additions & 3 deletions docs/src/tutorials/logistic-regression-titanic.md
@@ -21,11 +21,11 @@ df = MLDatasets.Titanic().dataframe

A first step in data processing is to prepare the input features in a model compatible format.

EvoTrees' Tables API supports input that are either `Real`, `Bool` or `Categorical`.
EvoTrees' Tables API supports inputs that are either `Real` (incl. `Bool`) or `Categorical`. `Bool` variables are treated as unordered, 2-level categorical variables.
A recommended approach for `String` features such as `Sex` is to convert them into an unordered `Categorical`.

For dealing with features withh missing values such as `Age`, a common approach is to first create an `Bool` indicator variable capturing the info on whether a value is missing.
Then, the missing values can be inputed (replaced by some default values such as `mean` or `median`, or more sophisticated approach such as predictions from another model).
For dealing with features with missing values such as `Age`, a common approach is to first create a `Bool` indicator variable capturing whether a value is missing.
Then, the missing values can be imputed (replaced by a default value such as the `mean` or `median`, or using a more sophisticated approach such as predictions from another model).

```julia
# convert string feature to Categorical
5 changes: 4 additions & 1 deletion src/fit-utils.jl
@@ -2,9 +2,10 @@
get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T}
get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG())

Get the braking points of the feature data.
Get the histogram breaking points of the feature data.
"""
function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.MersenneTwister()) where {T}
@assert T <: Real
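# subsample at most 1000 × nbins rows to keep edge estimation cheap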
nobs = min(size(X, 1), 1000 * nbins)
idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true)
nfeats = size(X, 2)
@@ -80,6 +81,8 @@ function binarize(df; fnames, edges)
x_bin[:, j] .= levelcode.(col)
elseif eltype(col) <: Real
x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), col)
else
@error "Invalid feature eltype: $(fnames[j]) is $(eltype(col))"
end
end
return x_bin
9 changes: 8 additions & 1 deletion src/gpu/init.jl
@@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o

target_levels = nothing
if L == Logistic
@assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1
K = 1
y = T.(y_train)
μ = [logit(mean(y))]
!isnothing(offset) && (offset .= logit.(offset))
elseif L in [Poisson, Gamma, Tweedie]
@assert eltype(y_train) <: Real
K = 1
y = T.(y_train)
μ = fill(log(mean(y)), 1)
@@ -21,26 +23,31 @@
if eltype(y_train) <: CategoricalValue
target_levels = CategoricalArrays.levels(y_train)
y = UInt32.(CategoricalArrays.levelcode.(y_train))
else
elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char
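# encode raw labels via a temporary CategoricalVector over the sorted unique values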
target_levels = sort(unique(y_train))
yc = CategoricalVector(y_train, levels=target_levels)
y = UInt32.(CategoricalArrays.levelcode.(yc))
else
@error "Invalid target eltype: $(eltype(y_train))"
end
K = length(target_levels)
μ = T.(log.(proportions(y, UInt32(1):UInt32(K))))
μ .-= maximum(μ)
!isnothing(offset) && (offset .= log.(offset))
elseif L == GaussianMLE
@assert eltype(y_train) <: Real
K = 2
y = T.(y_train)
μ = [mean(y), log(std(y))]
!isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
elseif L == LogisticMLE
@assert eltype(y_train) <: Real
K = 2
y = T.(y_train)
μ = [mean(y), log(std(y) * sqrt(3) / π)]
!isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
else
@assert eltype(y_train) <: Real
K = 1
y = T.(y_train)
μ = [mean(y)]