diff --git a/Project.toml b/Project.toml
index 784acf1d..4c7c1365 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,6 +16,12 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+EvoTreesCUDAExt = "CUDA"
+
 [compat]
 BSON = "0.3"
 CUDA = "3.0, 4.0, 5.0"
@@ -29,6 +35,7 @@ Tables = "1.9"
 julia = "1.6"
 
 [extras]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
@@ -37,4 +44,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 docs = ["Documenter"]
-test = ["DataFrames", "Test", "MLJBase", "MLJTestInterface"]
+test = ["CUDA", "DataFrames", "Test", "MLJBase", "MLJTestInterface"]
diff --git a/docs/src/index.md b/docs/src/index.md
index 88fbff39..9e73aa74 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -68,6 +68,12 @@ m = fit_evotree(config, dtrain; target_name="y", fnames=["x1", "x3"]);
 
 ### GPU Acceleration
 
+EvoTrees supports training and inference on Nvidia GPUs with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl).
+Note that on Julia ≥ 1.9, CUDA support is only enabled when CUDA.jl is installed and loaded, either by another package or explicitly with e.g.
+```julia
+using CUDA
+```
+
 If running on a CUDA enabled machine, training and inference on GPU can be triggered through the `device` kwarg:
 
 ```julia
diff --git a/docs/src/internals.md b/docs/src/internals.md
index cd1b6fe4..9de3999a 100644
--- a/docs/src/internals.md
+++ b/docs/src/internals.md
@@ -19,7 +19,6 @@ EvoTrees.update_gains!
 EvoTrees.predict!
 EvoTrees.subsample
 EvoTrees.split_set_chunk!
-EvoTrees.split_chunk_kernel!
 ```
 
 ## Histogram
@@ -28,7 +27,4 @@ EvoTrees.split_chunk_kernel!
 EvoTrees.get_edges
 EvoTrees.binarize
 EvoTrees.update_hist!
-EvoTrees.hist_kernel!
-EvoTrees.hist_kernel_vec!
-EvoTrees.predict_kernel!
 ```
diff --git a/ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl b/ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl
new file mode 100644
index 00000000..8d0f7a8c
--- /dev/null
+++ b/ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl
@@ -0,0 +1,22 @@
+module EvoTreesCUDAExt
+
+using EvoTrees
+using CUDA
+
+# This should be different on CPUs and GPUs
+EvoTrees.device_ones(::Type{<:EvoTrees.GPU}, ::Type{T}, n::Int) where {T} = CUDA.ones(T, n)
+EvoTrees.device_array_type(::Type{<:EvoTrees.GPU}) = CuArray
+function EvoTrees.post_fit_gc(::Type{<:EvoTrees.GPU})
+    GC.gc(true)
+    CUDA.reclaim()
+end
+
+include("loss.jl")
+include("eval.jl")
+include("predict.jl")
+include("init.jl")
+include("subsample.jl")
+include("fit-utils.jl")
+include("fit.jl")
+
+end # module
diff --git a/src/gpu/eval.jl b/ext/EvoTreesCUDAExt/eval.jl
similarity index 80%
rename from src/gpu/eval.jl
rename to ext/EvoTreesCUDAExt/eval.jl
index be12b94d..6ea10153 100644
--- a/src/gpu/eval.jl
+++ b/ext/EvoTreesCUDAExt/eval.jl
@@ -8,7 +8,7 @@ function eval_mse_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
     end
     return nothing
 end
-function mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_mse_kernel!(eval, p, y, w)
@@ -19,8 +19,8 @@ end
 ########################
 # RMSE
 ########################
-rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
-    sqrt(rmse(p, y, w; MAX_THREADS, kwargs...))
+EvoTrees.rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
+    sqrt(EvoTrees.mse(p, y, w, eval; MAX_THREADS, kwargs...))
 
 ########################
 # MAE
@@ -32,7 +32,7 @@ function eval_mae_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
     end
     return nothing
 end
-function mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_mae_kernel!(eval, p, y, w)
@@ -51,7 +51,7 @@ function eval_logloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
     end
     return nothing
 end
-function logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_logloss_kernel!(eval, p, y, w)
@@ -70,7 +70,7 @@ function eval_gaussian_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:
     end
     return nothing
 end
-function gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_gaussian_kernel!(eval, p, y, w)
@@ -91,7 +91,7 @@ function eval_poisson_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
     return nothing
 end
 
-function poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
     threads = min(MAX_THREADS, length(y))
     blocks = cld(length(y), threads)
     @cuda blocks = blocks threads = threads eval_poisson_kernel!(eval, p, y, w)
@@ -111,7 +111,7 @@ function eval_gamma_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::Cu
     return nothing
 end
 
-function gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
+function EvoTrees.gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...)
where {T<:AbstractFloat} threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads eval_gamma_kernel!(eval, p, y, w) @@ -133,7 +133,7 @@ function eval_tweedie_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:: return nothing end -function tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} +function EvoTrees.tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads eval_tweedie_kernel!(eval, p, y, w) @@ -158,10 +158,10 @@ function eval_mlogloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y: return nothing end -function mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} +function EvoTrees.mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads eval_mlogloss_kernel!(eval, p, y, w) CUDA.synchronize() return sum(eval) / sum(w) -end \ No newline at end of file +end diff --git a/src/gpu/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl similarity index 95% rename from src/gpu/fit-utils.jl rename to ext/EvoTreesCUDAExt/fit-utils.jl index d4d285b6..31bc1f55 100644 --- a/src/gpu/fit-utils.jl +++ b/ext/EvoTreesCUDAExt/fit-utils.jl @@ -1,6 +1,3 @@ -""" - hist_kernel! -""" function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S} tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x bdx, bdy = blockDim().z, blockDim().y @@ -48,9 +45,6 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc) return nothing end -""" - hist_kernel_vec! -""" function hist_kernel_vec!(h∇, ∇, x_bin, is) tix, k = threadIdx().x, threadIdx().y bdx = blockDim().x @@ -103,10 +97,8 @@ function update_hist_gpu_vec!(h, h∇, ∇, x_bin, is, js::Vector) return nothing end -""" - Multi-threads split_set! - Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set. -""" +# Multi-threads split_set! +# Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set. 
function split_chunk_kernel!( left::CuDeviceVector{S}, right::CuDeviceVector{S}, @@ -149,7 +141,7 @@ function split_chunk_kernel!( return nothing end -function split_views_kernel!( +function EvoTrees.split_views_kernel!( out::CuDeviceVector{S}, left::CuDeviceVector{S}, right::CuDeviceVector{S}, @@ -208,7 +200,7 @@ function split_set_threads_gpu!(out, left, right, is, x_bin, feat, cond_bin, fea sum_lefts = sum(lefts) cumsum_lefts = cumsum(lefts) cumsum_rights = cumsum(rights) - @cuda blocks = nblocks threads = 1 split_views_kernel!( + @cuda blocks = nblocks threads = 1 EvoTrees.split_views_kernel!( out, left, right, diff --git a/src/gpu/fit.jl b/ext/EvoTreesCUDAExt/fit.jl similarity index 82% rename from src/gpu/fit.jl rename to ext/EvoTreesCUDAExt/fit.jl index b0935451..f0e32cf0 100644 --- a/src/gpu/fit.jl +++ b/ext/EvoTreesCUDAExt/fit.jl @@ -1,15 +1,15 @@ -function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type{GPU}) where {L,K} +function EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}) where {L,K} # compute gradients - update_grads!(cache.∇, cache.pred, cache.y, params) + EvoTrees.update_grads!(cache.∇, cache.pred, cache.y, params) # subsample rows cache.nodes[1].is = - subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng) + EvoTrees.subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng) # subsample cols - sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true) + EvoTrees.sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true) # assign a root and grow tree - tree = Tree{L,K}(params.max_depth) + tree = EvoTrees.Tree{L,K}(params.max_depth) grow! = params.tree_type == "oblivious" ? grow_otree! : grow_tree! grow!( tree, @@ -27,16 +27,16 @@ function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type cache.monotone_constraints, ) push!(evotree.trees, tree) - predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu) + EvoTrees.predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu) cache[:info][:nrounds] += 1 return nothing end # grow a single binary tree - grow through all depth function grow_tree!( - tree::Tree{L,K}, + tree::EvoTrees.Tree{L,K}, nodes::Vector{N}, - params::EvoTypes{L}, + params::EvoTrees.EvoTypes{L}, ∇::CuMatrix, edges, js, @@ -66,7 +66,7 @@ function grow_tree!( # initialize summary stats nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2))) - nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version? + nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version? 
# grow while there are remaining active nodes while length(n_current) > 0 && depth <= params.max_depth @@ -90,14 +90,14 @@ function grow_tree!( update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end - @threads for n ∈ sort(n_current) - update_gains!(nodes[n], js, params, feattypes, monotone_constraints) + Threads.@threads for n ∈ sort(n_current) + EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints) end end for n ∈ sort(n_current) if depth == params.max_depth || nodes[n].∑[end] <= params.min_weight - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) + EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) else best = findmax(findmax.(nodes[n].gains)) best_gain = best[1][1] @@ -126,8 +126,8 @@ function grow_tree!( nodes[n<<1].is, nodes[n<<1+1].is = _left, _right nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin] nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin] - nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑) - nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑) + nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑) + nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑) if length(_right) >= length(_left) push!(n_next, n << 1) @@ -137,7 +137,7 @@ function grow_tree!( push!(n_next, n << 1) end else - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) + EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) end end end @@ -151,9 +151,9 @@ end # grow a single oblivious tree - grow through all depth function grow_otree!( - tree::Tree{L,K}, + tree::EvoTrees.Tree{L,K}, nodes::Vector{N}, - params::EvoTypes{L}, + params::EvoTrees.EvoTypes{L}, ∇::CuMatrix, edges, js, @@ -183,7 +183,7 @@ function grow_otree!( # initialize summary stats nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2))) - nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version? + nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version? 
# grow while there are remaining active nodes while length(n_current) > 0 && depth <= params.max_depth @@ -197,7 +197,7 @@ function grow_otree!( if depth == params.max_depth || min_weight_flag for n in n_current # @info "length(nodes[n].is)" length(nodes[n].is) depth n - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) + EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) end else # update histograms @@ -217,8 +217,8 @@ function grow_otree!( update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end - @threads for n ∈ n_current - update_gains!(nodes[n], js, params, feattypes, monotone_constraints) + Threads.@threads for n ∈ n_current + EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints) end # initialize gains for node 1 in which all gains of a given depth will be accumulated @@ -273,8 +273,8 @@ function grow_otree!( nodes[n<<1].is, nodes[n<<1+1].is = _left, _right nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin] nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin] - nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑) - nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑) + nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑) + nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑) if length(_right) >= length(_left) push!(n_next, n << 1) @@ -286,7 +286,7 @@ function grow_otree!( end else for n in n_current - pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) + EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is) end end end @@ -295,4 +295,4 @@ function grow_otree!( end # end of loop over current nodes for a given depth return nothing -end \ No newline at end of file +end diff --git a/src/gpu/init.jl b/ext/EvoTreesCUDAExt/init.jl similarity index 69% rename from src/gpu/init.jl rename to ext/EvoTreesCUDAExt/init.jl index 3c3d682d..6a8dfcda 100644 --- a/src/gpu/init.jl +++ b/ext/EvoTreesCUDAExt/init.jl @@ -1,56 +1,56 @@ -function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, offset) where {L} +function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}, data, fnames, y_train, w, offset) where {L} # binarize data into quantiles - edges, featbins, feattypes = get_edges(data; fnames, nbins=params.nbins, rng=params.rng) - x_bin = CuArray(binarize(data; fnames, edges)) + edges, featbins, feattypes = EvoTrees.get_edges(data; fnames, nbins=params.nbins, rng=params.rng) + x_bin = CuArray(EvoTrees.binarize(data; fnames, edges)) nobs, nfeats = size(x_bin) T = Float32 target_levels = nothing - if L == Logistic + if L == EvoTrees.Logistic @assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1 K = 1 y = T.(y_train) - μ = [logit(mean(y))] - !isnothing(offset) && (offset .= logit.(offset)) - elseif L in [Poisson, Gamma, Tweedie] + μ = [EvoTrees.logit(EvoTrees.mean(y))] + !isnothing(offset) && (offset .= EvoTrees.logit.(offset)) + elseif L in [EvoTrees.Poisson, EvoTrees.Gamma, EvoTrees.Tweedie] @assert eltype(y_train) <: Real K = 1 y = T.(y_train) - μ = fill(log(mean(y)), 1) + μ = fill(log(EvoTrees.mean(y)), 1) !isnothing(offset) && (offset .= log.(offset)) - elseif L == MLogLoss - if eltype(y_train) <: CategoricalValue - target_levels = CategoricalArrays.levels(y_train) - y = UInt32.(CategoricalArrays.levelcode.(y_train)) + elseif L == EvoTrees.MLogLoss + if eltype(y_train) <: EvoTrees.CategoricalValue + target_levels = EvoTrees.CategoricalArrays.levels(y_train) + y = 
UInt32.(EvoTrees.CategoricalArrays.levelcode.(y_train)) elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char target_levels = sort(unique(y_train)) - yc = CategoricalVector(y_train, levels=target_levels) - y = UInt32.(CategoricalArrays.levelcode.(yc)) + yc = EvoTrees.CategoricalVector(y_train, levels=target_levels) + y = UInt32.(EvoTrees.CategoricalArrays.levelcode.(yc)) else @error "Invalid target eltype: $(eltype(y_train))" end K = length(target_levels) - μ = T.(log.(proportions(y, UInt32(1):UInt32(K)))) + μ = T.(log.(EvoTrees.proportions(y, UInt32(1):UInt32(K)))) μ .-= maximum(μ) !isnothing(offset) && (offset .= log.(offset)) - elseif L == GaussianMLE + elseif L == EvoTrees.GaussianMLE @assert eltype(y_train) <: Real K = 2 y = T.(y_train) - μ = [mean(y), log(std(y))] + μ = [EvoTrees.mean(y), log(EvoTrees.std(y))] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) - elseif L == LogisticMLE + elseif L == EvoTrees.LogisticMLE @assert eltype(y_train) <: Real K = 2 y = T.(y_train) - μ = [mean(y), log(std(y) * sqrt(3) / π)] + μ = [EvoTrees.mean(y), log(EvoTrees.std(y) * sqrt(3) / π)] !isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2])) else @assert eltype(y_train) <: Real K = 1 y = T.(y_train) - μ = [mean(y)] + μ = [EvoTrees.mean(y)] end y = CuArray(y) μ = T.(μ) @@ -94,8 +94,8 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o ) # initialize model - nodes = [TrainNode(featbins, K, view(is_in, 1:0)) for n = 1:2^params.max_depth-1] - bias = [Tree{L,K}(μ)] + nodes = [EvoTrees.TrainNode(featbins, K, view(is_in, 1:0)) for n = 1:2^params.max_depth-1] + bias = [EvoTrees.Tree{L,K}(μ)] m = EvoTree{L,K}(bias, info) # build cache @@ -125,4 +125,4 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o monotone_constraints=monotone_constraints, ) return m, cache -end \ No newline at end of file +end diff --git a/src/gpu/loss.jl b/ext/EvoTreesCUDAExt/loss.jl similarity index 89% rename from src/gpu/loss.jl rename to ext/EvoTreesCUDAExt/loss.jl index bad91534..0f6f4e4d 100644 --- a/src/gpu/loss.jl +++ b/ext/EvoTreesCUDAExt/loss.jl @@ -9,13 +9,13 @@ function kernel_mse_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDeviceVect end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeRegressor{L}; + ::EvoTreeRegressor{<:EvoTrees.MSE}; MAX_THREADS=1024 -) where {L<:MSE} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_mse_∇!(∇, p, y) @@ -29,19 +29,19 @@ end function kernel_logloss_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDeviceVector) i = threadIdx().x + (blockIdx().x - 1) * blockDim().x if i <= length(y) - @inbounds pred = sigmoid(p[1, i]) + @inbounds pred = EvoTrees.sigmoid(p[1, i]) @inbounds ∇[1, i] = (pred - y[i]) * ∇[3, i] @inbounds ∇[2, i] = pred * (1 - pred) * ∇[3, i] end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeRegressor{L}; + ::EvoTreeRegressor{<:EvoTrees.LogLoss}; MAX_THREADS=1024 -) where {L<:LogLoss} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_logloss_∇!(∇, p, y) @@ -61,13 +61,13 @@ function kernel_poisson_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDevice end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, 
- ::EvoTreeCount{L}; + ::EvoTreeCount{<:EvoTrees.Poisson}; MAX_THREADS=1024 -) where {L<:Poisson} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_poisson_∇!(∇, p, y) @@ -87,13 +87,13 @@ function kernel_gamma_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDeviceVe end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeRegressor{L}; + ::EvoTreeRegressor{<:EvoTrees.Gamma}; MAX_THREADS=1024 -) where {L<:Gamma} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_gamma_∇!(∇, p, y) @@ -115,13 +115,13 @@ function kernel_tweedie_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDevice end return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeRegressor{L}; + ::EvoTreeRegressor{<:EvoTrees.Tweedie}; MAX_THREADS=1024 -) where {L<:Tweedie} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_tweedie_∇!(∇, p, y) @@ -154,13 +154,13 @@ function kernel_mlogloss_∇!(∇::CuDeviceMatrix{T}, p::CuDeviceMatrix{T}, y::C return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::EvoTreeClassifier{L}; + ::EvoTreeClassifier{<:EvoTrees.MLogLoss}; MAX_THREADS=1024 -) where {L<:MLogLoss} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_mlogloss_∇!(∇, p, y) @@ -187,16 +187,16 @@ function kernel_gauss_∇!(∇::CuDeviceMatrix, p::CuDeviceMatrix, y::CuDeviceVe return end -function update_grads!( +function EvoTrees.update_grads!( ∇::CuMatrix, p::CuMatrix, y::CuVector, - ::Union{EvoTreeGaussian{L},EvoTreeMLE{L}}; + ::Union{EvoTreeGaussian{<:EvoTrees.GaussianMLE},EvoTreeMLE{<:EvoTrees.GaussianMLE}}; MAX_THREADS=1024 -) where {L<:GaussianMLE} +) threads = min(MAX_THREADS, length(y)) blocks = cld(length(y), threads) @cuda blocks = blocks threads = threads kernel_gauss_∇!(∇, p, y) CUDA.synchronize() return -end \ No newline at end of file +end diff --git a/src/gpu/predict.jl b/ext/EvoTreesCUDAExt/predict.jl similarity index 84% rename from src/gpu/predict.jl rename to ext/EvoTreesCUDAExt/predict.jl index 9cad7c3b..6ae81645 100644 --- a/src/gpu/predict.jl +++ b/ext/EvoTreesCUDAExt/predict.jl @@ -1,6 +1,3 @@ -""" - predict_kernel! -""" function predict_kernel!( ::Type{L}, pred::CuDeviceMatrix{T}, @@ -28,12 +25,9 @@ function predict_kernel!( return nothing end -""" - predict_kernel! - GradientRegression -""" +# GradientRegression function predict_kernel!( - ::Type{L}, + ::Type{<:EvoTrees.GradientRegression}, pred::CuDeviceMatrix{T}, split, feats, @@ -41,7 +35,7 @@ function predict_kernel!( leaf_pred, x_bin, feattypes, -) where {L<:GradientRegression,T} +) where {T} i = threadIdx().x + (blockIdx().x - 1) * blockDim().x nid = 1 @inbounds if i <= size(pred, 2) @@ -56,12 +50,9 @@ function predict_kernel!( return nothing end -""" - predict_kernel! - Logistic -""" +# Logistic function predict_kernel!( - ::Type{L}, + ::Type{<:EvoTrees.LogLoss}, pred::CuDeviceMatrix{T}, split, feats, @@ -69,7 +60,7 @@ function predict_kernel!( leaf_pred, x_bin, feattypes, -) where {L<:LogLoss,T} +) where {T} i = threadIdx().x + (blockIdx().x - 1) * blockDim().x nid = 1 @inbounds if i <= size(pred, 2) @@ -84,12 +75,9 @@ function predict_kernel!( return nothing end -""" - predict_kernel! 
- MLE2P -""" +# MLE2P function predict_kernel!( - ::Type{L}, + ::Type{<:EvoTrees.MLE2P}, pred::CuDeviceMatrix{T}, split, feats, @@ -97,7 +85,7 @@ function predict_kernel!( leaf_pred, x_bin, feattypes, -) where {L<:MLE2P,T} +) where {T} i = threadIdx().x + (blockIdx().x - 1) * blockDim().x nid = 1 @inbounds if i <= size(pred, 2) @@ -114,9 +102,9 @@ function predict_kernel!( end # prediction from single tree - assign each observation to its final leaf -function predict!( +function EvoTrees.predict!( pred::CuMatrix{T}, - tree::Tree{L,K}, + tree::EvoTrees.Tree{L,K}, x_bin::CuMatrix, feattypes::CuVector{Bool}; MAX_THREADS=1024 @@ -137,13 +125,13 @@ function predict!( CUDA.synchronize() end -function predict!( +function EvoTrees.predict!( pred::CuMatrix{T}, - tree::Tree{L,K}, + tree::EvoTrees.Tree{L,K}, x_bin::CuMatrix, feattypes::CuVector{Bool}; MAX_THREADS=1024 -) where {L<:MLogLoss,K,T} +) where {L<:EvoTrees.MLogLoss,K,T} n = size(pred, 2) threads = min(MAX_THREADS, n) blocks = cld(n, threads) @@ -165,25 +153,25 @@ end function predict( m::EvoTree{L,K}, data, - ::Type{GPU}; + ::Type{<:EvoTrees.GPU}; ntree_limit=length(m.trees)) where {L,K} pred = CUDA.zeros(K, size(data, 1)) ntrees = length(m.trees) ntree_limit > ntrees && error("ntree_limit is larger than number of trees $ntrees.") - x_bin = CuArray(binarize(data; fnames=m.info[:fnames], edges=m.info[:edges])) + x_bin = CuArray(EvoTrees.binarize(data; fnames=m.info[:fnames], edges=m.info[:edges])) feattypes = CuArray(m.info[:feattypes]) for i = 1:ntree_limit - predict!(pred, m.trees[i], x_bin, feattypes) + EvoTrees.predict!(pred, m.trees[i], x_bin, feattypes) end - if L == LogLoss - pred .= sigmoid.(pred) - elseif L ∈ [Poisson, Gamma, Tweedie] + if L == EvoTrees.LogLoss + pred .= EvoTrees.sigmoid.(pred) + elseif L ∈ [EvoTrees.Poisson, EvoTrees.Gamma, EvoTrees.Tweedie] pred .= exp.(pred) - elseif L in [GaussianMLE, LogisticMLE] + elseif L in [EvoTrees.GaussianMLE, EvoTrees.LogisticMLE] pred[2, :] .= exp.(pred[2, :]) - elseif L == MLogLoss - softmax!(pred) + elseif L == EvoTrees.MLogLoss + EvoTrees.softmax!(pred) end pred = K == 1 ? 
vec(Array(pred')) : Array(pred') return pred @@ -205,7 +193,7 @@ function softmax_kernel!(p::CuDeviceMatrix{T}) where {T} return nothing end -function softmax!(p::CuMatrix{T}; MAX_THREADS=1024) where {T} +function EvoTrees.softmax!(p::CuMatrix{T}; MAX_THREADS=1024) where {T} K, nobs = size(p) threads = min(MAX_THREADS, nobs) blocks = cld(nobs, threads) diff --git a/src/gpu/subsample.jl b/ext/EvoTreesCUDAExt/subsample.jl similarity index 95% rename from src/gpu/subsample.jl rename to ext/EvoTreesCUDAExt/subsample.jl index 2120347e..4328485f 100644 --- a/src/gpu/subsample.jl +++ b/ext/EvoTreesCUDAExt/subsample.jl @@ -51,7 +51,7 @@ function subsample_step_2_kernel(is_in, is_out, counts, counts_cum, chunk_size) sync_threads() end -function subsample(is_in::CuVector, is_out::CuVector, mask::CuVector, rowsample::AbstractFloat, rng) +function EvoTrees.subsample(is_in::CuVector, is_out::CuVector, mask::CuVector, rowsample::AbstractFloat, rng) get_rand_gpu!(mask) cond = round(UInt8, 255 * rowsample) chunk_size = cld(length(is_in), min(cld(length(is_in), 128), 2048)) diff --git a/src/EvoTrees.jl b/src/EvoTrees.jl index 9fb52b1a..62fa751d 100644 --- a/src/EvoTrees.jl +++ b/src/EvoTrees.jl @@ -18,8 +18,6 @@ using Distributions using Tables using CategoricalArrays using Tables -using CUDA -using CUDA: @allowscalar, allowscalar using BSON using NetworkLayout @@ -41,13 +39,9 @@ include("subsample.jl") include("fit-utils.jl") include("fit.jl") -include("gpu/loss.jl") -include("gpu/eval.jl") -include("gpu/predict.jl") -include("gpu/init.jl") -include("gpu/subsample.jl") -include("gpu/fit-utils.jl") -include("gpu/fit.jl") +if !isdefined(Base, :get_extension) + include("../ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl") +end include("callback.jl") include("importance.jl") diff --git a/src/callback.jl b/src/callback.jl index 6d12b4fe..5d9d6220 100644 --- a/src/callback.jl +++ b/src/callback.jl @@ -44,7 +44,8 @@ function CallBack( else y = T.(y_eval) end - w = isnothing(w_name) ? ones(T, size(y)) : Vector{T}(Tables.getcolumn(deval, _w_name)) + V = device_array_type(device) + w = isnothing(w_name) ? device_ones(device, T, length(y)) : V{T}(Tables.getcolumn(deval, _w_name)) offset = !isnothing(offset_name) ? T.(Tables.getcolumn(deval, _offset_name)) : nothing if !isnothing(offset) @@ -56,11 +57,7 @@ function CallBack( p .+= offset' end - if device <: GPU - return CallBack(feval, CuArray(x_bin), CuArray(p), CuArray(y), CuArray(w), CuArray(similar(w)), CuArray(m.info[:feattypes])) - else - return CallBack(feval, x_bin, p, y, w, similar(w), m.info[:feattypes]) - end + return CallBack(feval, convert(V, x_bin), convert(V, p), convert(V, y), w, similar(w), convert(V, m.info[:feattypes])) end function CallBack( @@ -92,7 +89,8 @@ function CallBack( else y = T.(y_eval) end - w = isnothing(w_eval) ? ones(T, size(y)) : Vector{T}(w_eval) + V = device_array_type(device) + w = isnothing(w_eval) ? device_ones(device, T, length(y)) : V{T}(w_eval) offset = !isnothing(offset_eval) ? 
T.(offset_eval) : nothing if !isnothing(offset) @@ -104,11 +102,7 @@ function CallBack( p .+= offset' end - if device <: GPU - return CallBack(feval, CuArray(x_bin), CuArray(p), CuArray(y), CuArray(w), CuArray(similar(w)), CuArray(m.info[:feattypes])) - else - return CallBack(feval, x_bin, p, y, w, similar(w), m.info[:feattypes]) - end + return CallBack(feval, convert(V, x_bin), convert(V, p), convert(V, y), w, similar(w), convert(V, m.info[:feattypes])) end function (cb::CallBack)(logger, iter, tree) @@ -149,4 +143,4 @@ function update_logger!(logger, iter, metric) logger[:iter_since_best] += logger[:iter][end] - logger[:iter][end-1] end end -end \ No newline at end of file +end diff --git a/src/fit.jl b/src/fit.jl index 4628b4f7..feb07d7a 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -397,10 +397,7 @@ function fit_evotree( (logger[:iter_since_best] >= logger[:early_stopping_rounds]) && break end end - if String(device) == "gpu" - GC.gc(true) - CUDA.reclaim() - end + post_fit_gc(_device) if return_logger return (m, logger) @@ -410,6 +407,8 @@ function fit_evotree( end +# A no-op on the CPU, but on the GPU we perform garbage collection +post_fit_gc(::Type{<:CPU}) = nothing """ fit_evotree( @@ -517,10 +516,7 @@ function fit_evotree( (logger[:iter_since_best] >= logger[:early_stopping_rounds]) && break end end - if _device <: GPU - GC.gc(true) - CUDA.reclaim() - end + post_fit_gc(_device) if return_logger return (m, logger) diff --git a/src/init.jl b/src/init.jl index f9930f97..1c77f6ee 100644 --- a/src/init.jl +++ b/src/init.jl @@ -173,19 +173,18 @@ function init( T = Float32 nobs = length(Tables.getcolumn(dtrain, 1)) y_train = Tables.getcolumn(dtrain, _target_name) - if device <: GPU - w = isnothing(w_name) ? CUDA.ones(T, nobs) : CuArray{T}(Tables.getcolumn(dtrain, _w_name)) - offset = !isnothing(offset_name) ? CuArray{T}(Tables.getcolumn(dtrain, _offset_name)) : nothing - else - w = isnothing(w_name) ? ones(T, nobs) : Vector{T}(Tables.getcolumn(dtrain, _w_name)) - offset = !isnothing(offset_name) ? T.(Tables.getcolumn(dtrain, _offset_name)) : nothing - end + V = device_array_type(device) + w = isnothing(w_name) ? device_ones(device, T, nobs) : V{T}(Tables.getcolumn(dtrain, _w_name)) + offset = isnothing(offset_name) ? nothing : V{T}(Tables.getcolumn(dtrain, _offset_name)) m, cache = init_core(params, device, dtrain, fnames, y_train, w, offset) return m, cache end +# This should be different on CPUs and GPUs +device_ones(::Type{<:CPU}, ::Type{T}, n::Int) where {T} = ones(T, n) +device_array_type(::Type{<:CPU}) = Array """ init( @@ -216,13 +215,9 @@ function init( T = Float32 nobs = size(x_train, 1) - if device <: GPU - w = isnothing(w_train) ? CUDA.ones(T, nobs) : CuArray{T}(w_train) - offset = !isnothing(offset_train) ? CuArray{T}(offset_train) : nothing - else - w = isnothing(w_train) ? ones(T, nobs) : Vector{T}(w_train) - offset = !isnothing(offset_train) ? T.(offset_train) : nothing - end + V = device_array_type(device) + w = isnothing(w_train) ? device_ones(device, T, nobs) : V{T}(w_train) + offset = isnothing(offset_train) ? nothing : V{T}(offset_train) m, cache = init_core(params, device, x_train, fnames, y_train, w, offset)
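
A note on the `[weakdeps]` / `[extensions]` entries added to Project.toml above: on Julia ≥ 1.9, Pkg compiles and loads `EvoTreesCUDAExt` automatically once both EvoTrees and CUDA.jl are present in the active environment, so the GPU code paths cost nothing for CPU-only users. A minimal sketch of the resulting workflow, in the spirit of the docs addition; the toy data and the `EvoTreeRegressor` settings are placeholders, not part of this diff:

```julia
using EvoTrees
using CUDA  # loading CUDA.jl is what triggers EvoTreesCUDAExt on Julia ≥ 1.9

# stand-in data; any numeric matrix/vector pair works
x_train = randn(Float32, 1_000, 10)
y_train = randn(Float32, 1_000)

config = EvoTreeRegressor(nrounds=100)

# `device="gpu"` routes training and inference through the CUDA methods
m = fit_evotree(config; x_train, y_train, device="gpu")
p = m(x_train; device="gpu")
```

On Julia < 1.9 the extension mechanism does not exist, which is why `src/EvoTrees.jl` falls back to `include`-ing the extension sources directly behind the `!isdefined(Base, :get_extension)` guard.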
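The bulk of the remaining changes are mechanical renames of the form `mse` → `EvoTrees.mse` and `update_grads!` → `EvoTrees.update_grads!`. They matter because the GPU files no longer live inside the `EvoTrees` module: in an extension, a definition only attaches a new method to the existing EvoTrees function when it is qualified with the parent module, whereas an unqualified `function mse(...)` would silently create a fresh function local to `EvoTreesCUDAExt` that EvoTrees' own dispatch never reaches. A toy illustration of that rule, with module and function names invented for the example:

```julia
module Lib
loss(x::Vector) = sum(abs2, x)   # stand-in for a CPU method in the main package
evaluate(x) = loss(x)            # library code always dispatches on Lib.loss
end

module LibExt
using ..Lib
Lib.loss(x::Tuple) = sum(abs2, x)  # qualified: adds a method to Lib.loss
loss(x::Tuple) = zero(eltype(x))   # unqualified: a new LibExt.loss, never seen by Lib
end

Lib.evaluate([3.0, 4.0])   # 25.0, via the original Vector method
Lib.evaluate((3.0, 4.0))   # 25.0, via the extension's Tuple method
```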
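Similarly, `device_ones`, `device_array_type`, and `post_fit_gc` turn the former `if device <: GPU` branches in `init.jl`, `callback.jl`, and `fit.jl` into plain dispatch: EvoTrees ships the CPU methods shown above, and the extension overloads them for `GPU`. A self-contained sketch of the pattern; the device types here are stand-ins, while the hook names match the diff:

```julia
abstract type Device end
abstract type CPU <: Device end
abstract type GPU <: Device end

# CPU fallbacks, as defined in the main package
device_array_type(::Type{<:CPU}) = Array
device_ones(::Type{<:CPU}, ::Type{T}, n::Int) where {T} = ones(T, n)
post_fit_gc(::Type{<:CPU}) = nothing  # GC/reclaim only matters on GPU

# The CUDA extension would add the GPU methods, e.g.
# device_array_type(::Type{<:GPU}) = CuArray
# device_ones(::Type{<:GPU}, ::Type{T}, n::Int) where {T} = CUDA.ones(T, n)

# Call sites become device-agnostic, as in init.jl and callback.jl:
function make_weights(device, n)
    V = device_array_type(device)
    w = device_ones(device, Float32, n)
    return convert(V, w)
end

make_weights(CPU, 4)  # 4-element Vector{Float32} of ones
```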
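Finally, every loss and eval function in the diff launches its kernel with the same one-thread-per-observation recipe: cap the threads per block at `MAX_THREADS`, cover the remainder with `cld(n, threads)` blocks, and bound-check `i <= length(y)` inside the kernel because the last block is generally only partially filled. A minimal standalone kernel using the identical recipe; `scale!` is an invented example, not part of EvoTrees:

```julia
using CUDA

function scale_kernel!(x::CuDeviceVector{T}, a::T) where {T}
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(x)  # guard the tail threads of the last block
        @inbounds x[i] *= a
    end
    return nothing
end

function scale!(x::CuVector{T}, a::T; MAX_THREADS=1024) where {T}
    threads = min(MAX_THREADS, length(x))
    blocks = cld(length(x), threads)
    @cuda blocks = blocks threads = threads scale_kernel!(x, a)
    CUDA.synchronize()
    return x
end

scale!(CUDA.ones(Float32, 2_000), 2.0f0)  # 2 blocks × 1024 threads cover 2000 elements
```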