Skip to content

Commit

Permalink
Merge pull request #261 from Evovest/debug-gpu
Browse files Browse the repository at this point in the history
Debug gpu
  • Loading branch information
jeremiedb authored Oct 13, 2023
2 parents 4458a63 + c7fe1f0 commit 535b0c4
Show file tree
Hide file tree
Showing 11 changed files with 219 additions and 1,034 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.16.3"
version = "0.16.4"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
Expand Down
202 changes: 202 additions & 0 deletions experiments/hist/perf-gpu.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
using Revise
using CUDA
using StatsBase: sample
using BenchmarkTools
using Base.Threads: @threads
using Random: seed!

"""
    hist_kernel!(h∇, ∇, x_bin, is, js)

CUDA kernel that accumulates a histogram of `∇` into `h∇`.

For each position in `js` (giving a column `jdx`) and each position in `is`
(giving a row `idx`), the value `∇[k, idx]` is converted to `T` and atomically
added to `h∇[k, x_bin[idx, jdx], jdx]`.

Thread layout, as read from the index computations below:
- `threadIdx().x` is used directly as `k`, the first index of both `∇` and `h∇`;
- the `y` thread/block indices select the position within `js`;
- the `z` thread/block indices select the starting position within `is`, and each
  thread then advances by `blockDim().z * gridDim().z` positions per iteration.

Returns `nothing`; `h∇` is updated in place and is not cleared by the kernel.
"""
function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S}
    tid = threadIdx()
    bid = blockIdx()
    bdim = blockDim()
    gdim = gridDim()

    k = tid.x
    jpos = tid.y + bdim.y * (bid.y - 1)
    if jpos <= length(js)
        feat = js[jpos]
        nsel = length(is)
        # Number of passes needed so that all z-threads together cover `is`.
        nsweeps = cld(nsel, bdim.z * gdim.z)
        @inbounds for sweep in 1:nsweeps
            ipos = tid.z + bdim.z * (bid.z - 1) + bdim.z * gdim.z * (sweep - 1)
            if ipos <= nsel
                obs = is[ipos]
                bin = x_bin[obs, feat]
                slot = Base._to_linear_index(h∇, k, bin, feat)
                # Several threads may hit the same (k, bin, feat) cell, hence the atomic add.
                CUDA.atomic_add!(pointer(h∇, slot), T(∇[k, obs]))
            end
        end
    end
    # Kept outside the conditional so every thread of the block reaches it.
    sync_threads()
    return nothing
end

"""
    update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)

Baseline variant: build the histogram on the device, then copy it to the host one
feature at a time.

Steps:
1. compile `hist_kernel!` without launching it and query a launch configuration;
2. zero `h∇` and launch the kernel over the positions of `is` and `js`;
3. for every `j` in `jsc`, copy the first `size(h[j], 2)` bins of `h∇[:, :, j]`
   into `h[j]`.

# Arguments
- `h`: indexable collection of destination arrays, one per feature (`h[j]`).
- `h∇`: 3-dimensional device array receiving the histogram; overwritten.
- `∇`, `x_bin`, `is`, `js`: forwarded unchanged to `hist_kernel!`.
- `jsc`: iterable of feature indices used for the per-feature copy.

Returns `nothing`.
"""
function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
    config = launch_configuration(kernel.fun)
    # NOTE(review): the ÷ 4 / * 4 factors look like empirical tuning of the suggested
    # configuration — confirm before changing.
    max_threads = config.threads ÷ 4
    max_blocks = config.blocks * 4
    k = size(h∇, 1)
    ty = max(1, min(length(js), fld(max_threads, k)))
    # `tx` is used as the z-dimension of the thread block; the cap of 64 presumably
    # mirrors the CUDA limit on that dimension — TODO confirm.
    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
    threads = (k, ty, tx)
    # Clamp both block counts to at least 1: with empty `js`, `by` would be 0 and
    # `cld(max_blocks, by)` would throw a DivideError; with empty `is`, `bx` would be 0,
    # which is not a valid grid dimension. For non-empty inputs the clamp is a no-op.
    by = max(1, cld(length(js), ty))
    bx = max(1, min(cld(max_blocks, by), cld(length(is), tx)))
    blocks = (1, by, bx)
    h∇ .= 0
    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
    CUDA.synchronize()
    CUDA.@sync for j in jsc
        nbins = size(h[j], 2)
        copyto!(h[j], view(h∇, :, 1:nbins, j))
    end
    return nothing
end

"""
    update_hist_gpu1!(h, h∇, ∇, x_bin, is, js, jsc)

Kernel-only variant: zero `h∇`, launch `hist_kernel!`, and wait for the device with
`CUDA.synchronize()`. Nothing is copied back to the host.

`h` and `jsc` are not used by this variant; they are kept so the signature matches
`update_hist_gpu!` and the benchmark calls stay interchangeable.

Returns `nothing`.
"""
function update_hist_gpu1!(h, h∇, ∇, x_bin, is, js, jsc)
    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
    config = launch_configuration(kernel.fun)
    # NOTE(review): the ÷ 4 / * 4 factors look like empirical tuning of the suggested
    # configuration — confirm before changing.
    max_threads = config.threads ÷ 4
    max_blocks = config.blocks * 4
    k = size(h∇, 1)
    ty = max(1, min(length(js), fld(max_threads, k)))
    # `tx` is used as the z-dimension of the thread block; the cap of 64 presumably
    # mirrors the CUDA limit on that dimension — TODO confirm.
    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
    threads = (k, ty, tx)
    # Clamp both block counts to at least 1: with empty `js`, `by` would be 0 and
    # `cld(max_blocks, by)` would throw a DivideError; with empty `is`, `bx` would be 0,
    # which is not a valid grid dimension. For non-empty inputs the clamp is a no-op.
    by = max(1, cld(length(js), ty))
    bx = max(1, min(cld(max_blocks, by), cld(length(is), tx)))
    blocks = (1, by, bx)
    h∇ .= 0
    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
    CUDA.synchronize()
    return nothing
end

"""
    update_hist_gpu2!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)

Bulk-copy variant: zero `h∇`, launch `hist_kernel!`, then copy the whole of `h∇`
into `h∇_cpu` with a single `copyto!`, followed by `CUDA.synchronize()`.

# Arguments
- `h∇_cpu`: destination of the single bulk copy; overwritten.
- `h∇`: 3-dimensional device array receiving the histogram; overwritten.
- `∇`, `x_bin`, `is`, `js`: forwarded unchanged to `hist_kernel!`.
- `h`, `jsc`: not used by this variant; kept for signature parity with
  `update_hist_gpu3!`.

Returns `nothing`.
"""
function update_hist_gpu2!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
    config = launch_configuration(kernel.fun)
    # NOTE(review): the ÷ 4 / * 4 factors look like empirical tuning of the suggested
    # configuration — confirm before changing.
    max_threads = config.threads ÷ 4
    max_blocks = config.blocks * 4
    k = size(h∇, 1)
    ty = max(1, min(length(js), fld(max_threads, k)))
    # `tx` is used as the z-dimension of the thread block; the cap of 64 presumably
    # mirrors the CUDA limit on that dimension — TODO confirm.
    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
    threads = (k, ty, tx)
    # Clamp both block counts to at least 1: with empty `js`, `by` would be 0 and
    # `cld(max_blocks, by)` would throw a DivideError; with empty `is`, `bx` would be 0,
    # which is not a valid grid dimension. For non-empty inputs the clamp is a no-op.
    by = max(1, cld(length(js), ty))
    bx = max(1, min(cld(max_blocks, by), cld(length(is), tx)))
    blocks = (1, by, bx)
    h∇ .= 0
    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
    copyto!(h∇_cpu, h∇)
    CUDA.synchronize()
    return nothing
end


"""
    update_hist_gpu3!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)

Bulk-copy-then-split variant: zero `h∇`, launch `hist_kernel!`, copy the whole of
`h∇` into `h∇_cpu` with a single `copyto!`, then fill each `h[j]` (for `j` in `jsc`)
from `h∇_cpu[:, 1:size(h[j], 2), j]` using a `@threads` loop.

# Arguments
- `h`: indexable collection of destination arrays, one per feature (`h[j]`).
- `h∇_cpu`: staging array for the single bulk copy; overwritten.
- `h∇`: 3-dimensional device array receiving the histogram; overwritten.
- `∇`, `x_bin`, `is`, `js`: forwarded unchanged to `hist_kernel!`.
- `jsc`: collection of feature indices iterated by the `@threads` loop.

Returns `nothing`.
"""
function update_hist_gpu3!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
    config = launch_configuration(kernel.fun)
    # NOTE(review): the ÷ 4 / * 4 factors look like empirical tuning of the suggested
    # configuration — confirm before changing.
    max_threads = config.threads ÷ 4
    max_blocks = config.blocks * 4
    k = size(h∇, 1)
    ty = max(1, min(length(js), fld(max_threads, k)))
    # `tx` is used as the z-dimension of the thread block; the cap of 64 presumably
    # mirrors the CUDA limit on that dimension — TODO confirm.
    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
    threads = (k, ty, tx)
    # Clamp both block counts to at least 1: with empty `js`, `by` would be 0 and
    # `cld(max_blocks, by)` would throw a DivideError; with empty `is`, `bx` would be 0,
    # which is not a valid grid dimension. For non-empty inputs the clamp is a no-op.
    by = max(1, cld(length(js), ty))
    bx = max(1, min(cld(max_blocks, by), cld(length(is), tx)))
    blocks = (1, by, bx)
    h∇ .= 0
    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
    # No explicit CUDA.synchronize() before or after the copy in this variant.
    # NOTE(review): this relies on `copyto!` into a host array having completed when it
    # returns — confirm for the CUDA.jl version in use.
    copyto!(h∇_cpu, h∇)
    # Each iteration writes a distinct `h[j]`, so iterations are independent provided
    # `jsc` holds no duplicate indices.
    @threads for j in jsc
        nbins = size(h[j], 2)
        @views h[j] .= h∇_cpu[:, 1:nbins, j]
    end
    return nothing
end


# --- Benchmark data -------------------------------------------------------------------
# Synthetic problem: `nobs` observations × `nfeats` features binned into `nbins` bins,
# with a 3-row gradient matrix `∇`.
seed!(123)
nbins = 32
nfeats = 100
nobs = 1_000_000
x_bin = UInt8.(rand(1:nbins, nobs, nfeats));
∇ = rand(Float32, 3, nobs);
h∇ = [zeros(Float32, 3, nbins) for _ in 1:nfeats]
rowsample = 0.5
colsample = 0.5
# Row subsample is sized by `rowsample`, feature subsample by `colsample`.
is = sample(1:nobs, round(Int, rowsample * nobs), replace=false, ordered=true)
js = sample(1:nfeats, round(Int, colsample * nfeats), replace=false, ordered=true)

# Device copies of the inputs plus host/device histogram buffers.
∇_gpu = CuArray(∇)
x_bin_gpu = CuArray(x_bin)
h∇_cpu = zeros(Float32, 3, nbins, nfeats)
h∇_gpu = CuArray(h∇_cpu)
is_gpu = CuArray(is)
js_gpu = CuArray(js)

CUDA.allowscalar(false)
# First call: includes compilation, so it is timed separately from the @btime runs.
CUDA.@time update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
# ref without copy to cpu: ~same
# ref 10K: 875.100 μs (168 allocations: 7.08 KiB)
# ref 100K: 1.236 ms (215 allocations: 9.91 KiB)
# ref 1M: 6.138 ms (227 allocations: 12.00 KiB)
# ref 10M: 67.075 ms (235 allocations: 13.38 KiB)

# Globals are `$`-interpolated in every @btime call so that access to untyped globals
# is not part of the measurement. The reference timings in the comments below were
# recorded without interpolation.

# kernel + per-feature device→host copy
# CUDA v4 1M: 2.903 ms (124 allocations: 6.98 KiB)
# CUDA v5 1M: 3.542 ms (848 allocations: 37.14 KiB)
@btime update_hist_gpu!($h∇, $h∇_gpu, $∇_gpu, $x_bin_gpu, $is_gpu, $js_gpu, $js)

# kernel only, no copy
# CUDA v4 1M: 2.599 ms (74 allocations: 4.64 KiB)
# CUDA v5 1M: 2.274 ms (48 allocations: 2.77 KiB)
@btime update_hist_gpu1!($h∇, $h∇_gpu, $∇_gpu, $x_bin_gpu, $is_gpu, $js_gpu, $js)

# kernel + one bulk device→host copy of the whole histogram
# CUDA v4 1M:
# CUDA v5 1M: 2.447 ms (48 allocations: 2.77 KiB)
@btime update_hist_gpu2!($h∇, $h∇_cpu, $h∇_gpu, $∇_gpu, $x_bin_gpu, $is_gpu, $js_gpu, $js)

# kernel + one bulk device→host copy + threaded split into per-feature arrays
# CUDA v4 1M:
# CUDA v5 1M: 2.442 ms (48 allocations: 2.77 KiB)
@btime update_hist_gpu3!($h∇, $h∇_cpu, $h∇_gpu, $∇_gpu, $x_bin_gpu, $is_gpu, $js_gpu, $js)


using CUDA, BenchmarkTools
"""
    gpu_copy!(h, h∇, jsc)

For every `j` in `jsc`, copy the first `size(h[j], 2)` columns of the slice
`h∇[:, :, j]` into `h[j]`. The whole loop runs under `CUDA.@sync`.

Returns `nothing`.
"""
function gpu_copy!(h, h∇, jsc)
    CUDA.@sync begin
        for feat in jsc
            dest = h[feat]
            ncols = size(dest, 2)
            copyto!(dest, view(h∇, :, 1:ncols, feat))
        end
    end
    return nothing
end

# Stand-alone reproduction of the per-feature copy: 100 host matrices of size 3×32
# filled from a 3×32×100 device array.
h∇ = [zeros(Float32, 3, 32) for _ in 1:100];
h∇_gpu = CUDA.zeros(Float32, 3, 32, 100);
js = 1:100

# Globals are `$`-interpolated so that untyped-global access is not measured; the
# reference timings below were recorded without interpolation.
# CUDA v4: 534.480 μs (100 allocations: 4.69 KiB)
# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB)
@btime gpu_copy!($h∇, $h∇_gpu, $js)


"""
    gpu_copy2!(h, h∇, jsc)

Task-based variant of `gpu_copy!`: for every `j` in `jsc`, spawn a task that copies
the first `size(h[j], 2)` columns of `h∇[:, :, j]` into `h[j]`, and wait for all of
those tasks before returning.

Without the enclosing `@sync` the function would return while copies are still
pending, so callers could read stale data from `h`, a benchmark would mostly time
task creation, and an exception raised inside a task would go unnoticed.

Returns `nothing`.
"""
function gpu_copy2!(h, h∇, jsc)
    @sync for j in jsc
        nbins = size(h[j], 2)
        # One task per feature; `@sync` above waits for every one of them.
        @async copyto!(h[j], view(h∇, :, 1:nbins, j))
    end
    return nothing
end

# Same fixture as for `gpu_copy!`: 100 host matrices of size 3×32 filled from a
# 3×32×100 device array.
h∇ = [zeros(Float32, 3, 32) for _ in 1:100];
h∇_gpu = CUDA.zeros(Float32, 3, 32, 100);
js = 1:100

# NOTE(review): the two reference timings below are identical to the ones recorded for
# `gpu_copy!` and look copy-pasted — re-measure for `gpu_copy2!`.
# Globals are `$`-interpolated so that untyped-global access is not measured.
# CUDA v4: 534.480 μs (100 allocations: 4.69 KiB)
# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB)
@btime gpu_copy2!($h∇, $h∇_gpu, $js)
Loading

2 comments on commit 535b0c4

@jeremiedb
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/93323

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.16.4 -m "<description of version>" 535b0c47e7acb726e5d6591588deeceecadbacbe
git push origin v0.16.4

Please sign in to comment.