Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU support #56

Merged
merged 8 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ authors = ["Patrick Sanan <[email protected]>, Albert De Montserrat <alber
version = "0.1.0"

[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CellArrays = "d35fcfd7-7af4-4c67-b1aa-d78070614af4"
GeoParams = "e018b62d-d9de-4a26-8697-af89c310ae38"
Expand All @@ -16,7 +17,6 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"

[compat]
Expand Down
2 changes: 1 addition & 1 deletion src/IO/DataIO.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module DataIO
using WriteVTK
using HDF5
using MPI
using CUDA
using CUDA, AMDGPU

import ..JustRelax: Geometry

Expand Down
1 change: 1 addition & 0 deletions src/IO/H5.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ end

_tocpu(x) = x
_tocpu(x::T) where {T<:CuArray} = Array(x)
_tocpu(x::T) where {T<:ROCArray} = Array(x)

"""
checkpointing(dst, stokes, T, η, time)
Expand Down
2 changes: 1 addition & 1 deletion src/JustRelax.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ using Reexport
@reexport using ImplicitGlobalGrid
using LinearAlgebra
using Printf
using CUDA
using CUDA, AMDGPU
using MPI
using GeoParams
using HDF5
Expand Down
11 changes: 9 additions & 2 deletions src/MetaJustRelax.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,26 @@ struct PS_Setup{B,C}
end

function environment!(model::PS_Setup{T,N}) where {T,N}
gpu = model.device == :gpu ? true : false

# call appropriate FD module
Base.eval(@__MODULE__, Meta.parse("using ParallelStencil.FiniteDifferences$(N)D"))
Base.eval(Main, Meta.parse("using ParallelStencil.FiniteDifferences$(N)D"))

# start ParallelStencil
if model.device == :gpu
if model.device == :CUDA
eval(:(@init_parallel_stencil(CUDA, $T, $N)))
Base.eval(Main, Meta.parse("using CUDA"))
if !isconst(Main, :PTArray)
eval(:(const PTArray = CUDA.CuArray{$T,$N,CUDA.Mem.DeviceBuffer}))
end

elseif model.device == :AMDGPU
eval(:(@init_parallel_stencil(AMDGPU, $T, $N)))
Base.eval(Main, Meta.parse("using AMDGPU"))
if !isconst(Main, :PTArray)
eval(:(const PTArray = AMDGPU.ROCArray{$T,$N,AMDGPU.Runtime.Mem.HIPBuffer}))
end

else
@eval begin
@init_parallel_stencil(Threads, $T, $N)
Expand Down
56 changes: 17 additions & 39 deletions src/Utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -291,44 +291,13 @@ Compute the maximum value of `A` in the `window = (width_x, width_y, width_z)` a
"""
function compute_maxloc!(B, A; window=(1, 1, 1))
ni = size(A)
width_x, width_y, width_z = window

@parallel_indices (i, j) function _maxloc!(
B::T, A::T
) where {T<:AbstractArray{<:Number,2}}
B[i, j] = _maxloc_window_clamped(A, i, j, width_x, width_y)
@parallel_indices (I...) function _maxloc!(B, A, window)
B[I...] = _maxloc_window_clamped(A, I..., window...)
return nothing
end

@parallel_indices (i, j, k) function _maxloc!(
B::T, A::T
) where {T<:AbstractArray{<:Number,3}}
B[i, j, k] = _maxloc_window_clamped(A, i, j, k, width_x, width_y, width_z)
return nothing
end

@parallel (@idx ni) _maxloc!(B, A)
end

function _compute_maxloc!(B, A, window)
I = indices(window)

if all(I .<= size(A))
B[I...] = JustRelax._maxloc_window_clamped(A, I..., window...)
end

return nothing
end

function compute_maxloc!(
B::CuArray{T1,N,T2}, A::CuArray{T1,N,T2}; window=ntuple(i -> 1, Val(N))
) where {T1,T2,N}
nx, ny = size(A)
ntx, nty = 32, 32
blkx, blky = ceil(Int, nx / ntx), ceil(Int, ny / nty)
CUDA.@sync begin
@cuda threads = (ntx, nty) blocks = (blkx, blky) _compute_maxloc!(B, A, window)
end
@parallel (@idx ni) _maxloc!(B, A, window)
return nothing
end

Expand Down Expand Up @@ -404,7 +373,7 @@ Compute the time step `dt` for the velocity field `S.V` and the diffusive maximu

@inline function compute_dt(V::NTuple, di, dt_diff)
n = inv(length(V) + 0.1)
dt_adv = mapreduce(x -> x[1] * inv(maximum(abs.(x[2]))), min, zip(di, V)) * n
dt_adv = mapreduce(x -> x[1] * inv(maximum_mpi(abs.(x[2]))), min, zip(di, V)) * n
return min(dt_diff, dt_adv)
end
"""
Expand Down Expand Up @@ -454,21 +423,30 @@ end
# MPI reductions

function mean_mpi(A)
mean_l = mean(A)
mean_l = _mean(A)
return MPI.Allreduce(mean_l, MPI.SUM, MPI.COMM_WORLD) / MPI.Comm_size(MPI.COMM_WORLD)
end

function norm_mpi(A)
sum2_l = sum(A .^ 2)
sum2_l = _sum(A .^ 2)
return sqrt(MPI.Allreduce(sum2_l, MPI.SUM, MPI.COMM_WORLD))
end

function minimum_mpi(A)
min_l = minimum(A)
min_l = _minimum(A)
return MPI.Allreduce(min_l, MPI.MIN, MPI.COMM_WORLD)
end

function maximum_mpi(A)
max_l = maximum(A)
max_l = _maximum(A)
return MPI.Allreduce(max_l, MPI.MAX, MPI.COMM_WORLD)
end

for (f1, f2) in zip(
(:_mean, :_norm, :_minimum, :_maximum, :_sum), (:mean, :norm, :minimum, :maximum, :sum)
)
@eval begin
$f1(A::ROCArray) = $f2(Array(A))
$f1(A) = $f2(A)
end
end
7 changes: 5 additions & 2 deletions src/phases/CallArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,11 @@ end
## Fallbacks
import Base: getindex, setindex!

@inline element(A::Union{Array,CuArray}, I::Vararg{Int,N}) where {N} = getindex(A, I...)
@inline function setelement!(A::Union{Array,CuArray}, x::Number, I::Vararg{Int,N}) where {N}
@inline element(A::Union{Array,CuArray,ROCArray}, I::Vararg{Int,N}) where {N} =
getindex(A, I...)
@inline function setelement!(
A::Union{Array,CuArray,ROCArray}, x::Number, I::Vararg{Int,N}
) where {N}
return setindex!(A, x, I...)
end

Expand Down
28 changes: 13 additions & 15 deletions src/stokes/Stokes2D.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ module Stokes2D

using ImplicitGlobalGrid
using ..JustRelax
using CUDA
using CUDA, AMDGPU
using ParallelStencil
using ParallelStencil.FiniteDifferences2D
using GeoParams, LinearAlgebra, Printf, TimerOutputs
using GeoParams, LinearAlgebra, Printf

import JustRelax: elastic_iter_params!, PTArray, Velocity, SymmetricTensor
import JustRelax:
Expand Down Expand Up @@ -100,7 +100,7 @@ function JustRelax.solve!(
# ~preconditioner
ητ = deepcopy(η)
# @hide_communication b_width begin # communication/computation overlap
compute_maxloc!(ητ, η)
compute_maxloc!(ητ, η; window=(1, 1))
update_halo!(ητ)
# end

Expand Down Expand Up @@ -222,7 +222,7 @@ function JustRelax.solve!(
# ~preconditioner
ητ = deepcopy(η)
# @hide_communication b_width begin # communication/computation overlap
compute_maxloc!(ητ, η; window=(1, 1, 1))
compute_maxloc!(ητ, η; window=(1, 1))
update_halo!(ητ)
# end

Expand Down Expand Up @@ -344,7 +344,7 @@ function JustRelax.solve!(
# ~preconditioner
ητ = deepcopy(η)
# @hide_communication b_width begin # communication/computation overlap
compute_maxloc!(ητ, η)
compute_maxloc!(ητ, η; window=(1, 1))
update_halo!(ητ)
# end

Expand Down Expand Up @@ -379,7 +379,7 @@ function JustRelax.solve!(
@parallel (@idx ni) compute_viscosity!(
η, ν, @strain(stokes)..., args, rheology, viscosity_cutoff
)
compute_maxloc!(ητ, η)
compute_maxloc!(ητ, η; window=(1, 1))
update_halo!(ητ)

@parallel (@idx ni) compute_τ_nonlinear!(
Expand Down Expand Up @@ -488,7 +488,7 @@ function JustRelax.solve!(
# ~preconditioner
ητ = deepcopy(η)
# @hide_communication b_width begin # communication/computation overlap
compute_maxloc!(ητ, η)
compute_maxloc!(ητ, η; window=(1, 1))
update_halo!(ητ)
# end

Expand All @@ -509,7 +509,6 @@ function JustRelax.solve!(
# solver loop
wtime0 = 0.0
λ = @zeros(ni...)
to = TimerOutput()
η0 = deepcopy(η)
do_visc = true
GC.enable(false)
Expand All @@ -530,9 +529,9 @@ function JustRelax.solve!(
θ_dτ,
)

# if rem(iter, 5) == 0
# @timeit to "ρg" @parallel (@idx ni) compute_ρg!(ρg[2], phase_ratios.center, rheology, args)
# end
if rem(iter, 5) == 0
@parallel (@idx ni) compute_ρg!(ρg[2], phase_ratios.center, rheology, args)
end

@parallel (@idx ni .+ 1) compute_strain_rate!(
@strain(stokes)..., stokes.∇V, @velocity(stokes)..., _di...
Expand All @@ -543,7 +542,7 @@ function JustRelax.solve!(
end
if do_visc
ν = 1e-2
@timeit to "viscosity" compute_viscosity!(
compute_viscosity!(
η,
ν,
phase_ratios.center,
Expand All @@ -553,7 +552,7 @@ function JustRelax.solve!(
viscosity_cutoff,
)
end
compute_maxloc!(ητ, η)
compute_maxloc!(ητ, η; window=(1, 1))
update_halo!(ητ)

@parallel (@idx ni) compute_τ_nonlinear!(
Expand Down Expand Up @@ -635,7 +634,6 @@ function JustRelax.solve!(
norm_Ry=norm_Ry,
norm_∇V=norm_∇V,
)
return to
end

function JustRelax.solve!(
Expand Down Expand Up @@ -667,7 +665,7 @@ function JustRelax.solve!(
# ~preconditioner
ητ = deepcopy(η)
# @hide_communication b_width begin # communication/computation overlap
compute_maxloc!(ητ, η)
compute_maxloc!(ητ, η; window=(1, 1))
update_halo!(ητ)
# end

Expand Down
2 changes: 1 addition & 1 deletion src/stokes/Stokes3D.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ using ImplicitGlobalGrid
using ParallelStencil
using ParallelStencil.FiniteDifferences3D
using JustRelax
using CUDA
using CUDA, AMDGPU
using LinearAlgebra
using Printf
using GeoParams
Expand Down
6 changes: 3 additions & 3 deletions src/thermal_diffusion/DiffusionExplicit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ using ParallelStencil.FiniteDifferences1D
using JustRelax
using LinearAlgebra
using Printf
using CUDA
using CUDA, AMDGPU

import JustRelax:
ThermalParameters, solve!, assign!, thermal_boundary_conditions!, update_T!
Expand Down Expand Up @@ -155,7 +155,7 @@ module ThermalDiffusion2D
using ParallelStencil
using ParallelStencil.FiniteDifferences2D
using JustRelax
using CUDA
using CUDA, AMDGPU
using GeoParams

import JustRelax: ThermalParameters, solve!, assign!, thermal_boundary_conditions!
Expand Down Expand Up @@ -511,7 +511,7 @@ using ParallelStencil.FiniteDifferences3D
using JustRelax
using MPI
using Printf
using CUDA
using CUDA, AMDGPU
using GeoParams

import JustRelax:
Expand Down