PTsolvers · albert-de-montserrat · Oct 16, 2023 · Oct 13, 2023 · Oct 13, 2023 · Oct 13, 2023
diff --git a/Project.toml b/Project.toml
@@ -4,6 +4,7 @@ authors = ["Patrick Sanan <[email protected]>, Albert De Montserrat <alber
 version = "0.1.0"
 
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CellArrays = "d35fcfd7-7af4-4c67-b1aa-d78070614af4"
 GeoParams = "e018b62d-d9de-4a26-8697-af89c310ae38"
@@ -16,7 +17,6 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 
 [compat]

diff --git a/src/IO/DataIO.jl b/src/IO/DataIO.jl
@@ -3,7 +3,7 @@ module DataIO
 using WriteVTK
 using HDF5
 using MPI
-using CUDA
+using CUDA, AMDGPU
 
 import ..JustRelax: Geometry
 

diff --git a/src/IO/H5.jl b/src/IO/H5.jl
@@ -10,6 +10,7 @@ end
 
 _tocpu(x) = x
 _tocpu(x::T) where {T<:CuArray} = Array(x)
+_tocpu(x::T) where {T<:ROCArray} = Array(x)
 
 """
     checkpointing(dst, stokes, T, η, time)

diff --git a/src/JustRelax.jl b/src/JustRelax.jl
@@ -5,7 +5,7 @@ using Reexport
 @reexport using ImplicitGlobalGrid
 using LinearAlgebra
 using Printf
-using CUDA
+using CUDA, AMDGPU
 using MPI
 using GeoParams
 using HDF5

diff --git a/src/MetaJustRelax.jl b/src/MetaJustRelax.jl
@@ -7,19 +7,26 @@ struct PS_Setup{B,C}
 end
 
 function environment!(model::PS_Setup{T,N}) where {T,N}
-    gpu = model.device == :gpu ? true : false
 
     # call appropriate FD module
     Base.eval(@__MODULE__, Meta.parse("using ParallelStencil.FiniteDifferences$(N)D"))
     Base.eval(Main, Meta.parse("using ParallelStencil.FiniteDifferences$(N)D"))
 
     # start ParallelStencil
-    if model.device == :gpu
+    if model.device == :CUDA
         eval(:(@init_parallel_stencil(CUDA, $T, $N)))
         Base.eval(Main, Meta.parse("using CUDA"))
         if !isconst(Main, :PTArray)
             eval(:(const PTArray = CUDA.CuArray{$T,$N,CUDA.Mem.DeviceBuffer}))
         end
+
+    elseif model.device == :AMDGPU
+        eval(:(@init_parallel_stencil(AMDGPU, $T, $N)))
+        Base.eval(Main, Meta.parse("using AMDGPU"))
+        if !isconst(Main, :PTArray)
+            eval(:(const PTArray = AMDGPU.ROCArray{$T,$N,AMDGPU.Runtime.Mem.HIPBuffer}))
+        end
+
     else
         @eval begin
             @init_parallel_stencil(Threads, $T, $N)

diff --git a/src/Utils.jl b/src/Utils.jl
@@ -291,44 +291,13 @@ Compute the maximum value of `A` in the `window = (width_x, width_y, width_z)` a
 """
 function compute_maxloc!(B, A; window=(1, 1, 1))
     ni = size(A)
-    width_x, width_y, width_z = window
 
-    @parallel_indices (i, j) function _maxloc!(
-        B::T, A::T
-    ) where {T<:AbstractArray{<:Number,2}}
-        B[i, j] = _maxloc_window_clamped(A, i, j, width_x, width_y)
+    @parallel_indices (I...) function _maxloc!(B, A, window)
+        B[I...] = _maxloc_window_clamped(A, I..., window...)
         return nothing
     end
 
-    @parallel_indices (i, j, k) function _maxloc!(
-        B::T, A::T
-    ) where {T<:AbstractArray{<:Number,3}}
-        B[i, j, k] = _maxloc_window_clamped(A, i, j, k, width_x, width_y, width_z)
-        return nothing
-    end
-
-    @parallel (@idx ni) _maxloc!(B, A)
-end
-
-function _compute_maxloc!(B, A, window)
-    I = indices(window)
-
-    if all(I .<= size(A))
-        B[I...] = JustRelax._maxloc_window_clamped(A, I..., window...)
-    end
-
-    return nothing
-end
-
-function compute_maxloc!(
-    B::CuArray{T1,N,T2}, A::CuArray{T1,N,T2}; window=ntuple(i -> 1, Val(N))
-) where {T1,T2,N}
-    nx, ny = size(A)
-    ntx, nty = 32, 32
-    blkx, blky = ceil(Int, nx / ntx), ceil(Int, ny / nty)
-    CUDA.@sync begin
-        @cuda threads = (ntx, nty) blocks = (blkx, blky) _compute_maxloc!(B, A, window)
-    end
+    @parallel (@idx ni) _maxloc!(B, A, window)
     return nothing
 end
 
@@ -404,7 +373,7 @@ Compute the time step `dt` for the velocity field `S.V` and the diffusive maximu
 
 @inline function compute_dt(V::NTuple, di, dt_diff)
     n = inv(length(V) + 0.1)
-    dt_adv = mapreduce(x -> x[1] * inv(maximum(abs.(x[2]))), min, zip(di, V)) * n
+    dt_adv = mapreduce(x -> x[1] * inv(maximum_mpi(abs.(x[2]))), min, zip(di, V)) * n
     return min(dt_diff, dt_adv)
 end
 """
@@ -454,21 +423,30 @@ end
 # MPI reductions 
 
 function mean_mpi(A)
-    mean_l = mean(A)
+    mean_l = _mean(A)
     return MPI.Allreduce(mean_l, MPI.SUM, MPI.COMM_WORLD) / MPI.Comm_size(MPI.COMM_WORLD)
 end
 
 function norm_mpi(A)
-    sum2_l = sum(A .^ 2)
+    sum2_l = _sum(A .^ 2)
     return sqrt(MPI.Allreduce(sum2_l, MPI.SUM, MPI.COMM_WORLD))
 end
 
 function minimum_mpi(A)
-    min_l = minimum(A)
+    min_l = _minimum(A)
     return MPI.Allreduce(min_l, MPI.MIN, MPI.COMM_WORLD)
 end
 
 function maximum_mpi(A)
-    max_l = maximum(A)
+    max_l = _maximum(A)
     return MPI.Allreduce(max_l, MPI.MAX, MPI.COMM_WORLD)
 end
+
+for (f1, f2) in zip(
+    (:_mean, :_norm, :_minimum, :_maximum, :_sum), (:mean, :norm, :minimum, :maximum, :sum)
+)
+    @eval begin
+        $f1(A::ROCArray) = $f2(Array(A))
+        $f1(A) = $f2(A)
+    end
+end
diff --git a/src/phases/CallArrays.jl b/src/phases/CallArrays.jl
@@ -120,8 +120,11 @@ end
 ## Fallbacks
 import Base: getindex, setindex!
 
-@inline element(A::Union{Array,CuArray}, I::Vararg{Int,N}) where {N} = getindex(A, I...)
-@inline function setelement!(A::Union{Array,CuArray}, x::Number, I::Vararg{Int,N}) where {N}
+@inline element(A::Union{Array,CuArray,ROCArray}, I::Vararg{Int,N}) where {N} =
+    getindex(A, I...)
+@inline function setelement!(
+    A::Union{Array,CuArray,ROCArray}, x::Number, I::Vararg{Int,N}
+) where {N}
     return setindex!(A, x, I...)
 end
 

diff --git a/src/stokes/Stokes2D.jl b/src/stokes/Stokes2D.jl
@@ -39,10 +39,10 @@ module Stokes2D
 
 using ImplicitGlobalGrid
 using ..JustRelax
-using CUDA
+using CUDA, AMDGPU
 using ParallelStencil
 using ParallelStencil.FiniteDifferences2D
-using GeoParams, LinearAlgebra, Printf, TimerOutputs
+using GeoParams, LinearAlgebra, Printf
 
 import JustRelax: elastic_iter_params!, PTArray, Velocity, SymmetricTensor
 import JustRelax:
@@ -100,7 +100,7 @@ function JustRelax.solve!(
     # ~preconditioner
     ητ = deepcopy(η)
     # @hide_communication b_width begin # communication/computation overlap
-    compute_maxloc!(ητ, η)
+    compute_maxloc!(ητ, η; window=(1, 1))
     update_halo!(ητ)
     # end
 
@@ -222,7 +222,7 @@ function JustRelax.solve!(
     # ~preconditioner
     ητ = deepcopy(η)
     # @hide_communication b_width begin # communication/computation overlap
-    compute_maxloc!(ητ, η; window=(1, 1, 1))
+    compute_maxloc!(ητ, η; window=(1, 1))
     update_halo!(ητ)
     # end
 
@@ -344,7 +344,7 @@ function JustRelax.solve!(
     # ~preconditioner
     ητ = deepcopy(η)
     # @hide_communication b_width begin # communication/computation overlap
-    compute_maxloc!(ητ, η)
+    compute_maxloc!(ητ, η; window=(1, 1))
     update_halo!(ητ)
     # end
 
@@ -379,7 +379,7 @@ function JustRelax.solve!(
             @parallel (@idx ni) compute_viscosity!(
                 η, ν, @strain(stokes)..., args, rheology, viscosity_cutoff
             )
-            compute_maxloc!(ητ, η)
+            compute_maxloc!(ητ, η; window=(1, 1))
             update_halo!(ητ)
 
             @parallel (@idx ni) compute_τ_nonlinear!(
@@ -488,7 +488,7 @@ function JustRelax.solve!(
     # ~preconditioner
     ητ = deepcopy(η)
     # @hide_communication b_width begin # communication/computation overlap
-    compute_maxloc!(ητ, η)
+    compute_maxloc!(ητ, η; window=(1, 1))
     update_halo!(ητ)
     # end
 
@@ -509,7 +509,6 @@ function JustRelax.solve!(
     # solver loop
     wtime0 = 0.0
     λ = @zeros(ni...)
-    to = TimerOutput()
     η0 = deepcopy(η)
     do_visc = true
     GC.enable(false)
@@ -530,9 +529,9 @@ function JustRelax.solve!(
                 θ_dτ,
             )
 
-            # if rem(iter, 5) == 0
-            # @timeit to "ρg" @parallel (@idx ni) compute_ρg!(ρg[2], phase_ratios.center, rheology, args)
-            # end
+            if rem(iter, 5) == 0
+                @parallel (@idx ni) compute_ρg!(ρg[2], phase_ratios.center, rheology, args)
+            end
 
             @parallel (@idx ni .+ 1) compute_strain_rate!(
                 @strain(stokes)..., stokes.∇V, @velocity(stokes)..., _di...
@@ -543,7 +542,7 @@ function JustRelax.solve!(
             end
             if do_visc
                 ν = 1e-2
-                @timeit to "viscosity" compute_viscosity!(
+                compute_viscosity!(
                     η,
                     ν,
                     phase_ratios.center,
@@ -553,7 +552,7 @@ function JustRelax.solve!(
                     viscosity_cutoff,
                 )
             end
-            compute_maxloc!(ητ, η)
+            compute_maxloc!(ητ, η; window=(1, 1))
             update_halo!(ητ)
 
             @parallel (@idx ni) compute_τ_nonlinear!(
@@ -635,7 +634,6 @@ function JustRelax.solve!(
         norm_Ry=norm_Ry,
         norm_∇V=norm_∇V,
     )
-    return to
 end
 
 function JustRelax.solve!(
@@ -667,7 +665,7 @@ function JustRelax.solve!(
     # ~preconditioner
     ητ = deepcopy(η)
     # @hide_communication b_width begin # communication/computation overlap
-    compute_maxloc!(ητ, η)
+    compute_maxloc!(ητ, η; window=(1, 1))
     update_halo!(ητ)
     # end
 

diff --git a/src/stokes/Stokes3D.jl b/src/stokes/Stokes3D.jl
@@ -41,7 +41,7 @@ using ImplicitGlobalGrid
 using ParallelStencil
 using ParallelStencil.FiniteDifferences3D
 using JustRelax
-using CUDA
+using CUDA, AMDGPU
 using LinearAlgebra
 using Printf
 using GeoParams

diff --git a/src/thermal_diffusion/DiffusionExplicit.jl b/src/thermal_diffusion/DiffusionExplicit.jl
@@ -21,7 +21,7 @@ using ParallelStencil.FiniteDifferences1D
 using JustRelax
 using LinearAlgebra
 using Printf
-using CUDA
+using CUDA, AMDGPU
 
 import JustRelax:
     ThermalParameters, solve!, assign!, thermal_boundary_conditions!, update_T!
@@ -155,7 +155,7 @@ module ThermalDiffusion2D
 using ParallelStencil
 using ParallelStencil.FiniteDifferences2D
 using JustRelax
-using CUDA
+using CUDA, AMDGPU
 using GeoParams
 
 import JustRelax: ThermalParameters, solve!, assign!, thermal_boundary_conditions!
@@ -511,7 +511,7 @@ using ParallelStencil.FiniteDifferences3D
 using JustRelax
 using MPI
 using Printf
-using CUDA
+using CUDA, AMDGPU
 using GeoParams
 
 import JustRelax: