Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Production de hpc tracer #594

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,12 @@ elif [[ $LOGINHOST =~ ^m[A-Za-z0-9]+\.hpc\.dkrz\.de$ ]]; then
STRATEGY="mistral.dkrz.de"
elif [[ $LOGINHOST =~ ^levante ]] || [[ $LOGINHOST =~ ^l[:alnum:]+\.lvt\.dkrz\.de$ ]]; then
STRATEGY="levante.dkrz.de"
elif [[ $LOGINHOST =~ ^ollie[0-9]$ ]] || [[ $LOGINHOST =~ ^prod-[0-9]{4}$ ]]; then
# The following regex only matches when the input consists of two dot-separated words (e.g. levante.nvhpc); this allows selecting a different shell for a machine directly
compid_regex="^([[:alnum:]]+)\.([[:alnum:]]+)$"
if [[ $LOGINHOST =~ $compid_regex ]]; then
COMPILERID="${BASH_REMATCH[2]}"
fi
elif [[ $LOGINHOST =~ ^ollie[0-9]$ ]] || [[ $LOGINHOST =~ ^prod-[0-9]{4}$ ]]; then
STRATEGY="ollie"
elif [[ $LOGINHOST =~ ^albedo[0-9]$ ]] || [[ $LOGINHOST =~ ^prod-[0-9]{4}$ ]]; then
STRATEGY="albedo"
Expand Down
5 changes: 3 additions & 2 deletions env/levante.dkrz.de/shell.nvhpc
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ export CPU_MODEL=AMD_EPYC_ZEN3
module --force purge
# module load intel-oneapi-compilers/2022.0.1-gcc-11.2.0
# module load openmpi/4.1.2-intel-2021.5.0
module load nvhpc/22.5-gcc-11.2.0
module load openmpi/.4.1.4-nvhpc-22.5
module load nvhpc/23.9-gcc-11.2.0
module load openmpi/4.1.6-nvhpc-23.9
export FC=mpif90 CC=mpicc CXX=mpicxx;
# export LD_LIBRARY_PATH=/sw/spack-levante/intel-oneapi-mkl-2022.0.1-ttdktf/mkl/2022.0.1/lib/intel64:$LD_LIBRARY_PATH

module load netcdf-c/4.8.1-openmpi-4.1.2-intel-2021.5.0
module load netcdf-fortran/4.5.3-openmpi-4.1.2-intel-2021.5.0
Expand Down
27 changes: 17 additions & 10 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,13 @@ endif()

option(ENABLE_OPENACC "compile with OpenACC support" OFF)
message(STATUS "ENABLE_OPENACC: ${ENABLE_OPENACC}")

set(NV_GPU_ARCH "cc80" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")
option(DISABLE_OPENACC_ATOMICS "disable kernels using atomic statement for reproducible results" ON)
set(GPU_COMPUTE_CAPABILITY "cc80" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")
set(GPU_FLAGS "cuda12.2,${GPU_COMPUTE_CAPABILITY}" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")

option(ENABLE_OPENMP "build FESOM with OpenMP" OFF)
message(STATUS "ENABLE_OPENMP: ${ENABLE_OPENMP}")
if(ENABLE_OPENMP)
if(${ENABLE_OPENMP})
find_package(OpenMP REQUIRED COMPONENTS Fortran)
endif()

Expand Down Expand Up @@ -184,7 +185,7 @@ target_link_libraries(${PROJECT_NAME} PRIVATE parms) #metis
target_link_libraries(${PROJECT_NAME} PRIVATE MPI::MPI_Fortran)

set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE Fortran)
if(ENABLE_OPENMP)
if(${ENABLE_OPENMP})
target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_Fortran)
endif()

Expand Down Expand Up @@ -256,7 +257,7 @@ elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU )
elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL Cray )
#target_compile_options(${PROJECT_NAME} PRIVATE -c -emf -hbyteswapio -hflex_mp=conservative -hfp1 -hadd_paren -Ounroll0 -hipa0 -r am -s real64 -N 1023 -g -G2 -O3)
target_compile_options(${PROJECT_NAME} PRIVATE -c -emf -hbyteswapio -hflex_mp=conservative -hfp1 -hadd_paren -Ounroll0 -hipa0 -r am -s real64 -N 1023 -g -G2 -O2 -hnoacc -M878) #-hnoacc is a workaround for cray automatically activate -hacc, -M878 is to suppress ftn-878 warning
if(ENABLE_OPENMP)
if(${ENABLE_OPENMP})
target_compile_options(${PROJECT_NAME} PRIVATE -homp)
else()
target_compile_options(${PROJECT_NAME} PRIVATE -hnoomp)
Expand All @@ -269,13 +270,19 @@ elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL Cray )
endif()
elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL NVHPC )
target_compile_definitions(${PROJECT_NAME} PRIVATE ENABLE_NVHPC_WORKAROUNDS)
target_compile_options(${PROJECT_NAME} PRIVATE -fast -fastsse -O3 -Mallocatable=95 -Mr8 -pgf90libs)
if(ENABLE_OPENACC)
#target_compile_options(${PROJECT_NAME} PRIVATE -fast -fastsse -O3 -Mallocatable=95 -Mr8 -pgf90libs)
target_compile_options(${PROJECT_NAME} PRIVATE -Mnofma -Mallocatable=95 -Mr8 -pgf90libs)
if(${ENABLE_OPENACC})
# additional compiler settings
target_compile_options(${PROJECT_NAME} PRIVATE -acc -ta=tesla:${NV_GPU_ARCH} -Minfo=accel)
set(CMAKE_EXE_LINKER_FLAGS "-acc -ta=tesla:${NV_GPU_ARCH}")
message("Taking ENABLE_OPENACC = ON")
target_compile_options(${PROJECT_NAME} PRIVATE -acc -O2 -gpu=${GPU_FLAGS} -Minfo=accel)
# set(CMAKE_EXE_LINKER_FLAGS "-acc -gpu=${GPU_FLAGS}")
if(${DISABLE_OPENACC_ATOMICS})
message("Taking DISABLE_OPENACC_ATOMICS = ON")
target_compile_definitions(${PROJECT_NAME} PRIVATE DISABLE_OPENACC_ATOMICS)
endif()
endif()
if(ENABLE_OPENMP)
if(${ENABLE_OPENMP})
target_compile_options(${PROJECT_NAME} PRIVATE -Mipa=fast)
else()
target_compile_options(${PROJECT_NAME} PRIVATE -Mipa=fast,inline)
Expand Down
6 changes: 6 additions & 0 deletions src/fesom_module.F90
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,9 @@ subroutine fesom_init(fesom_total_nsteps)
!$ACC CREATE (f%tracers%work%adv_flux_hor, f%tracers%work%adv_flux_ver, f%tracers%work%fct_LO) &
!$ACC CREATE (f%tracers%work%del_ttf_advvert, f%tracers%work%del_ttf_advhoriz, f%tracers%work%edge_up_dn_grad) &
!$ACC CREATE (f%tracers%work%del_ttf)

!! Creating variables in GPU memory for init_tracers_AB module
!$ACC ENTER DATA CREATE(tr_xy, tr_z, relax2clim, Sclim, Tclim)
end subroutine


Expand Down Expand Up @@ -632,6 +635,9 @@ subroutine fesom_finalize()
!$ACC EXIT DATA DELETE (f%dynamics%w, f%dynamics%w_e, f%dynamics%uv)
!$ACC EXIT DATA DELETE (f%dynamics, f%tracers)

!!$ Deleting init_tracers_AB values
!$ACC EXIT DATA DELETE (tr_xy, tr_z, relax2clim, Sclim, Tclim)

!delete mesh and partit data.
!$ACC EXIT DATA DELETE (f%mesh%coriolis_node, f%mesh%nn_num, f%mesh%nn_pos)
!$ACC EXIT DATA DELETE (f%mesh%ssh_stiff, f%mesh%ssh_stiff%rowptr)
Expand Down
4 changes: 4 additions & 0 deletions src/ice_fct.F90
Original file line number Diff line number Diff line change
Expand Up @@ -1122,7 +1122,11 @@ subroutine ice_fem_fct(tr_array_id, ice, partit, mesh)
call exchange_nod(ice_temp, partit, luse_g2g = .true.)
#endif

#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO
#else
!$ACC END DATA
#endif

!$OMP BARRIER
end subroutine ice_fem_fct
Expand Down
12 changes: 10 additions & 2 deletions src/oce_ale_tracer.F90
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh)
subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh)
use g_config
use o_PARAM, only: SPP, Fer_GM
! tr_xy and tr_z are needed because they are written on the GPU in the init_tracers_AB subroutine
! and are then updated so that the HOST has access to them
use o_arrays, only: tr_xy, tr_z
use mod_mesh
USE MOD_PARTIT
USE MOD_PARSUP
Expand Down Expand Up @@ -219,15 +222,20 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh)
! do tracer AB (Adams-Bashfort) interpolation only for advectiv part
! needed
if (flag_debug .and. mype==0) print *, achar(27)//'[37m'//' --> call init_tracers_AB'//achar(27)//'[0m'
!$ACC UPDATE DEVICE(tracers%data(tr_num)%values, tracers%data(tr_num)%valuesAB)
call init_tracers_AB(tr_num, tracers, partit, mesh)
!$ACC UPDATE HOST(tr_xy, tr_z)


! advect tracers
if (flag_debug .and. mype==0) print *, achar(27)//'[37m'//' --> call adv_tracers_ale'//achar(27)//'[0m'


!here update only those initialized in the init_tracers. (values, valuesAB, edge_up_dn_grad, ...)
!$ACC UPDATE DEVICE(tracers%data(tr_num)%values, tracers%data(tr_num)%valuesAB) &
!$ACC DEVICE(tracers%work%edge_up_dn_grad) !!&
!!!! UPDATE from hpc_tracer !!!!
! we do not have to update values/valuesAB here because they are already updated before init_tracers_AB
!!$ACC UPDATE DEVICE(tracers%data(tr_num)%values, tracers%data(tr_num)%valuesAB) &
!$ACC UPDATE DEVICE(tracers%work%edge_up_dn_grad) !!&
! it will update del_ttf with contributions from horizontal and vertical advection parts (del_ttf_advhoriz and del_ttf_advvert)
!$ACC wait(1)
call do_oce_adv_tra(dt, UV, Wvel, Wvel_i, Wvel_e, tr_num, dynamics, tracers, partit, mesh)
Expand Down
116 changes: 103 additions & 13 deletions src/oce_tracer_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
type(t_tracer), intent(inout), target :: tracers
integer :: n,nz

#ifndef ENABLE_OPENACC
#else
!$ACC parallel loop collapse(2) default(present) !!!async(1)
#endif
do n=1, partit%myDim_nod2D+partit%eDim_nod2D
do nz=1, mesh%nl-1
! del_ttf will contain all advection / diffusion contributions for this tracer. Set it to 0 at the beginning!
Expand All @@ -34,52 +37,103 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
tracers%work%del_ttf_advvert (nz, n) = 0.0_WP
end do
end do
#ifndef ENABLE_OPENACC
#else
!$ACC end parallel loop
#endif

! AB interpolation
if (tracers%data(tr_num)%AB_order==2) then
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO
#else
!$ACC parallel loop collapse(2)
#endif
do n=1, partit%myDim_nod2D+partit%eDim_nod2D
! AB interpolation
if (tracers%data(tr_num)%AB_order==2) then
tracers%data(tr_num)%valuesAB(:, n) =-(0.5_WP+epsilon)*tracers%data(tr_num)%valuesold(1, :, n)+(1.5_WP+epsilon)*tracers%data(tr_num)%values(:, n)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just so that I understand, can you let me know if the following is true, @basava70?

If we run this on GPUs, one kernel currently takes care of performing this operation for the whole vector, but instead we want one kernel to apply the formula to each element of the vector, correct? (And that is what you are doing with your change.)

@suvarchal and @basava70, would this change slow down the code for CPUs?

elseif (tracers%data(tr_num)%AB_order==3) then
tracers%data(tr_num)%valuesAB(:, n) =5.0_WP*tracers%data(tr_num)%valuesold(2, :, n)-16.0_WP*tracers%data(tr_num)%valuesold(1, :, n)+23.0_WP*tracers%data(tr_num)%values(:, n)
tracers%data(tr_num)%valuesAB(:, n) =tracers%data(tr_num)%valuesAB(:, n)/12.0_WP
end if
do nz = 1, mesh%nl-1
tracers%data(tr_num)%valuesAB(nz, n) =-(0.5_WP+epsilon)*tracers%data(tr_num)%valuesold(1, nz, n)+(1.5_WP+epsilon)*tracers%data(tr_num)%values(nz, n)
end do
end do
#ifndef ENABLE_OPENACC
!$OMP END PARALLEL DO
#else
!$ACC end parallel loop
#endif

! AB interpolation contd
elseif (tracers%data(tr_num)%AB_order==3) then
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO
#else
!$ACC parallel loop collapse(2)
#endif
do n=1, partit%myDim_nod2D+partit%eDim_nod2D
do nz = 1, mesh%nl-1
tracers%data(tr_num)%valuesAB(nz, n) =5.0_WP*tracers%data(tr_num)%valuesold(2, nz, n)-16.0_WP*tracers%data(tr_num)%valuesold(1, nz, n)+23.0_WP*tracers%data(tr_num)%values(nz, n)
tracers%data(tr_num)%valuesAB(nz, n) =tracers%data(tr_num)%valuesAB(nz, n)/12.0_WP
end do
end do
end if
#ifndef ENABLE_OPENACC
!$OMP END PARALLEL DO
#else
!$ACC end parallel loop
#endif

if (tracers%data(tr_num)%AB_order==2) then
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO
#else
!$ACC parallel loop collapse(2)
#endif
do n=1, partit%myDim_nod2d+partit%eDim_nod2D
tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n)
do nz = 1, mesh%nl-1
tracers%data(tr_num)%valuesold(1, nz, n)=tracers%data(tr_num)%values(nz, n)
end do
end do
#ifndef ENABLE_OPENACC
!$OMP END PARALLEL DO
#else
!$ACC end parallel loop
#endif

elseif (tracers%data(tr_num)%AB_order==3) then
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO
#else
!$ACC parallel loop collapse(2)
#endif
do n=1, partit%myDim_nod2d+partit%eDim_nod2D
tracers%data(tr_num)%valuesold(2, :, n)=tracers%data(tr_num)%valuesold(1, :, n)
tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n)
do nz = 1, mesh%nl-1
tracers%data(tr_num)%valuesold(2, nz, n)=tracers%data(tr_num)%valuesold(1, nz, n)
tracers%data(tr_num)%valuesold(1, nz, n)=tracers%data(tr_num)%values(nz, n)
end do
end do
#ifndef ENABLE_OPENACC
!$OMP END PARALLEL DO
#else
!$ACC end parallel loop
#endif
end if

if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_elements'//achar(27)//'[0m'
call tracer_gradient_elements(tracers%data(tr_num)%valuesAB, partit, mesh)
call exchange_elem_begin(tr_xy, partit)
call exchange_elem_begin(tr_xy, partit, luse_g2g = .true.)

if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_z'//achar(27)//'[0m'
call tracer_gradient_z(tracers%data(tr_num)%values, partit, mesh) !WHY NOT AB HERE? DSIDOREN!
call exchange_elem_end(partit) ! tr_xy used in fill_up_dn_grad
!$OMP BARRIER

call exchange_nod_begin(tr_z, partit) ! not used in fill_up_dn_grad
call exchange_nod_begin(tr_z, partit, luse_g2g = .true.) ! not used in fill_up_dn_grad

if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call fill_up_dn_grad'//achar(27)//'[0m'
call fill_up_dn_grad(tracers%work, partit, mesh)
call exchange_nod_end(partit) ! tr_z halos should have arrived by now.

if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_elements'//achar(27)//'[0m'
call tracer_gradient_elements(tracers%data(tr_num)%values, partit, mesh) !redefine tr_arr to the current timestep
call exchange_elem(tr_xy, partit)
call exchange_elem(tr_xy, partit, luse_g2g = .true.)

END SUBROUTINE init_tracers_AB
!
Expand All @@ -105,7 +159,12 @@ SUBROUTINE tracer_gradient_elements(ttf, partit, mesh)
#include "associate_mesh_def.h"
#include "associate_part_ass.h"
#include "associate_mesh_ass.h"
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(elem, elnodes, nz, nzmin, nzmax)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are so many variables declared private for OpenMP but only `elnodes` for OpenACC? (Just a question out of curiosity.)

#else
!$ACC UPDATE DEVICE(gradient_sca)
!$ACC parallel loop private(elnodes)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this missing a `collapse(2)`, or would that create some sort of conflict/race condition related to `elnodes` being declared private?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see... the nz loop bounds depend on the number of levels, which varies between elements, so the loops cannot be collapsed. You can ignore my question above.

#endif
DO elem=1, myDim_elem2D
elnodes=elem2D_nodes(:,elem)
nzmin = ulevels(elem)
Expand All @@ -116,7 +175,11 @@ SUBROUTINE tracer_gradient_elements(ttf, partit, mesh)
tr_xy(2,nz, elem)=sum(gradient_sca(4:6,elem)*ttf(nz,elnodes))
END DO
END DO
#ifndef ENABLE_OPENACC
!$OMP END PARALLEL DO
#else
!$ACC end parallel loop
#endif
END SUBROUTINE tracer_gradient_elements
!
!
Expand All @@ -141,7 +204,12 @@ SUBROUTINE tracer_gradient_z(ttf, partit, mesh)
#include "associate_mesh_def.h"
#include "associate_part_ass.h"
#include "associate_mesh_ass.h"
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nz, nzmin, nzmax, dz)
#else
!$ACC UPDATE DEVICE(hnode_new)
!$ACC parallel loop
#endif
DO n=1, myDim_nod2D+eDim_nod2D
!!PS nlev=nlevels_nod2D(n)
nzmax=nlevels_nod2D(n)
Expand All @@ -156,7 +224,11 @@ SUBROUTINE tracer_gradient_z(ttf, partit, mesh)
tr_z(nzmin, n)=0.0_WP
tr_z(nzmax, n)=0.0_WP
END DO
#ifndef ENABLE_OPENACC
!$OMP END PARALLEL DO
#else
!$ACC end parallel loop
#endif
END SUBROUTINE tracer_gradient_z
!
!
Expand Down Expand Up @@ -184,7 +256,12 @@ SUBROUTINE relax_to_clim(tr_num, tracers, partit, mesh)
trarr=>tracers%data(tr_num)%values(:,:)

if ((clim_relax>1.0e-8_WP).and.(tracers%data(tr_num)%ID==1)) then
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nzmin, nzmax)
#else
!$ACC UPDATE DEVICE(relax2clim, Tclim)
!$ACC parallel loop
#endif
DO n=1, myDim_nod2D
nzmin = ulevels_nod2D(n)
nzmax = nlevels_nod2D(n)
Expand All @@ -193,17 +270,30 @@ SUBROUTINE relax_to_clim(tr_num, tracers, partit, mesh)
trarr(nzmin:nzmax-1,n)=trarr(nzmin:nzmax-1,n)+&
relax2clim(n)*dt*(Tclim(nzmin:nzmax-1,n)-trarr(nzmin:nzmax-1,n))
END DO
#ifndef ENABLE_OPENACC
!$OMP END PARALLEL DO
#else
!$ACC end parallel loop
#endif
END if
if ((clim_relax>1.0e-8_WP).and.(tracers%data(tr_num)%ID==2)) then
#ifndef ENABLE_OPENACC
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nzmin, nzmax)
#else
!$ACC UPDATE DEVICE(relax2clim, Sclim)
!$ACC parallel loop
#endif
DO n=1, myDim_nod2D
nzmin = ulevels_nod2D(n)
nzmax = nlevels_nod2D(n)
trarr(nzmin:nzmax-1,n)=trarr(nzmin:nzmax-1,n)+&
relax2clim(n)*dt*(Sclim(nzmin:nzmax-1,n)-trarr(nzmin:nzmax-1,n))
END DO
#ifndef ENABLE_OPENACC
!$OMP END PARALLEL DO
#else
!$ACC end parallel loop
#endif
END IF
END SUBROUTINE relax_to_clim
END MODULE o_tracers
Loading