diff --git a/env/levante.dkrz.de/shell.nvhpc b/env/levante.dkrz.de/shell.nvhpc index eb2b776f6..5f9bf063b 100755 --- a/env/levante.dkrz.de/shell.nvhpc +++ b/env/levante.dkrz.de/shell.nvhpc @@ -5,8 +5,8 @@ export CPU_MODEL=AMD_EPYC_ZEN3 module --force purge # module load intel-oneapi-compilers/2022.0.1-gcc-11.2.0 # module load openmpi/4.1.2-intel-2021.5.0 -module load nvhpc/22.5-gcc-11.2.0 -module load openmpi/.4.1.4-nvhpc-22.5 +module load nvhpc/23.9-gcc-11.2.0 +module load openmpi/4.1.6-nvhpc-23.9 export FC=mpif90 CC=mpicc CXX=mpicxx; module load netcdf-c/4.8.1-openmpi-4.1.2-intel-2021.5.0 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b566d2637..223d5766b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -42,13 +42,13 @@ elseif(${FESOM_PLATFORM_STRATEGY} STREQUAL albedo ) message(STATUS "multithreading disabled for Albedo") # multithreading suddenly produces an error, disable it until a fix is found. issue #413 option(DISABLE_MULTITHREADING "disable asynchronous operations" ON) else() - option(DISABLE_MULTITHREADING "disable asynchronous operations" OFF) + option(DISABLE_MULTITHREADING "disable asynchronous operations" ON) endif() option(ENABLE_OPENACC "compile with OpenACC support" OFF) option(DISABLE_OPENACC_ATOMICS "disable kernels using atomic statement for reproducible results" ON) set(GPU_COMPUTE_CAPABILITY "cc80" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)") -set(GPU_FLAGS "cuda11.7,${GPU_COMPUTE_CAPABILITY}" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)") +set(GPU_FLAGS "cuda12.2,${GPU_COMPUTE_CAPABILITY}" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)") option(ENABLE_OPENMP "build FESOM with OpenMP" OFF) if(${ENABLE_OPENMP}) diff --git a/src/fesom_module.F90 b/src/fesom_module.F90 index 2590252ca..816f6faba 100755 --- a/src/fesom_module.F90 +++ b/src/fesom_module.F90 @@ -365,6 +365,7 @@ subroutine fesom_runloop(current_nsteps) !$ACC CREATE (f%tracers%work%adv_flux_hor, f%tracers%work%adv_flux_ver, f%tracers%work%fct_LO) & !$ACC CREATE (f%tracers%work%del_ttf_advvert, f%tracers%work%del_ttf_advhoriz, f%tracers%work%edge_up_dn_grad) & !$ACC CREATE (f%tracers%work%del_ttf) + !$ACC DATA CREATE(tr_xy, tr_z, relax2clim, Sclim, Tclim) do n=nstart, ntotal if (use_global_tides) then call foreph(f%partit, f%mesh) @@ -465,6 +466,7 @@ subroutine fesom_runloop(current_nsteps) !$ACC EXIT DATA DELETE (f%tracers%work%adv_flux_hor, f%tracers%work%adv_flux_ver, f%tracers%work%fct_LO) !$ACC EXIT DATA DELETE (f%tracers%work%del_ttf_advvert, f%tracers%work%del_ttf_advhoriz, f%tracers%work%edge_up_dn_grad) !$ACC EXIT DATA DELETE (f%tracers%work%del_ttf) + !$ACC END DATA !$ACC EXIT DATA DELETE (f%tracers%data, f%tracers%work) !$ACC EXIT DATA DELETE (f%dynamics%w, f%dynamics%w_e, f%dynamics%uv) diff --git a/src/oce_ale_tracer.F90 b/src/oce_ale_tracer.F90 index dab0f550a..74bcff29d 100644 --- a/src/oce_ale_tracer.F90 +++ b/src/oce_ale_tracer.F90 @@ -198,6 +198,7 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh) ! do tracer AB (Adams-Bashfort) interpolation only for advectiv part ! needed if (flag_debug .and. mype==0) print *, achar(27)//'[37m'//' --> call init_tracers_AB'//achar(27)//'[0m' + !$ACC UPDATE DEVICE(tracers%data(tr_num)%values, tracers%data(tr_num)%valuesAB) call init_tracers_AB(tr_num, tracers, partit, mesh) ! advect tracers @@ -205,10 +206,10 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh) !here update only those initialized in the init_tracers. (values, valuesAB, edge_up_dn_grad, ...) - !$ACC UPDATE DEVICE(tracers%data(tr_num)%values, tracers%data(tr_num)%valuesAB) & - !$ACC DEVICE(tracers%work%edge_up_dn_grad) !!& + !!$ACC UPDATE DEVICE(tracers%data(tr_num)%values, tracers%data(tr_num)%valuesAB) & + !!$ACC DEVICE(tracers%work%edge_up_dn_grad) !!& ! it will update del_ttf with contributions from horizontal and vertical advection parts (del_ttf_advhoriz and del_ttf_advvert) - !$ACC wait(1) + !!$ACC wait(1) call do_oce_adv_tra(dt, UV, Wvel, Wvel_i, Wvel_e, tr_num, dynamics, tracers, partit, mesh) diff --git a/src/oce_tracer_mod.F90 b/src/oce_tracer_mod.F90 index bc6182039..fa02464d8 100755 --- a/src/oce_tracer_mod.F90 +++ b/src/oce_tracer_mod.F90 @@ -25,7 +25,9 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh) type(t_tracer), intent(inout), target :: tracers integer :: n,nz -!$ACC parallel loop collapse(2) default(present) async(1) +#ifdef ENABLE_OPENACC +!$ACC parallel loop collapse(2) +#endif do n=1, partit%myDim_nod2D+partit%eDim_nod2D do nz=1, mesh%nl-1 ! del_ttf will contain all advection / diffusion contributions for this tracer. Set it to 0 at the beginning! @@ -34,24 +36,37 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh) tracers%work%del_ttf_advvert (nz, n) = 0.0_WP end do end do +#ifdef ENABLE_OPENACC !$ACC end parallel loop +#endif + +#ifndef ENABLE_OPENACC !$OMP PARALLEL DO +#else +!$ACC parallel loop collapse(2) +#endif do n=1, partit%myDim_nod2D+partit%eDim_nod2D ! AB interpolation - tracers%data(tr_num)%valuesAB(:, n) =-(0.5_WP+epsilon)*tracers%data(tr_num)%valuesAB(:, n)+(1.5_WP+epsilon)*tracers%data(tr_num)%values(:, n) + do nz = 1, mesh%nl + tracers%data(tr_num)%valuesAB(nz, n) =-(0.5_WP+epsilon)*tracers%data(tr_num)%valuesAB(nz, n)+(1.5_WP+epsilon)*tracers%data(tr_num)%values(nz, n) + end do end do +#ifndef ENABLE_OPENACC !$OMP END PARALLEL DO +#else +!$ACC end parallel loop +#endif if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_elements'//achar(27)//'[0m' call tracer_gradient_elements(tracers%data(tr_num)%valuesAB, partit, mesh) - call exchange_elem_begin(tr_xy, partit) + call exchange_elem_begin(tr_xy, partit, luse_g2g = .true.) if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_z'//achar(27)//'[0m' call tracer_gradient_z(tracers%data(tr_num)%values, partit, mesh) !WHY NOT AB HERE? DSIDOREN! call exchange_elem_end(partit) ! tr_xy used in fill_up_dn_grad !$OMP BARRIER - call exchange_nod_begin(tr_z, partit) ! not used in fill_up_dn_grad + call exchange_nod_begin(tr_z, partit, luse_g2g = .true.) ! not used in fill_up_dn_grad if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call fill_up_dn_grad'//achar(27)//'[0m' call fill_up_dn_grad(tracers%work, partit, mesh) @@ -59,7 +74,7 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh) if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_elements'//achar(27)//'[0m' call tracer_gradient_elements(tracers%data(tr_num)%values, partit, mesh) !redefine tr_arr to the current timestep - call exchange_elem(tr_xy, partit) + call exchange_elem(tr_xy, partit, luse_g2g = .true.) END SUBROUTINE init_tracers_AB ! @@ -85,7 +100,11 @@ SUBROUTINE tracer_gradient_elements(ttf, partit, mesh) #include "associate_mesh_def.h" #include "associate_part_ass.h" #include "associate_mesh_ass.h" +#ifndef ENABLE_OPENACC !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(elem, elnodes, nz, nzmin, nzmax) +#else +!$ACC parallel loop private(elnodes) +#endif DO elem=1, myDim_elem2D elnodes=elem2D_nodes(:,elem) nzmin = ulevels(elem) @@ -96,7 +115,11 @@ SUBROUTINE tracer_gradient_elements(ttf, partit, mesh) tr_xy(2,nz, elem)=sum(gradient_sca(4:6,elem)*ttf(nz,elnodes)) END DO END DO +#ifndef ENABLE_OPENACC !$OMP END PARALLEL DO +#else +!$ACC end parallel loop +#endif END SUBROUTINE tracer_gradient_elements ! ! @@ -121,7 +144,11 @@ SUBROUTINE tracer_gradient_z(ttf, partit, mesh) #include "associate_mesh_def.h" #include "associate_part_ass.h" #include "associate_mesh_ass.h" +#ifndef ENABLE_OPENACC !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nz, nzmin, nzmax, dz) +#else +!$ACC parallel loop +#endif DO n=1, myDim_nod2D+eDim_nod2D !!PS nlev=nlevels_nod2D(n) nzmax=nlevels_nod2D(n) @@ -136,7 +163,11 @@ SUBROUTINE tracer_gradient_z(ttf, partit, mesh) tr_z(nzmin, n)=0.0_WP tr_z(nzmax, n)=0.0_WP END DO +#ifndef ENABLE_OPENACC !$OMP END PARALLEL DO +#else +!$ACC end parallel loop +#endif END SUBROUTINE tracer_gradient_z ! ! @@ -164,7 +195,12 @@ SUBROUTINE relax_to_clim(tr_num, tracers, partit, mesh) trarr=>tracers%data(tr_num)%values(:,:) if ((clim_relax>1.0e-8_WP).and.(tracers%data(tr_num)%ID==1)) then + #ifndef ENABLE_OPENACC !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nzmin, nzmax) +#else +!$ACC update device(relax2clim, Tclim) +!$ACC parallel loop +#endif DO n=1, myDim_nod2D nzmin = ulevels_nod2D(n) nzmax = nlevels_nod2D(n) @@ -173,17 +209,30 @@ SUBROUTINE relax_to_clim(tr_num, tracers, partit, mesh) trarr(nzmin:nzmax-1,n)=trarr(nzmin:nzmax-1,n)+& relax2clim(n)*dt*(Tclim(nzmin:nzmax-1,n)-trarr(nzmin:nzmax-1,n)) END DO +#ifndef ENABLE_OPENACC !$OMP END PARALLEL DO +#else +!$ACC end parallel loop +#endif END if if ((clim_relax>1.0e-8_WP).and.(tracers%data(tr_num)%ID==2)) then + #ifndef ENABLE_OPENACC !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nzmin, nzmax) +#else +!$ACC update device(Sclim) +!$ACC parallel loop +#endif DO n=1, myDim_nod2D nzmin = ulevels_nod2D(n) nzmax = nlevels_nod2D(n) trarr(nzmin:nzmax-1,n)=trarr(nzmin:nzmax-1,n)+& relax2clim(n)*dt*(Sclim(nzmin:nzmax-1,n)-trarr(nzmin:nzmax-1,n)) END DO +#ifndef ENABLE_OPENACC !$OMP END PARALLEL DO +#else +!$ACC end parallel loop +#endif END IF END SUBROUTINE relax_to_clim -END MODULE o_tracers +END MODULE o_tracers \ No newline at end of file