diff --git a/CMakeLists.txt b/CMakeLists.txt index d9d7c62..52acaaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,7 +61,8 @@ endif() ecbuild_add_option( FEATURE ACC DEFAULT ON DESCRIPTION "Support for using GPUs with OpenACC" - REQUIRED_PACKAGES "OpenACC COMPONENTS Fortran" ) + REQUIRED_PACKAGES "OpenACC COMPONENTS Fortran" + CONDITION CMAKE_Fortran_COMPILER_ID MATCHES "PGI|NVHPC") ## set general compiler flags include(cmake/field_api_compile_options.cmake) @@ -124,6 +125,13 @@ ecbuild_add_option( FEATURE BUDDY_MALLOC DEFAULT ON ) +## Field Gang +ecbuild_add_option( FEATURE FIELD_GANG + DESCRIPTION "Enable packed storage of fields" + DEFAULT ON + CONDITION (NOT CMAKE_Fortran_COMPILER_ID MATCHES "Cray") +) + check_symbol_exists(backtrace execinfo.h HAVE_BACKTRACE) ## fypp preprocessor flags @@ -136,12 +144,21 @@ endif() if(fiat_FOUND) list( APPEND fypp_defines "-DWITH_FIAT") endif() +if(HAVE_FIELD_GANG) + list( APPEND fypp_defines "-DHAVE_GANG") +endif() +unset( ranksuff_srcs ) ## preprocess fypp files +list( APPEND ranksuff_srcs _shuffle _access _util _array _array_util _factory _gather _data) +if(HAVE_FIELD_GANG) + list( APPEND ranksuff_srcs _gang ) +endif() + foreach (SUFF IN ITEMS IM RM RD LM) string (TOLOWER ${SUFF} suff) foreach (RANK RANGE 1 5) - foreach (FUNC IN ITEMS "" _shuffle _access _util _array _array_util _gang _factory _gather _data) + foreach (FUNC ${ranksuff_srcs} "") add_custom_command (OUTPUT field_${RANK}${suff}${FUNC}_module.F90 COMMAND ${FYPP} -DRANK=${RANK} -DSUFF='${SUFF}' ${fypp_defines} -m os -M ${CMAKE_CURRENT_SOURCE_DIR} -m fieldType ${CMAKE_CURRENT_SOURCE_DIR}/field_RANKSUFF${FUNC}_module.fypp > field_${RANK}${suff}${FUNC}_module.F90 @@ -153,9 +170,15 @@ foreach (SUFF IN ITEMS IM RM RD LM) endforeach () endforeach () -foreach (SRC IN ITEMS dev_alloc_module field_factory_module field_access_module field_gang_module field_array_module field_module +unset( non_ranksuff_srcs ) +list ( APPEND non_ranksuff_srcs dev_alloc_module 
field_factory_module field_access_module field_array_module field_module field_shuffle_module field_util_module field_array_util_module field_shuffle_type_module host_alloc_module field_gathscat_module field_gathscat_type_module) +if(HAVE_FIELD_GANG) + list( APPEND non_ranksuff_srcs field_gang_module ) +endif() + +foreach (SRC ${non_ranksuff_srcs} ) add_custom_command (OUTPUT ${SRC}.F90 COMMAND ${FYPP} -m os ${fypp_defines} -M ${CMAKE_CURRENT_SOURCE_DIR} -m fieldType ${CMAKE_CURRENT_SOURCE_DIR}/${SRC}.fypp > ${SRC}.F90 DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${SRC}.fypp diff --git a/Readme.md b/Readme.md index 28e0edd..a84732b 100644 --- a/Readme.md +++ b/Readme.md @@ -44,15 +44,17 @@ Features of FIELD_API can be toggled by passing the following argument to the CM |:--- |:--- |:--- | | TESTS | ON | Build the testing suite. | | BUDDY_MALLOC | ON | Enable the use of a binary buddy memory allocator for the shadow host allocation for `FIELD%DEVPTR`. This option is switched off if CUDA is enabled.| -| ACC | ON | Enable the use of OpenACC for GPU offload. | +| ACC | ON | Enable the use of OpenACC for GPU offload. Currently only supported on NVHPC. | | SINGLE_PRECISION | ON | Enable the compilation of field_api in single precision | | DOUBLE_PRECISION | ON | Enable the compilation of field_api in double precision | | CUDA | OFF | Enable the use of CUDA for GPU offload. Disables the use of the buddy memory allocator, removes the shadow host allocation for `FIELD%DEVPTR` and allocates owned fields (see below) in pinned (page-locked) host memory.| +| FIELD_GANG | ON | Enable packed storage of groups of fields. This feature is not supported for the Cray compiler as it cannot resolve the underlying polymorphism.| ## Supported compilers The library has been tested with the nvhpc toolkit from Nvidia, version 23.9 -and is continually tested with newer releases. It has also been tested on CPU -(-DENABLE_ACC=OFF) with GCC 12 and Intel 2021. 
The CI testing (CPU-only for now) uses GNU 11.4. +and is continually tested with newer releases. Please note that GPU offload is currently +only supported for Nvidia compilers. It has also been tested on CPU (-DENABLE_ACC=OFF) +with GCC 12, Intel 2021 and CCE17. # Field API types diff --git a/arch/eurohpc/lumi/cray-gpu/17.0.1/env.sh b/arch/eurohpc/lumi/cray-gpu/17.0.1/env.sh new file mode 100644 index 0000000..cd501b5 --- /dev/null +++ b/arch/eurohpc/lumi/cray-gpu/17.0.1/env.sh @@ -0,0 +1,71 @@ +# (C) Copyright 1988- ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +# Source me to get the correct configure/build/run environment + +# Store tracing and disable (module is *way* too verbose) +{ tracing_=${-//[^x]/}; set +x; } 2>/dev/null + +module_load() { + echo "+ module load $1" + module load $1 +} +module_unload() { + echo "+ module unload $1" + module unload $1 +} + +# Unload to be certain +module reset + +# Load modules +module_load LUMI/24.03 +module_load partition/G +module_load PrgEnv-cray/8.4.0 +module_load cce/17.0.1 +# module_load cray-mpich/8.1.27 +module_load cray-mpich/8.1.29 +module_load craype-network-ofi +module_load rocm/6.0.3 +module_load buildtools/24.03 +# module_load Boost/1.82.0-cpeCray-23.09 +module_load cray-libsci/24.03.0 +module_load Boost/1.83.0-cpeCray-24.03 +module_load cray-python/3.10.10 +module_load craype-x86-trento +module_load craype-accel-amd-gfx90a + +### Handling of "magic" cray modules +# 1) Load the cray modules +# module_load cray-hdf5/1.12.2.7 +module_load cray-hdf5/1.12.2.11 +# 2) Store variables to locate the packages +_HDF5_ROOT=${CRAY_HDF5_PREFIX} +# 3) Unload the cray modules in reverse order, removing all the 
magic +module_unload cray-hdf5 +# 4) Define variables that CMake introspects +export HDF5_ROOT=${_HDF5_ROOT} + +# Export environment variables +export MPI_HOME=${MPICH_DIR} +export CC=cc +export CXX=CC +export FC=ftn +export HIPCXX=$(hipconfig --hipclangpath)/clang++ + +module list + +set -x + +# Restore tracing to stored setting +{ if [[ -n "$tracing_" ]]; then set -x; else set +x; fi } 2>/dev/null + +# export ECBUILD_TOOLCHAIN="./toolchain.cmake" +path=$BASH_SOURCE +DIR_PATH=$(dirname $path) +export ECBUILD_TOOLCHAIN=$DIR_PATH/toolchain.cmake diff --git a/arch/eurohpc/lumi/cray-gpu/17.0.1/toolchain.cmake b/arch/eurohpc/lumi/cray-gpu/17.0.1/toolchain.cmake new file mode 100644 index 0000000..5a054ef --- /dev/null +++ b/arch/eurohpc/lumi/cray-gpu/17.0.1/toolchain.cmake @@ -0,0 +1,52 @@ +# (C) Copyright 1988- ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. 
+ +#################################################################### +# COMPILER +#################################################################### + +set( ECBUILD_FIND_MPI OFF ) +set( ENABLE_USE_STMT_FUNC ON CACHE STRING "" ) + +#################################################################### +# OpenMP FLAGS +#################################################################### + +set( OpenMP_C_FLAGS "-fopenmp" CACHE STRING "" ) +set( OpenMP_CXX_FLAGS "-fopenmp" CACHE STRING "" ) +set( OpenMP_Fortran_FLAGS "-homp -hlist=aimd" CACHE STRING "" ) +set( OpenMP_C_LIB_NAMES "craymp" CACHE STRING "" ) +set( OpenMP_CXX_LIB_NAMES "craymp" CACHE STRING "" ) +set( OpenMP_Fortran_LIB_NAMES "craymp" CACHE STRING "" ) +set( OpenMP_craymp_LIBRARY "/opt/cray/pe/cce/17.0.1/cce/x86_64/lib/libcraymp.so" CACHE STRING "" ) + +#################################################################### +# OpenACC FLAGS +#################################################################### + +set( OpenACC_C_FLAGS "-hacc" CACHE STRING "" ) +# set( OpenACC_CXX_FLAGS "-hacc" CACHE STRING "" ) +# set( OpenACC_Fortran_FLAGS "-hacc" CACHE STRING "" ) + +#################################################################### +# Compiler FLAGS +#################################################################### + +# General Flags (add to default) +set(ECBUILD_Fortran_FLAGS "-h acc_model=auto_async_none") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -hbyteswapio") +set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -Wl,--as-needed") + +set(ECBUILD_Fortran_FLAGS_BIT "-O3 -hfp1 -hscalar3 -hvector3 -G2 -haggress -DNDEBUG") + +if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES gfx90a) +endif() + +# select OpenMP pragma to be used +# set( HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_PARALLEL OFF CACHE BOOL "" ) diff --git a/field_RANKSUFF_data_module.fypp b/field_RANKSUFF_data_module.fypp index a727786..562a985 100644 --- a/field_RANKSUFF_data_module.fypp +++ 
b/field_RANKSUFF_data_module.fypp @@ -28,7 +28,11 @@ PUBLIC :: ${ftn}$_COPY_INTF ABSTRACT INTERFACE SUBROUTINE ${ftn}$_COPY_INTF (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) +#:if 'IM' in ftn + IMPORT :: ${ft.kind}$ +#:else IMPORT :: JPIM, ${ft.kind}$ +#:endif ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) LOGICAL, INTENT (IN) :: MAP_DEVPTR INTEGER (KIND=JPIM), INTENT (IN) :: KDIR diff --git a/field_RANKSUFF_factory_module.fypp b/field_RANKSUFF_factory_module.fypp index 480f25a..3681af0 100644 --- a/field_RANKSUFF_factory_module.fypp +++ b/field_RANKSUFF_factory_module.fypp @@ -13,10 +13,12 @@ MODULE FIELD_${RANK}$${SUFF}$_FACTORY_MODULE #:for ft in fieldTypeList USE ${ft.name}$_MODULE +#:if defined('HAVE_GANG') #:if ft.ganged USE FIELD_${ft.rank-1}$${SUFF}$_MODULE USE ${ft.name}$_GANG_MODULE #:endif +#:endif #:endfor ${fieldType.useParkind1 ()}$ @@ -29,10 +31,12 @@ INTERFACE FIELD_NEW #:for ft in fieldTypeList MODULE PROCEDURE ${ft.name}$_NEW_OWNER MODULE PROCEDURE ${ft.name}$_NEW_WRAPPER +#:if defined('HAVE_GANG') #:if ft.ganged MODULE PROCEDURE ${ft.name}$_NEW_GANG_WRAPPER MODULE PROCEDURE ${ft.name}$_NEW_GANG_OWNER #:endif +#:endif #:endfor END INTERFACE @@ -108,6 +112,7 @@ IF (FIELD_STATISTICS_ENABLE) CALL FIELD_STATISTICS_NEW () END SUBROUTINE +#:if defined('HAVE_GANG') #:if ft.ganged SUBROUTINE ${ft.name}$_NEW_GANG_WRAPPER (FIELD_PTR, CHILDREN, LBOUNDS, PERSISTENT, DATA, SYNC_ON_FINAL, INITIALIZED) @@ -180,6 +185,7 @@ IF (FIELD_STATISTICS_ENABLE) CALL FIELD_STATISTICS_NEW () END SUBROUTINE +#:endif #:endif SUBROUTINE ${ft.name}$_DELETE (FIELD_PTR) diff --git a/field_RANKSUFF_gather_module.fypp b/field_RANKSUFF_gather_module.fypp index c14fcea..f875085 100644 --- a/field_RANKSUFF_gather_module.fypp +++ b/field_RANKSUFF_gather_module.fypp @@ -30,21 +30,14 @@ PUBLIC :: FIELD_SHUFFLE #:for what in ['DEVICE', 'HOST'] #:for mode in ['RDONLY', 'RDWR'] -INTERFACE GATHER_${what}$_DATA_${mode}$ + #:for ft in fieldTypeList - MODULE PROCEDURE :: 
${ft.name}$_GATHER_${what}$_DATA_${mode}$ +PUBLIC :: ${ft.name}$_GATHER_${what}$_DATA_${mode}$ #:endfor -END INTERFACE - -PUBLIC :: GATHER_${what}$_DATA_${mode}$ -INTERFACE SGATHER_${what}$_DATA_${mode}$ #:for ft in fieldTypeList - MODULE PROCEDURE :: S${ft.name}$_GATHER_${what}$_DATA_${mode}$ +PUBLIC :: S${ft.name}$_GATHER_${what}$_DATA_${mode}$ #:endfor -END INTERFACE - -PUBLIC :: SGATHER_${what}$_DATA_${mode}$ #:endfor #:endfor diff --git a/field_gathscat_module.fypp b/field_gathscat_module.fypp index addc234..d2bdfbb 100644 --- a/field_gathscat_module.fypp +++ b/field_gathscat_module.fypp @@ -9,7 +9,7 @@ MODULE FIELD_GATHSCAT_MODULE -#:set fieldTypeList = fieldType.getFieldTypeList () +#:set fieldTypeList = fieldType.getFieldTypeList (hasView=True) USE FIELD_ACCESS_MODULE USE FIELD_FACTORY_MODULE @@ -30,9 +30,24 @@ PUBLIC :: FIELD_GATHSCAT #:for what in ['DEVICE', 'HOST'] #:for mode in ['RDONLY', 'RDWR'] +INTERFACE GATHER_${what}$_DATA_${mode}$ +#:for ft in fieldTypeList + MODULE PROCEDURE ${ft.name}$_GATHER_${what}$_DATA_${mode}$ +#:endfor +END INTERFACE GATHER_${what}$_DATA_${mode}$ + PUBLIC :: GATHER_${what}$_DATA_${mode}$ +INTERFACE SGATHER_${what}$_DATA_${mode}$ +#:for ft in fieldTypeList + MODULE PROCEDURE S${ft.name}$_GATHER_${what}$_DATA_${mode}$ +#:endfor +END INTERFACE SGATHER_${what}$_DATA_${mode}$ + +PUBLIC :: SGATHER_${what}$_DATA_${mode}$ + #:endfor #:endfor + END MODULE FIELD_GATHSCAT_MODULE diff --git a/field_shuffle_module.fypp b/field_shuffle_module.fypp index f40d7a5..dc28e1c 100644 --- a/field_shuffle_module.fypp +++ b/field_shuffle_module.fypp @@ -14,11 +14,15 @@ MODULE FIELD_SHUFFLE_MODULE USE FIELD_ACCESS_MODULE USE FIELD_FACTORY_MODULE USE FIELD_SHUFFLE_TYPE_MODULE +#:for what in ['DEVICE', 'HOST'] +#:for mode in ['RDONLY', 'RDWR'] +USE FIELD_GATHSCAT_MODULE, ONLY : GATHER_${what}$_DATA_${mode}$ +#:endfor +#:endfor ${fieldType.useParkind1 ()}$ #:for ft in fieldTypeList USE ${ft.name}$_SHUFFLE_MODULE -USE ${ft.name}$_GATHER_MODULE 
#:endfor diff --git a/host_alloc_module.fypp b/host_alloc_module.fypp index 118c3eb..f5ef5d6 100644 --- a/host_alloc_module.fypp +++ b/host_alloc_module.fypp @@ -68,7 +68,7 @@ INTERFACE END INTERFACE TYPE :: MEM_BLOCK - TYPE(C_PTR) :: DATA + TYPE(C_PTR) :: DATA = C_NULL_PTR INTEGER(KIND=INT64) :: POS = 0 INTEGER(KIND=INT64) :: SIZE = 0 INTEGER :: NUMFLDS = 0 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a88899a..9893195 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -9,6 +9,9 @@ set(LIBNAME_PREC ${LIBNAME}_${DEFAULT_PRECISION}) +# set DEV_ALLOC_SIZE for tests +set(dev_alloc_size "268435456") # 256MB + ## Host-device ping-pong runner ecbuild_add_test( TARGET main.x @@ -20,6 +23,7 @@ ecbuild_add_test( OpenMP::OpenMP_Fortran $<${HAVE_ACC}:OpenACC::OpenACC_Fortran> LINKER_LANGUAGE Fortran + ENVIRONMENT "DEV_ALLOC_SIZE=${dev_alloc_size}" ) target_link_options( main.x PRIVATE $<${HAVE_CUDA}:-gpu=pinned> ) target_compile_definitions( main.x PRIVATE $<${HAVE_CUDA}:_CUDA> ) @@ -75,7 +79,6 @@ list(APPEND TEST_FILES test_field1d.F90 test_field_array.F90 test_field_delete_on_null.F90 - test_gang.F90 test_get_device_data_wronly.F90 test_host_mem_pool.F90 test_lastdim.F90 @@ -87,6 +90,10 @@ list(APPEND TEST_FILES wrapper_modify_gpu.F90 ) +if(HAVE_FIELD_GANG) + list(APPEND TEST_FILES test_gang.F90) +endif() + #Place-holder for failing tests set(FAILING_TEST_FILES ) @@ -112,8 +119,11 @@ foreach(TEST_FILE ${TEST_FILES}) fiat OpenMP::OpenMP_Fortran $<${HAVE_ACC}:OpenACC::OpenACC_Fortran> + DEFINITIONS + $<${HAVE_FIELD_GANG}:HAVE_FIELD_GANG> LINKER_LANGUAGE Fortran OMP ${omp_num_threads} + ENVIRONMENT "DEV_ALLOC_SIZE=${dev_alloc_size}" ) set_target_properties(${TEST_NAME}.x @@ -137,7 +147,7 @@ foreach(FAILING_TEST_FILE ${FAILING_TEST_FILES}) set_target_properties(${FAILING_TEST_NAME}.x PROPERTIES LINKER_LANGUAGE Fortran) add_test(NAME ${FAILING_TEST_NAME} COMMAND ${FAILING_TEST_NAME}.x) set_property(TEST ${FAILING_TEST_NAME} PROPERTY WILL_FAIL TRUE) 
- set_property(TEST ${FAILING_TEST_NAME} PROPERTY ENVIRONMENT "OMP_NUM_THREADS=${omp_num_threads}") + set_property(TEST ${FAILING_TEST_NAME} PROPERTY ENVIRONMENT "OMP_NUM_THREADS=${omp_num_threads};DEV_ALLOC_SIZE=${dev_alloc_size}") target_link_options( ${FAILING_TEST_NAME}.x PRIVATE $<${HAVE_CUDA}:-gpu=pinned> ) target_compile_definitions( ${FAILING_TEST_NAME}.x PRIVATE $<${HAVE_CUDA}:_CUDA> ) endforeach() @@ -150,7 +160,7 @@ foreach(ABOR1_TEST_FILE ${ABOR1_TEST_FILES}) target_link_libraries(${ABOR1_TEST_NAME}.x PRIVATE $<${HAVE_ACC}:OpenACC::OpenACC_Fortran>) set_target_properties(${ABOR1_TEST_NAME}.x PROPERTIES LINKER_LANGUAGE Fortran) add_test(NAME ${ABOR1_TEST_NAME} COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/abor1catcher.sh "./${ABOR1_TEST_NAME}.x") - set_property(TEST ${ABOR1_TEST_NAME} PROPERTY ENVIRONMENT "OMP_NUM_THREADS=${omp_num_threads}") + set_property(TEST ${ABOR1_TEST_NAME} PROPERTY ENVIRONMENT "OMP_NUM_THREADS=${omp_num_threads};DEV_ALLOC_SIZE=${dev_alloc_size}") target_link_options( ${ABOR1_TEST_NAME}.x PRIVATE $<${HAVE_CUDA}:-gpu=pinned> ) target_compile_definitions( ${ABOR1_TEST_NAME}.x PRIVATE $<${HAVE_CUDA}:_CUDA> ) endforeach() diff --git a/tests/test_wrappernosynconfinal.F90 b/tests/test_wrappernosynconfinal.F90 index b089945..7ddbccf 100644 --- a/tests/test_wrappernosynconfinal.F90 +++ b/tests/test_wrappernosynconfinal.F90 @@ -28,7 +28,9 @@ PROGRAM TEST_WRAPPERNOSYNCONFINAL PRINT *, " SYNC_ON_FINAL = ", LLSYNC_ON_FINAL (JSOF), " INITIALIZED = ", LLINITIALIZED (JSOC), " JINIT = ", JINIT CALL TEST_FIELD +#ifdef HAVE_FIELD_GANG CALL TEST_GANG +#endif ENDDO ENDDO @@ -120,6 +122,7 @@ SUBROUTINE TEST_FIELD END SUBROUTINE +#ifdef HAVE_FIELD_GANG SUBROUTINE TEST_GANG CLASS(FIELD_4RD), POINTER :: YLF4 @@ -218,5 +221,6 @@ SUBROUTINE TEST_GANG DEALLOCATE (ZDATA4) END SUBROUTINE +#endif END PROGRAM