diff --git a/data/cloud/openstack/arbutus.yaml b/data/cloud/openstack/arbutus.yaml
new file mode 100644
index 000000000..76f917c9e
--- /dev/null
+++ b/data/cloud/openstack/arbutus.yaml
@@ -0,0 +1,6 @@
+profile::gpu::install::vgpu::installer: rpm
+profile::gpu::install::vgpu::rpm::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el%{facts.os.release.major}.noarch.rpm
+profile::gpu::install::vgpu::rpm::packages:
+  - nvidia-vgpu-kmod
+  - nvidia-vgpu-gridd
+  - nvidia-vgpu-tools
diff --git a/data/cloud/openstack/jusuf-cloud.yaml b/data/cloud/openstack/jusuf-cloud.yaml
new file mode 100644
index 000000000..a0d6fb46d
--- /dev/null
+++ b/data/cloud/openstack/jusuf-cloud.yaml
@@ -0,0 +1,3 @@
+profile::gpu::install::vgpu::installer: bin
+profile::gpu::install::vgpu::bin::source: https://hpsrepo.fz-juelich.de/jusuf/nvidia/NVIDIA-Driver.latest
+profile::gpu::install::vgpu::bin::gridd_source: https://hpsrepo.fz-juelich.de/jusuf/nvidia/gridd.conf
diff --git a/data/software_stack/computecanada.yaml b/data/software_stack/computecanada.yaml
index 040ec6213..57fe1b01a 100644
--- a/data/software_stack/computecanada.yaml
+++ b/data/software_stack/computecanada.yaml
@@ -22,3 +22,5 @@ profile::squid::server::cvmfs_acl_regex:
   - '^(cvmfs-.*\.computecanada\.net)$'
   - '^(.*-cvmfs\.openhtc\.io)$'
   - '^(cvmfs-.*\.genap\.ca)$'
+
+profile::gpu::install::lib_symlink_path: '/usr/lib64/nvidia'
diff --git a/data/software_stack/eessi.yaml b/data/software_stack/eessi.yaml
index fba8f6860..ee1586ed4 100644
--- a/data/software_stack/eessi.yaml
+++ b/data/software_stack/eessi.yaml
@@ -24,3 +24,5 @@ profile::cvmfs::client::lmod_default_modules:
 
 profile::squid::server::cvmfs_acl_regex:
   - '^(.*\.eessi-hpc\.org)$'
+
+profile::gpu::install::lib_symlink_path: '/opt/eessi/lib'
diff --git a/site/profile/facts.d/nvidia_driver_vers.sh b/site/profile/facts.d/nvidia_driver_vers.sh
index 0f804a112..cc36a85b7 100755
--- a/site/profile/facts.d/nvidia_driver_vers.sh
+++ b/site/profile/facts.d/nvidia_driver_vers.sh
@@ -1,16 +1,10 @@
 #!/bin/sh
-PROCESSOR=$(uname -p)
-VERSION="$(source /etc/os-release; echo $VERSION_ID)"
-PACKAGE="cuda-drivers"
-PACKAGE_REGEX="${PACKAGE}-\([0-9.]\{1,\}\)[-0-9]*\.${PROCESSOR}"
-DRIVER_VERSION=$(test -f /usr/sbin/dkms && /usr/sbin/dkms status | grep -m 1 -Po 'nvidia, \K(\d+.\d+[\.]\d*)')
-# If that didn't work let's try nvidia-smi
-if [ -z $DRIVER_VERSION ]; then
-  DRIVER_VERSION=$(if [ -x "$(command -v nvidia-smi)" ]; then nvidia-smi --query-gpu=driver_version --format=csv,noheader; fi)
+# External fact: report the version of the NVIDIA kernel driver currently loaded.
+# The value is read from /proc/driver/nvidia/version and is left empty when
+# that file cannot be read.
+DRIVER_VERSION=""
+if [ -r /proc/driver/nvidia/version ]; then
+  # The third version component is optional (some releases are e.g. 440.64).
+  DRIVER_VERSION=$(grep -m 1 -Po 'NVRM version:.* \K(\d+\.\d+(\.\d+)?)' /proc/driver/nvidia/version)
 fi
-if [ -z $DRIVER_VERSION ]; then
-  BASE_URL="http://developer.download.nvidia.com/compute/cuda/repos"
-  CUDA_REPO_GZ=$(curl -s ${BASE_URL}/rhel${VERSION}/${PROCESSOR}/repodata/repomd.xml | sed '2 s/xmlns=".*"//g' | xmllint --xpath 'string(/repomd/data[@type="primary"]/location/@href)' -)
-  DRIVER_VERSION=$(curl -s ${BASE_URL}/rhel${VERSION}/${PROCESSOR}/${CUDA_REPO_GZ} | gunzip | sed -n "s/^.*\"${PACKAGE_REGEX}\.rpm\".*$/\1/p" | sort -V | tail -n1)
-fi
-echo "{ 'nvidia_driver_version' : '${DRIVER_VERSION}' }"
+echo "{ 'nvidia_driver_version' : '${DRIVER_VERSION}' }"
diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp
index 8d1e5f76f..a911e3207 100644
--- a/site/profile/manifests/gpu.pp
+++ b/site/profile/manifests/gpu.pp
@@ -15,28 +15,46 @@
   }
 }
 
-class profile::gpu::install {
+# Install the NVIDIA driver (passthrough or vGPU) and load its kernel modules.
+#
+# @param lib_symlink_path
+#   Directory in which symbolic links to the NVIDIA libraries of /usr/lib64
+#   are created. No symbolic link is created when it is undefined.
+class profile::gpu::install (
+  Optional[String[1]] $lib_symlink_path = undef
+) {
+  ensure_resource('file', '/etc/nvidia', {'ensure' => 'directory' })
+  ensure_packages(['kernel-devel'], {ensure => 'installed'})
+  ensure_packages(['dkms'], {
+    'require' => Yumrepo['epel']
+  })
+
+  # NOTE(review): Package['dkms'] is now declared in this class, while
+  # profile::gpu::install::passthrough (required below) still has packages
+  # that require Package['dkms'] -- confirm this does not introduce a
+  # dependency cycle.
   if ! $facts['nvidia_grid_vgpu'] {
     require profile::gpu::install::passthrough
-    $dkms_requirements = [
-      Package['kernel-devel'],
-      Package['kmod-nvidia-latest-dkms']
-    ]
   } else {
     require profile::gpu::install::vgpu
-    $dkms_requirements = [
-      Package['kernel-devel'],
-      Package['nvidia-vgpu-kmod']
-    ]
   }
 
-  ensure_packages(['kernel-devel'], {ensure => 'installed'})
-
-  exec { 'dkms autoinstall':
-    path    => ['/usr/bin', '/usr/sbin'],
-    onlyif  => 'dkms status | grep -v -q \'nvidia.*installed\'',
-    timeout => 0,
-    require => $dkms_requirements,
+  # Binary installers do not build drivers with DKMS.
+  # The key is only defined for clouds providing vGPU, so default to 'none'.
+  $installer = lookup('profile::gpu::install::vgpu::installer', String, 'first', 'none')
+  if ! $facts['nvidia_grid_vgpu'] or $installer != 'bin' {
+    exec { 'dkms autoinstall':
+      path    => ['/usr/bin', '/usr/sbin'],
+      onlyif  => 'dkms status | grep -v -q \'nvidia.*installed\'',
+      timeout => 0,
+      require => [
+        Package['kernel-devel'],
+        Package['dkms']
+      ]
+    }
+    $kmod_require = [Exec['dkms autoinstall']]
+  } else {
+    $kmod_require = []
   }
 
   kmod::load { [
@@ -45,42 +63,96 @@
     'nvidia_modeset',
     'nvidia_uvm'
   ]:
-    require => Exec['dkms autoinstall']
+    require => $kmod_require
   }
+  if $lib_symlink_path {
+    # Declare every directory leading to $lib_symlink_path, one level at a time.
+    $lib_symlink_path_split = split($lib_symlink_path, '/')
+    $lib_symlink_path_split[1,-1].each |Integer $index, String $value| {
+      ensure_resource('file', join($lib_symlink_path_split[0, $index+2], '/'), {'ensure' => 'directory'})
+    }
 
-  file { '/usr/lib64/nvidia':
-    ensure => directory
-  }
+    $nvidia_libs = [
+      'libcuda.so.1',
+      'libcuda.so',
+      'libEGL_nvidia.so.0',
+      'libGLESv1_CM_nvidia.so.1',
+      'libGLESv2_nvidia.so.2',
+      'libGLX_indirect.so.0',
+      'libGLX_nvidia.so.0',
+      'libnvcuvid.so.1',
+      'libnvcuvid.so',
+      'libnvidia-cfg.so.1',
+      'libnvidia-cfg.so',
+      'libnvidia-encode.so.1',
+      'libnvidia-encode.so',
+      'libnvidia-fbc.so.1',
+      'libnvidia-fbc.so',
+      'libnvidia-ifr.so.1',
+      'libnvidia-ifr.so',
+      'libnvidia-ml.so.1',
+      'libnvidia-ml.so',
+      'libnvidia-opencl.so.1',
+      'libnvidia-opticalflow.so.1',
+      'libnvidia-ptxjitcompiler.so.1',
+      'libnvidia-ptxjitcompiler.so',
+      'libnvoptix.so.1',
+    ]
+
+    $nvidia_libs.each |String $lib| {
+      file { "${lib_symlink_path}/${lib}":
+        ensure  => link,
+        target  => "/usr/lib64/${lib}",
+        seltype => 'lib_t'
+      }
+    }
+
+    # WARNING : since the fact is computed before Puppet agent run,
+    # on a clean host, the symbolic links to the NVIDIA libraries
+    # that include the version number will be created on the
+    # second Puppet run only.
+    $driver_vers = $::facts['nvidia_driver_version']
+    if $driver_vers =~ String[1] {
+      $nvidia_libs_vers = [
+        "libcuda.so.${driver_vers}",
+        "libEGL_nvidia.so.${driver_vers}",
+        "libGLESv1_CM_nvidia.so.${driver_vers}",
+        "libGLESv2_nvidia.so.${driver_vers}",
+        "libGLX_nvidia.so.${driver_vers}",
+        "libnvcuvid.so.${driver_vers}",
+        "libnvidia-cbl.so.${driver_vers}",
+        "libnvidia-cfg.so.${driver_vers}",
+        "libnvidia-compiler.so.${driver_vers}",
+        "libnvidia-eglcore.so.${driver_vers}",
+        "libnvidia-encode.so.${driver_vers}",
+        "libnvidia-fatbinaryloader.so.${driver_vers}",
+        "libnvidia-fbc.so.${driver_vers}",
+        "libnvidia-glcore.so.${driver_vers}",
+        "libnvidia-glsi.so.${driver_vers}",
+        "libnvidia-glvkspirv.so.${driver_vers}",
+        "libnvidia-ifr.so.${driver_vers}",
+        "libnvidia-ml.so.${driver_vers}",
+        "libnvidia-opencl.so.${driver_vers}",
+        "libnvidia-opticalflow.so.${driver_vers}",
+        "libnvidia-ptxjitcompiler.so.${driver_vers}",
+        "libnvidia-rtcore.so.${driver_vers}",
+        "libnvidia-tls.so.${driver_vers}",
+        "libnvoptix.so.${driver_vers}"
+      ]
 
-  $driver_ver = $::facts['nvidia_driver_version']
-  $nvidia_libs = [
-    "libnvidia-ml.so.${driver_ver}", 'libnvidia-ml.so.1', 'libnvidia-fbc.so.1',
-    "libnvidia-fbc.so.${driver_ver}", 'libnvidia-ifr.so.1', "libnvidia-ifr.so.${driver_ver}",
-    'libcuda.so', 'libcuda.so.1', "libcuda.so.${driver_ver}", "libnvcuvid.so.${driver_ver}",
-    'libnvcuvid.so.1', "libnvidia-compiler.so.${driver_ver}", 'libnvidia-encode.so.1',
-    "libnvidia-encode.so.${driver_ver}", "libnvidia-fatbinaryloader.so.${driver_ver}",
-    'libnvidia-opencl.so.1', "libnvidia-opencl.so.${driver_ver}", 'libnvidia-opticalflow.so.1',
-    "libnvidia-opticalflow.so.${driver_ver}", 'libnvidia-ptxjitcompiler.so.1', "libnvidia-ptxjitcompiler.so.${driver_ver}",
-    'libnvcuvid.so', 'libnvidia-cfg.so', 'libnvidia-encode.so',
-    'libnvidia-fbc.so', 'libnvidia-ifr.so', 'libnvidia-ml.so',
-    'libnvidia-ptxjitcompiler.so', 'libEGL_nvidia.so.0', "libEGL_nvidia.so.${driver_ver}",
-    'libGLESv1_CM_nvidia.so.1', "libGLESv1_CM_nvidia.so.${driver_ver}", 'libGLESv2_nvidia.so.2',
-    "libGLESv2_nvidia.so.${driver_ver}", 'libGLX_indirect.so.0', 'libGLX_nvidia.so.0',
-    "libGLX_nvidia.so.${driver_ver}", "libnvidia-cbl.so.${driver_ver}", 'libnvidia-cfg.so.1',
-    "libnvidia-cfg.so.${driver_ver}", "libnvidia-eglcore.so.${driver_ver}", "libnvidia-glcore.so.${driver_ver}",
-    "libnvidia-glsi.so.${driver_ver}", "libnvidia-glvkspirv.so.${driver_ver}", "libnvidia-rtcore.so.${driver_ver}",
-    "libnvidia-tls.so.${driver_ver}", 'libnvoptix.so.1', "libnvoptix.so.${driver_ver}"]
-
-  $nvidia_libs.each |String $lib| {
-    file { "/usr/lib64/nvidia/${lib}":
-      ensure  => link,
-      target  => "/usr/lib64/${lib}",
-      seltype => 'lib_t'
+      $nvidia_libs_vers.each |String $lib| {
+        file { "${lib_symlink_path}/${lib}":
+          ensure  => link,
+          target  => "/usr/lib64/${lib}",
+          seltype => 'lib_t'
+        }
+      }
     }
   }
 }
 
 class profile::gpu::install::passthrough(Array[String] $packages) {
+
   $cuda_ver = $::facts['nvidia_cuda_version']
   $os = "rhel${::facts['os']['release']['major']}"
   $arch = $::facts['os']['architecture']
@@ -92,10 +164,6 @@
     source   => "http://developer.download.nvidia.com/compute/cuda/repos/${os}/${arch}/${repo_name}-${cuda_ver}.${arch}.rpm"
   }
 
-  ensure_packages(['dkms'], {
-    'require' => Yumrepo['epel']
-  })
-
   package { $packages:
     ensure  => 'installed',
     require => [Package['cuda-repo'], Package['dkms']]
@@ -118,33 +186,92 @@
   }
 }
 
-class profile::gpu::install::vgpu {
-  $os = $::facts['os']['release']['major']
-  $repo_name = 'arbutus-cloud-vgpu-repo.noarch'
-  package { 'arbutus-cloud-vgpu-repo':
-    ensure   => 'installed',
-    provider => 'rpm',
-    name     => $repo_name,
-    source   => "http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el${os}.noarch.rpm",
+# Install the NVIDIA vGPU driver with the installer selected in hieradata.
+#
+# @param installer
+#   'rpm' includes profile::gpu::install::vgpu::rpm, 'bin' includes
+#   profile::gpu::install::vgpu::bin and 'none' installs nothing.
+class profile::gpu::install::vgpu(
+  Enum['rpm', 'bin', 'none'] $installer = 'none',
+)
+{
+  if $installer == 'rpm' {
+    include profile::gpu::install::vgpu::rpm
+  } elsif $installer == 'bin' {
+    # install from binary installer
+    include profile::gpu::install::vgpu::bin
   }
+}
 
-  package { ['nvidia-vgpu-kmod', 'nvidia-vgpu-gridd', 'nvidia-vgpu-tools']:
-    ensure  => 'installed',
+# Install the vGPU driver from RPM packages.
+#
+# @param source
+#   URL of the RPM installed as Package['vgpu-repo'] before the driver packages.
+# @param packages
+#   Names of the driver packages to install.
+class profile::gpu::install::vgpu::rpm(
+  String $source,
+  Array[String] $packages,
+)
+{
+  $source_pkg_name = split(split($source, '[/]')[-1], '[.]')[0]
+  package { 'vgpu-repo':
+    ensure   => 'latest',
+    provider => 'rpm',
+    name     => $source_pkg_name,
+    source   => $source,
+  }
+
+  package { $packages:
+    ensure  => 'installed',
+    require => [
+      Yumrepo['epel'],
+      Package['vgpu-repo'],
+    ]
+  }
+
+  # The device files /dev/nvidia* are normally created by nvidia-modprobe.
+  # If the permissions of nvidia-modprobe exclude setuid, some device files
+  # will be missing.
+  # https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#runfile-verifications
+  -> file { '/usr/bin/nvidia-modprobe':
+    ensure => present,
+    mode   => '4755',
+    owner  => 'root',
+    group  => 'root',
+  }
+}
+
+# Install the vGPU driver with a binary installer downloaded with curl.
+#
+# @param source
+#   URL of the binary installer.
+# @param gridd_source
+#   Source of the content of /etc/nvidia/gridd.conf.
+class profile::gpu::install::vgpu::bin(
+  String $source,
+  String $gridd_source,
+)
+{
+  exec { 'vgpu-driver-install-bin':
+    command => "curl -L ${source} -o /tmp/NVIDIA-driver.run && sh /tmp/NVIDIA-driver.run --ui=none --no-questions --disable-nouveau && rm /tmp/NVIDIA-driver.run",
+    path    => ['/bin', '/usr/bin', '/sbin','/usr/sbin'],
+    creates => [
+      '/usr/bin/nvidia-smi',
+      '/usr/bin/nvidia-modprobe',
+    ],
+    timeout => 300,
     require => [
-      Yumrepo['epel'],
-      Package['arbutus-cloud-vgpu-repo'],
+      Package['kernel-devel'],
+      Package['dkms'],
     ]
   }
 
-  # The device files/dev/nvidia* are normally created by nvidia-modprobe
-  # If the permissions of nvidia-modprobe exclude setuid, some device files
-  # will be missing.
-  # https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#runfile-verifications
-  file { '/usr/bin/nvidia-modprobe':
-    ensure  => present,
-    mode    => '4755',
-    owner   => 'root',
-    group   => 'root',
-    require => Package['nvidia-vgpu-tools'],
+  file { '/etc/nvidia/gridd.conf':
+    ensure => present,
+    mode   => '0644',
+    owner  => 'root',
+    group  => 'root',
+    source => $gridd_source,
   }
 }