Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalize VGPU drivers installation #93

Merged
merged 19 commits into from
Feb 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions data/cloud/openstack/arbutus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# vGPU driver installation settings for this cloud.
# 'rpm' makes profile::gpu::install::vgpu include the vgpu::rpm class.
profile::gpu::install::vgpu::installer: rpm
# RPM handed to the rpm provider; %{facts.os.release.major} is interpolated
# with the OS major release fact.
profile::gpu::install::vgpu::rpm::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el%{facts.os.release.major}.noarch.rpm
# Packages installed after the package from the source above.
profile::gpu::install::vgpu::rpm::packages:
- nvidia-vgpu-kmod
- nvidia-vgpu-gridd
- nvidia-vgpu-tools
3 changes: 3 additions & 0 deletions data/cloud/openstack/jusuf-cloud.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# vGPU driver installation settings for this cloud.
# 'bin' makes profile::gpu::install::vgpu include the vgpu::bin class.
profile::gpu::install::vgpu::installer: bin
# Value of the $source parameter of profile::gpu::install::vgpu::bin.
profile::gpu::install::vgpu::bin::source: https://hpsrepo.fz-juelich.de/jusuf/nvidia/NVIDIA-Driver.latest
# Value of the $gridd_source parameter of profile::gpu::install::vgpu::bin.
profile::gpu::install::vgpu::bin::gridd_source: https://hpsrepo.fz-juelich.de/jusuf/nvidia/gridd.conf
2 changes: 2 additions & 0 deletions data/software_stack/computecanada.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ profile::squid::server::cvmfs_acl_regex:
- '^(cvmfs-.*\.computecanada\.net)$'
- '^(.*-cvmfs\.openhtc\.io)$'
- '^(cvmfs-.*\.genap\.ca)$'

profile::gpu::install::lib_symlink_path: '/usr/lib64/nvidia'
2 changes: 2 additions & 0 deletions data/software_stack/eessi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@ profile::cvmfs::client::lmod_default_modules:

profile::squid::server::cvmfs_acl_regex:
- '^(.*\.eessi-hpc\.org)$'

profile::gpu::install::lib_symlink_path: '/opt/eessi/lib'
17 changes: 3 additions & 14 deletions site/profile/facts.d/nvidia_driver_vers.sh
Original file line number Diff line number Diff line change
@@ -1,16 +1,5 @@
#!/bin/sh
PROCESSOR=$(uname -p)
VERSION="$(source /etc/os-release; echo $VERSION_ID)"
PACKAGE="cuda-drivers"
PACKAGE_REGEX="${PACKAGE}-\([0-9.]\{1,\}\)[-0-9]*\.${PROCESSOR}"
DRIVER_VERSION=$(test -f /usr/sbin/dkms && /usr/sbin/dkms status | grep -m 1 -Po 'nvidia, \K(\d+.\d+[\.]\d*)')
# If that didn't work let's try nvidia-smi
if [ -z $DRIVER_VERSION ]; then
DRIVER_VERSION=$(if [ -x "$(command -v nvidia-smi)" ]; then nvidia-smi --query-gpu=driver_version --format=csv,noheader; fi)
if [ -e /proc/driver/nvidia ]; then
DRIVER_VERSION=$(grep -m 1 -Po 'NVRM version:.* \K(\d+\.\d+\.\d+)' /proc/driver/nvidia/version)
fi
if [ -z $DRIVER_VERSION ]; then
BASE_URL="http://developer.download.nvidia.com/compute/cuda/repos"
CUDA_REPO_GZ=$(curl -s ${BASE_URL}/rhel${VERSION}/${PROCESSOR}/repodata/repomd.xml | sed '2 s/xmlns=".*"//g' | xmllint --xpath 'string(/repomd/data[@type="primary"]/location/@href)' -)
DRIVER_VERSION=$(curl -s ${BASE_URL}/rhel${VERSION}/${PROCESSOR}/${CUDA_REPO_GZ} | gunzip | sed -n "s/^.*\"${PACKAGE_REGEX}\.rpm\".*$/\1/p" | sort -V | tail -n1)
fi
echo "{ 'nvidia_driver_version' : '${DRIVER_VERSION}' }"
echo "{ 'nvidia_driver_version' : '${DRIVER_VERSION}' }"
239 changes: 169 additions & 70 deletions site/profile/manifests/gpu.pp
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,36 @@
}
}

class profile::gpu::install {
class profile::gpu::install (
String $lib_symlink_path = undef
) {
ensure_resource('file', '/etc/nvidia', {'ensure' => 'directory' })
ensure_packages(['kernel-devel'], {ensure => 'installed'})
ensure_packages(['dkms'], {
'require' => Yumrepo['epel']
})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's something wrong with this because it is not catching this requirement:

Feb 12 13:08:53 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Deps/File[/etc/nvidia]/ensure) created
Feb 12 13:09:07 gpu-node3 yum[2001]: Installed: kernel-devel-3.10.0-1160.15.2.el7.x86_64
Feb 12 13:09:07 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Deps/Package[kernel-devel]/ensure) created
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns)   % Total    % Received % Xferd  Average 
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns)                                  Dload  U
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: [244B blob data]
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) Verifying archive integrity... OK
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) Uncompressing NVIDIA Accelerated Graphics
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) ERROR: Unable to find the development too
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) ERROR: Installation has failed.  Please s
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) Welcome to the NVIDIA Software Installer 
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) Detected 2 CPUs online; setting concurren
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) Tagging shared libraries with chcon -t te
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) Installing NVIDIA driver version 450.89.
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) For some distributions, Nouveau can be di
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) One or more modprobe configuration files 
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: 'curl -L https://hpsrepo.fz-juelich.de/jusuf/nvidia/NVIDIA-Driver.latest -o /tmp/NVIDIA-driver.run && sh /tmp/NVIDIA-driver.run
Feb 12 13:09:31 gpu-node3 puppet-agent[1084]: (/Stage[main]/Profile::Gpu::Install::Vgpu::Bin/Exec[vgpu-driver-install-bin]/returns) change from 'notrun' to ['0'] failed: 'cu

It is installing kernel-devel but not recognising that the dkms requirement is not met.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Later I see dkms being installed (which brings in gcc), so the second time around things will succeed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strategy shift: I have added an explicit requirement on the dkms and kernel-devel packages to the Exec[vgpu-driver-install-bin] resource and removed the deps class.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confirmed, that works, the drivers get installed at the right time.


if ! $facts['nvidia_grid_vgpu'] {
require profile::gpu::install::passthrough
$dkms_requirements = [
Package['kernel-devel'],
Package['kmod-nvidia-latest-dkms']
]
} else {
require profile::gpu::install::vgpu
$dkms_requirements = [
Package['kernel-devel'],
Package['nvidia-vgpu-kmod']
]
}

ensure_packages(['kernel-devel'], {ensure => 'installed'})

exec { 'dkms autoinstall':
path => ['/usr/bin', '/usr/sbin'],
onlyif => 'dkms status | grep -v -q \'nvidia.*installed\'',
timeout => 0,
require => $dkms_requirements,
# The binary installer does not build drivers with DKMS
$installer = lookup('profile::gpu::install::vgpu::installer')
if ! $facts['nvidia_grid_vgpu'] or $installer != 'bin' {
exec { 'dkms autoinstall':
path => ['/usr/bin', '/usr/sbin'],
onlyif => 'dkms status | grep -v -q \'nvidia.*installed\'',
timeout => 0,
require => [
Package['kernel-devel'],
Package['dkms']
]
}
$kmod_require = [Exec['dkms autoinstall']]
} else {
$kmod_require = []
}

kmod::load { [
Expand All @@ -45,42 +53,95 @@
'nvidia_modeset',
'nvidia_uvm'
]:
require => Exec['dkms autoinstall']
require => $kmod_require
}
if $lib_symlink_path {
$lib_symlink_path_split = split($lib_symlink_path, '/')
$lib_symlink_path_split[1,-1].each |Integer $index, String $value| {
ensure_resource('file', join($lib_symlink_path_split[0, $index+2], '/'), {'ensure' => 'directory'})
}

file { '/usr/lib64/nvidia':
ensure => directory
}
$nvidia_libs = [
'libcuda.so.1',
'libcuda.so',
'libEGL_nvidia.so.0',
'libGLESv1_CM_nvidia.so.1',
'libGLESv2_nvidia.so.2',
'libGLX_indirect.so.0',
'libGLX_nvidia.so.0',
'libnvcuvid.so.1',
'libnvcuvid.so',
'libnvidia-cfg.so.1',
'libnvidia-cfg.so',
'libnvidia-encode.so.1',
'libnvidia-encode.so',
'libnvidia-fbc.so.1',
'libnvidia-fbc.so',
'libnvidia-ifr.so.1',
'libnvidia-ifr.so',
'libnvidia-ml.so.1',
'libnvidia-ml.so',
'libnvidia-opencl.so.1',
'libnvidia-opticalflow.so.1',
'libnvidia-ptxjitcompiler.so.1',
'libnvidia-ptxjitcompiler.so',
'libnvoptix.so.1',
]

$nvidia_libs.each |String $lib| {
file { "/usr/lib64/nvidia/${lib}":
cmd-ntrf marked this conversation as resolved.
Show resolved Hide resolved
ensure => link,
target => "/usr/lib64/${lib}",
seltype => 'lib_t'
}
}

# WARNING: since the fact is computed before the Puppet agent run,
# on a clean host the symbolic links to the NVIDIA libraries
# that include the version number will only be created on the
# second Puppet run.
$driver_vers = $::facts['nvidia_driver_version']
if $driver_vers != '' {
$nvidia_libs_vers = [
"libcuda.so.${driver_vers}",
"libEGL_nvidia.so.${driver_vers}",
"libGLESv1_CM_nvidia.so.${driver_vers}",
"libGLESv2_nvidia.so.${driver_vers}",
"libGLX_nvidia.so.${driver_vers}",
"libnvcuvid.so.${driver_vers}",
"libnvidia-cbl.so.${driver_vers}",
"libnvidia-cfg.so.${driver_vers}",
"libnvidia-compiler.so.${driver_vers}",
"libnvidia-eglcore.so.${driver_vers}",
"libnvidia-encode.so.${driver_vers}",
"libnvidia-fatbinaryloader.so.${driver_vers}",
"libnvidia-fbc.so.${driver_vers}",
"libnvidia-glcore.so.${driver_vers}",
"libnvidia-glsi.so.${driver_vers}",
"libnvidia-glvkspirv.so.${driver_vers}",
"libnvidia-ifr.so.${driver_vers}",
"libnvidia-ml.so.${driver_vers}",
"libnvidia-opencl.so.${driver_vers}",
"libnvidia-opticalflow.so.${driver_vers}",
"libnvidia-ptxjitcompiler.so.${driver_vers}",
"libnvidia-rtcore.so.${driver_vers}",
"libnvidia-tls.so.${driver_vers}",
"libnvoptix.so.${driver_vers}"
]

$driver_ver = $::facts['nvidia_driver_version']
$nvidia_libs = [
"libnvidia-ml.so.${driver_ver}", 'libnvidia-ml.so.1', 'libnvidia-fbc.so.1',
"libnvidia-fbc.so.${driver_ver}", 'libnvidia-ifr.so.1', "libnvidia-ifr.so.${driver_ver}",
'libcuda.so', 'libcuda.so.1', "libcuda.so.${driver_ver}", "libnvcuvid.so.${driver_ver}",
'libnvcuvid.so.1', "libnvidia-compiler.so.${driver_ver}", 'libnvidia-encode.so.1',
"libnvidia-encode.so.${driver_ver}", "libnvidia-fatbinaryloader.so.${driver_ver}",
'libnvidia-opencl.so.1', "libnvidia-opencl.so.${driver_ver}", 'libnvidia-opticalflow.so.1',
"libnvidia-opticalflow.so.${driver_ver}", 'libnvidia-ptxjitcompiler.so.1', "libnvidia-ptxjitcompiler.so.${driver_ver}",
'libnvcuvid.so', 'libnvidia-cfg.so', 'libnvidia-encode.so',
'libnvidia-fbc.so', 'libnvidia-ifr.so', 'libnvidia-ml.so',
'libnvidia-ptxjitcompiler.so', 'libEGL_nvidia.so.0', "libEGL_nvidia.so.${driver_ver}",
'libGLESv1_CM_nvidia.so.1', "libGLESv1_CM_nvidia.so.${driver_ver}", 'libGLESv2_nvidia.so.2',
"libGLESv2_nvidia.so.${driver_ver}", 'libGLX_indirect.so.0', 'libGLX_nvidia.so.0',
"libGLX_nvidia.so.${driver_ver}", "libnvidia-cbl.so.${driver_ver}", 'libnvidia-cfg.so.1',
"libnvidia-cfg.so.${driver_ver}", "libnvidia-eglcore.so.${driver_ver}", "libnvidia-glcore.so.${driver_ver}",
"libnvidia-glsi.so.${driver_ver}", "libnvidia-glvkspirv.so.${driver_ver}", "libnvidia-rtcore.so.${driver_ver}",
"libnvidia-tls.so.${driver_ver}", 'libnvoptix.so.1', "libnvoptix.so.${driver_ver}"]

$nvidia_libs.each |String $lib| {
file { "/usr/lib64/nvidia/${lib}":
ensure => link,
target => "/usr/lib64/${lib}",
seltype => 'lib_t'
$nvidia_libs_vers.each |String $lib| {
file { "/usr/lib64/nvidia/${lib}":
ensure => link,
target => "/usr/lib64/${lib}",
seltype => 'lib_t'
}
}
}
}
}

class profile::gpu::install::passthrough(Array[String] $packages) {

$cuda_ver = $::facts['nvidia_cuda_version']
$os = "rhel${::facts['os']['release']['major']}"
$arch = $::facts['os']['architecture']
Expand All @@ -92,10 +153,6 @@
source => "http://developer.download.nvidia.com/compute/cuda/repos/${os}/${arch}/${repo_name}-${cuda_ver}.${arch}.rpm"
}

ensure_packages(['dkms'], {
'require' => Yumrepo['epel']
})

package { $packages:
ensure => 'installed',
require => [Package['cuda-repo'], Package['dkms']]
Expand All @@ -118,33 +175,75 @@
}
}

class profile::gpu::install::vgpu {
$os = $::facts['os']['release']['major']
$repo_name = 'arbutus-cloud-vgpu-repo.noarch'
package { 'arbutus-cloud-vgpu-repo':
ensure => 'installed',
provider => 'rpm',
name => $repo_name,
source => "http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el${os}.noarch.rpm",
# Selects how the NVIDIA vGPU driver gets installed.
#
# @param installer
#   'rpm' includes profile::gpu::install::vgpu::rpm, 'bin' includes
#   profile::gpu::install::vgpu::bin, and 'none' (the default) includes
#   neither class.
class profile::gpu::install::vgpu(
Enum['rpm', 'bin', 'none'] $installer = 'none',
)
{
if $installer == 'rpm' {
# Install from RPM packages
include profile::gpu::install::vgpu::rpm
} elsif $installer == 'bin' {
# Install from the binary installer
include profile::gpu::install::vgpu::bin
}
}

package { ['nvidia-vgpu-kmod', 'nvidia-vgpu-gridd', 'nvidia-vgpu-tools']:
ensure => 'installed',
# Installs the vGPU driver from RPM packages.
#
# The package found at $source is installed with the rpm provider under the
# resource title 'vgpu-repo'; $packages are installed afterwards, and finally
# the permissions of /usr/bin/nvidia-modprobe are enforced.
#
# @param source
#   Location of the RPM given to the rpm provider.
# @param packages
#   Names of the packages to install once Package['vgpu-repo'] and
#   Yumrepo['epel'] have been applied.
class profile::gpu::install::vgpu::rpm(
String $source,
Array[String] $packages,
)
{
# Package name: the last '/'-separated component of $source, truncated
# at its first '.'.
$source_pkg_name = split(split($source, '[/]')[-1], '[.]')[0]
package { 'vgpu-repo':
ensure => 'latest',
provider => 'rpm',
name => $source_pkg_name,
source => $source,
}

package { $packages:
ensure => 'installed',
require => [
Yumrepo['epel'],
Package['vgpu-repo'],
]
}

# The device files /dev/nvidia* are normally created by nvidia-modprobe.
# If the permissions of nvidia-modprobe exclude setuid, some device files
# will be missing.
# https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#runfile-verifications
# The chaining arrow applies this file resource after the packages above.
-> file { '/usr/bin/nvidia-modprobe':
ensure => present,
mode => '4755',
owner => 'root',
group => 'root',
}
}

class profile::gpu::install::vgpu::bin(
String $source,
String $gridd_source,
)
{
exec { 'vgpu-driver-install-bin':
command => "curl -L ${source} -o /tmp/NVIDIA-driver.run && sh /tmp/NVIDIA-driver.run --ui=none --no-questions --disable-nouveau && rm /tmp/NVIDIA-driver.run",
path => ['/bin', '/usr/bin', '/sbin','/usr/sbin'],
creates => [
'/usr/bin/nvidia-smi',
'/usr/bin/nvidia-modprobe',
],
timeout => 300,
require => [
Yumrepo['epel'],
Package['arbutus-cloud-vgpu-repo'],
Package['kernel-devel'],
Package['dkms'],
]
}

# The device files /dev/nvidia* are normally created by nvidia-modprobe.
# If the permissions of nvidia-modprobe exclude setuid, some device files
# will be missing.
# https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#runfile-verifications
file { '/usr/bin/nvidia-modprobe':
ensure => present,
mode => '4755',
owner => 'root',
group => 'root',
require => Package['nvidia-vgpu-tools'],
file { '/etc/nvidia/gridd.conf':
ensure => present,
mode => '0644',
owner => 'root',
group => 'root',
source => $gridd_source,
}
}