From 7cd727fb0fa81e7a9c56c2fb39179ccec6d96886 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 9 Feb 2021 09:46:52 -0500 Subject: [PATCH 01/19] Add cloud provide and region to static fact template --- data/cloud/openstack/arbutus.yaml | 6 +++ site/profile/manifests/gpu.pp | 66 +++++++++++++++---------------- 2 files changed, 37 insertions(+), 35 deletions(-) create mode 100644 data/cloud/openstack/arbutus.yaml diff --git a/data/cloud/openstack/arbutus.yaml b/data/cloud/openstack/arbutus.yaml new file mode 100644 index 000000000..528be5b45 --- /dev/null +++ b/data/cloud/openstack/arbutus.yaml @@ -0,0 +1,6 @@ +profile::gpu::install::vgpu::source_type: rpm +profile::gpu::install::vgpu::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el${os}.noarch.rpm +profile::gpu::install::vgpu::packages: + - nvidia-vgpu-kmod + - nvidia-vgpu-gridd + - nvidia-vgpu-tools \ No newline at end of file diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index 8d1e5f76f..4d1e62a6c 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -18,16 +18,8 @@ class profile::gpu::install { if ! $facts['nvidia_grid_vgpu'] { require profile::gpu::install::passthrough - $dkms_requirements = [ - Package['kernel-devel'], - Package['kmod-nvidia-latest-dkms'] - ] } else { require profile::gpu::install::vgpu - $dkms_requirements = [ - Package['kernel-devel'], - Package['nvidia-vgpu-kmod'] - ] } ensure_packages(['kernel-devel'], {ensure => 'installed'}) @@ -36,7 +28,7 @@ path => ['/usr/bin', '/usr/sbin'], onlyif => 'dkms status | grep -v -q \'nvidia.*installed\'', timeout => 0, - require => $dkms_requirements, + require => Package['kernel-devel'], } kmod::load { [ @@ -118,33 +110,37 @@ } } -class profile::gpu::install::vgpu { - $os = $::facts['os']['release']['major'] - $repo_name = 'arbutus-cloud-vgpu-repo.noarch' - package { 'arbutus-cloud-vgpu-repo': - ensure => 'installed', - provider => 'rpm', - name => $repo_name, - source => "http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el${os}.noarch.rpm", - } +class profile::gpu::install::vgpu( + Enum['rpm', 'installer'] $source_type, + String $source, + List[String] $packages = [], +) +{ + if $source_type == 'rpm' { + package { 'vgpu-repo': + ensure => 'installed', + provider => 'rpm', + name => 'vgpu-repo.noarch', + source => $source, + } - package { ['nvidia-vgpu-kmod', 'nvidia-vgpu-gridd', 'nvidia-vgpu-tools']: - ensure => 'installed', - require => [ - Yumrepo['epel'], - Package['arbutus-cloud-vgpu-repo'], - ] - } + package { $packages: + ensure => 'installed', + require => [ + Yumrepo['epel'], + Package['vgpu-repo'], + ] + } - # The device files/dev/nvidia* are normally created by nvidia-modprobe - # If the permissions of nvidia-modprobe exclude setuid, some device files - # will be missing. - # https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#runfile-verifications - file { '/usr/bin/nvidia-modprobe': - ensure => present, - mode => '4755', - owner => 'root', - group => 'root', - require => Package['nvidia-vgpu-tools'], + # The device files/dev/nvidia* are normally created by nvidia-modprobe + # If the permissions of nvidia-modprobe exclude setuid, some device files + # will be missing. + # https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#runfile-verifications + -> file { '/usr/bin/nvidia-modprobe': + ensure => present, + mode => '4755', + owner => 'root', + group => 'root', + } } } From 15fee35ce268decda8c0067042a9e274acfc8fb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 9 Feb 2021 11:36:48 -0500 Subject: [PATCH 02/19] Replace List by Array --- site/profile/manifests/gpu.pp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index 4d1e62a6c..c7447dd7c 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -113,7 +113,7 @@ class profile::gpu::install::vgpu( Enum['rpm', 'installer'] $source_type, String $source, - List[String] $packages = [], + Array[String] $packages = [], ) { if $source_type == 'rpm' { From 50f490ca9f680d8af5d2202ed96c160e2039f815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 9 Feb 2021 12:12:31 -0500 Subject: [PATCH 03/19] Fix arbutus hieradata --- data/cloud/openstack/arbutus.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cloud/openstack/arbutus.yaml b/data/cloud/openstack/arbutus.yaml index 528be5b45..741dc1eca 100644 --- a/data/cloud/openstack/arbutus.yaml +++ b/data/cloud/openstack/arbutus.yaml @@ -1,5 +1,5 @@ profile::gpu::install::vgpu::source_type: rpm -profile::gpu::install::vgpu::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el${os}.noarch.rpm +profile::gpu::install::vgpu::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el%{hiera('facts::os::elease::major)}.noarch.rpm profile::gpu::install::vgpu::packages: - nvidia-vgpu-kmod - nvidia-vgpu-gridd From 410ebcec0fbb1c39210848a1961a5bcbdde1050b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 9 Feb 2021 12:14:31 -0500 Subject: [PATCH 04/19] Add missing quote --- data/cloud/openstack/arbutus.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cloud/openstack/arbutus.yaml b/data/cloud/openstack/arbutus.yaml index 741dc1eca..0b5a01f9d 100644 --- a/data/cloud/openstack/arbutus.yaml +++ b/data/cloud/openstack/arbutus.yaml @@ -1,5 +1,5 @@ profile::gpu::install::vgpu::source_type: rpm -profile::gpu::install::vgpu::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el%{hiera('facts::os::elease::major)}.noarch.rpm +profile::gpu::install::vgpu::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el%{hiera('facts::os::elease::major')}.noarch.rpm profile::gpu::install::vgpu::packages: - nvidia-vgpu-kmod - nvidia-vgpu-gridd From 7650a48db31538aa7bd5418ccb078c60bdb9b362 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 9 Feb 2021 12:40:47 -0500 Subject: [PATCH 05/19] Fix source for vgpu repo in arbutus --- data/cloud/openstack/arbutus.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cloud/openstack/arbutus.yaml b/data/cloud/openstack/arbutus.yaml index 0b5a01f9d..30dc5dbe8 100644 --- a/data/cloud/openstack/arbutus.yaml +++ b/data/cloud/openstack/arbutus.yaml @@ -1,5 +1,5 @@ profile::gpu::install::vgpu::source_type: rpm -profile::gpu::install::vgpu::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el%{hiera('facts::os::elease::major')}.noarch.rpm +profile::gpu::install::vgpu::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el%{facts.os.release.major}.noarch.rpm profile::gpu::install::vgpu::packages: - nvidia-vgpu-kmod - nvidia-vgpu-gridd From 2ee65634ca85d69963661e76d29f465551c9d72c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 9 Feb 2021 13:34:19 -0500 Subject: [PATCH 06/19] Decompose vgpu install --- data/cloud/openstack/arbutus.yaml | 6 +++--- site/profile/manifests/gpu.pp | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/data/cloud/openstack/arbutus.yaml b/data/cloud/openstack/arbutus.yaml index 30dc5dbe8..76f917c9e 100644 --- a/data/cloud/openstack/arbutus.yaml +++ b/data/cloud/openstack/arbutus.yaml @@ -1,6 +1,6 @@ -profile::gpu::install::vgpu::source_type: rpm -profile::gpu::install::vgpu::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el%{facts.os.release.major}.noarch.rpm -profile::gpu::install::vgpu::packages: +profile::gpu::install::vgpu::installer: rpm +profile::gpu::install::vgpu::rpm::source: http://repo.arbutus.cloud.computecanada.ca/pulp/repos/centos/arbutus-cloud-vgpu-repo.el%{facts.os.release.major}.noarch.rpm +profile::gpu::install::vgpu::rpm::packages: - nvidia-vgpu-kmod - nvidia-vgpu-gridd - nvidia-vgpu-tools \ No newline at end of file diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index c7447dd7c..b554988ea 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -111,12 +111,21 @@ } class profile::gpu::install::vgpu( - Enum['rpm', 'installer'] $source_type, + Enum['rpm', 'bin', 'none'] $installer = 'none', +) +{ + if $installer == 'rpm' { + include profile::gpu::install::vgpu::rpm + } elsif $installer == 'bin' { + # install from binary installer + } +} + +class profile::gpu::install::vgpu::rpm( String $source, - Array[String] $packages = [], + Array[String] $packages, ) { - if $source_type == 'rpm' { package { 'vgpu-repo': ensure => 'installed', provider => 'rpm', @@ -142,5 +151,4 @@ owner => 'root', group => 'root', } - } } From 696a14ed62b2a649d92ac7c4931880badffa6581 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 9 Feb 2021 14:25:57 -0500 Subject: [PATCH 07/19] Move dkms dependency up --- site/profile/manifests/gpu.pp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index b554988ea..f18f51868 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -23,12 +23,18 @@ } ensure_packages(['kernel-devel'], {ensure => 'installed'}) + ensure_packages(['dkms'], { + 'require' => Yumrepo['epel'] + }) exec { 'dkms autoinstall': path => ['/usr/bin', '/usr/sbin'], onlyif => 'dkms status | grep -v -q \'nvidia.*installed\'', timeout => 0, - require => Package['kernel-devel'], + require => [ + Package['kernel-devel'], + Package['dkms'] + ] } kmod::load { [ @@ -84,10 +90,6 @@ source => "http://developer.download.nvidia.com/compute/cuda/repos/${os}/${arch}/${repo_name}-${cuda_ver}.${arch}.rpm" } - ensure_packages(['dkms'], { - 'require' => Yumrepo['epel'] - }) - package { $packages: ensure => 'installed', require => [Package['cuda-repo'], Package['dkms']] From c4031537bcb587e7396b14d0f1832120ff065bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 9 Feb 2021 15:37:17 -0500 Subject: [PATCH 08/19] Add class to install nvidia vgpu driver from bin installer --- site/profile/manifests/gpu.pp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index f18f51868..f8e43b7fe 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -16,6 +16,8 @@ } class profile::gpu::install { + ensure_resource('file', '/etc/nvidia', {'ensure' => 'directory' }) + if ! $facts['nvidia_grid_vgpu'] { require profile::gpu::install::passthrough } else { @@ -154,3 +156,27 @@ group => 'root', } } + +class profile::gpu::install::vgpu::bin( + String $source, + String $gridd_source, +) +{ + exec { 'vgpu-driver-install-bin': + command => "curl -L ${source} -o /tmp/NVIDIA-driver.run && sh /tmp/NVIDIA-driver.run --ui=none --no-questions --disable-nouveau; rm /tmp/NVIDIA-driver.run", + path => ['/bin', '/usr/bin', '/sbin','/usr/sbin'], + creates => [ + '/usr/bin/nvidia-smi', + '/usr/bin/nvidia-modprobe', + ], + timeout => 300, + } + + file { '/etc/nvidia/gridd.conf': + ensure => present, + mode => '0644', + owner => 'root', + group => 'root', + source => $gridd_source, + } +} From 473b4635e1ac99728cccfb5b6356a3d1831148fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 9 Feb 2021 15:38:50 -0500 Subject: [PATCH 09/19] Add hieradata for openstack jusuf-cloud --- data/cloud/openstack/jusuf-cloud.yaml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 data/cloud/openstack/jusuf-cloud.yaml diff --git a/data/cloud/openstack/jusuf-cloud.yaml b/data/cloud/openstack/jusuf-cloud.yaml new file mode 100644 index 000000000..0164bea10 --- /dev/null +++ b/data/cloud/openstack/jusuf-cloud.yaml @@ -0,0 +1,3 @@ +profile::gpu::install::vgpu::installer: bin +profile::gpu::install::vgpu::bin::source: https://hpsrepo.fz-juelich.de/jusuf/nvidia/NVIDIA-Driver.latest +profile::gpu::install::vgpu::bin::gridd_conf: https://hpsrepo.fz-juelich.de/jusuf/nvidia/gridd.conf \ No newline at end of file From ec21ef35cc3b0b0c562f32165c7caaeeda627d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Wed, 10 Feb 2021 08:35:05 -0500 Subject: [PATCH 10/19] Fix vgpu package name --- site/profile/manifests/gpu.pp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index f8e43b7fe..1c3ebd01a 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -130,10 +130,11 @@ Array[String] $packages, ) { + $source_pkg_name = split(split($source, '/')[-1], '.')[0] package { 'vgpu-repo': - ensure => 'installed', + ensure => 'latest', provider => 'rpm', - name => 'vgpu-repo.noarch', + name => $source_pkg_name, source => $source, } From 7d53a367aa22abf5c7382cd6ac286f17a8d464f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Wed, 10 Feb 2021 08:50:42 -0500 Subject: [PATCH 11/19] Fix split command --- site/profile/manifests/gpu.pp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index 1c3ebd01a..66eb2ae4d 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -130,7 +130,7 @@ Array[String] $packages, ) { - $source_pkg_name = split(split($source, '/')[-1], '.')[0] + $source_pkg_name = split(split($source, '[/]')[-1], '[.]')[0] package { 'vgpu-repo': ensure => 'latest', provider => 'rpm', From 9fca4a350f69c7ebafa034b4d43f05128e11d48c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Wed, 10 Feb 2021 10:24:59 -0500 Subject: [PATCH 12/19] Add missing include for vgpu and bin --- site/profile/manifests/gpu.pp | 1 + 1 file changed, 1 insertion(+) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index 66eb2ae4d..02165172a 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -122,6 +122,7 @@ include profile::gpu::install::vgpu::rpm } elsif $installer == 'bin' { # install from binary installer + include profile::gpu::install::vgpu::bin } } From 60424786dfe84b023c43c022e6a5e1ed9e1fdcb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Wed, 10 Feb 2021 11:00:48 -0500 Subject: [PATCH 13/19] Fix jusuf-cloud gridd source key in hieradata --- data/cloud/openstack/jusuf-cloud.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cloud/openstack/jusuf-cloud.yaml b/data/cloud/openstack/jusuf-cloud.yaml index 0164bea10..a0d6fb46d 100644 --- a/data/cloud/openstack/jusuf-cloud.yaml +++ b/data/cloud/openstack/jusuf-cloud.yaml @@ -1,3 +1,3 @@ profile::gpu::install::vgpu::installer: bin profile::gpu::install::vgpu::bin::source: https://hpsrepo.fz-juelich.de/jusuf/nvidia/NVIDIA-Driver.latest -profile::gpu::install::vgpu::bin::gridd_conf: https://hpsrepo.fz-juelich.de/jusuf/nvidia/gridd.conf \ No newline at end of file +profile::gpu::install::vgpu::bin::gridd_source: https://hpsrepo.fz-juelich.de/jusuf/nvidia/gridd.conf \ No newline at end of file From 4ae44db97363a85a21b0fd037135e363c2bed37e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Wed, 10 Feb 2021 12:00:28 -0500 Subject: [PATCH 14/19] Move gpu driver dependencies higher up --- site/profile/manifests/gpu.pp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index 02165172a..2751d5548 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -15,8 +15,15 @@ } } -class profile::gpu::install { +class profile::gpu::install::deps { ensure_resource('file', '/etc/nvidia', {'ensure' => 'directory' }) + ensure_packages(['kernel-devel'], {ensure => 'installed'}) + ensure_packages(['dkms'], { + 'require' => Yumrepo['epel'] + }) +} + +class profile::gpu::install { if ! $facts['nvidia_grid_vgpu'] { require profile::gpu::install::passthrough @@ -24,11 +31,6 @@ require profile::gpu::install::vgpu } - ensure_packages(['kernel-devel'], {ensure => 'installed'}) - ensure_packages(['dkms'], { - 'require' => Yumrepo['epel'] - }) - exec { 'dkms autoinstall': path => ['/usr/bin', '/usr/sbin'], onlyif => 'dkms status | grep -v -q \'nvidia.*installed\'', @@ -81,6 +83,8 @@ } class profile::gpu::install::passthrough(Array[String] $packages) { + require profile::gpu::install::deps + $cuda_ver = $::facts['nvidia_cuda_version'] $os = "rhel${::facts['os']['release']['major']}" $arch = $::facts['os']['architecture'] @@ -118,6 +122,8 @@ Enum['rpm', 'bin', 'none'] $installer = 'none', ) { + require profile::gpu::install::deps + if $installer == 'rpm' { include profile::gpu::install::vgpu::rpm } elsif $installer == 'bin' { From d763b079dc89011a8d0d35d54d21a40d3f3920ca Mon Sep 17 00:00:00 2001 From: ocaisa Date: Wed, 10 Feb 2021 17:43:56 +0100 Subject: [PATCH 15/19] Make sure installer succeeds --- site/profile/manifests/gpu.pp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index 2751d5548..d077eac51 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -171,7 +171,7 @@ ) { exec { 'vgpu-driver-install-bin': - command => "curl -L ${source} -o /tmp/NVIDIA-driver.run && sh /tmp/NVIDIA-driver.run --ui=none --no-questions --disable-nouveau; rm /tmp/NVIDIA-driver.run", + command => "curl -L ${source} -o /tmp/NVIDIA-driver.run && sh /tmp/NVIDIA-driver.run --ui=none --no-questions --disable-nouveau && rm /tmp/NVIDIA-driver.run", path => ['/bin', '/usr/bin', '/sbin','/usr/sbin'], creates => [ '/usr/bin/nvidia-smi', From bc36283d2c925b323bb79c0d954aa147633cf827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Fri, 12 Feb 2021 08:45:10 -0500 Subject: [PATCH 16/19] Add explicit requirement on dkms and kernel-devel --- site/profile/manifests/gpu.pp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index d077eac51..f152b2478 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -15,15 +15,12 @@ } } -class profile::gpu::install::deps { +class profile::gpu::install { ensure_resource('file', '/etc/nvidia', {'ensure' => 'directory' }) ensure_packages(['kernel-devel'], {ensure => 'installed'}) ensure_packages(['dkms'], { 'require' => Yumrepo['epel'] }) -} - -class profile::gpu::install { if ! $facts['nvidia_grid_vgpu'] { require profile::gpu::install::passthrough @@ -83,7 +80,6 @@ } class profile::gpu::install::passthrough(Array[String] $packages) { - require profile::gpu::install::deps $cuda_ver = $::facts['nvidia_cuda_version'] $os = "rhel${::facts['os']['release']['major']}" @@ -122,8 +118,6 @@ Enum['rpm', 'bin', 'none'] $installer = 'none', ) { - require profile::gpu::install::deps - if $installer == 'rpm' { include profile::gpu::install::vgpu::rpm } elsif $installer == 'bin' { @@ -178,6 +172,10 @@ '/usr/bin/nvidia-modprobe', ], timeout => 300, + require => [ + Package['kernel-devel'], + Package['dkms'], + ] } file { '/etc/nvidia/gridd.conf': From 8664e648d0072da2a7ee0aecf56dd3b37b3646c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 16 Feb 2021 11:28:31 -0500 Subject: [PATCH 17/19] Do not execute dkms autoinstall with vgpu binary --- site/profile/manifests/gpu.pp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index f152b2478..0a458ad55 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -28,14 +28,21 @@ require profile::gpu::install::vgpu } - exec { 'dkms autoinstall': - path => ['/usr/bin', '/usr/sbin'], - onlyif => 'dkms status | grep -v -q \'nvidia.*installed\'', - timeout => 0, - require => [ - Package['kernel-devel'], - Package['dkms'] - ] + # Binary installer do not build drivers with DKMS + $installer = lookup('profile::gpu::install::vgpu::installer') + if ! $facts['nvidia_grid_vgpu'] or $installer != 'bin' { + exec { 'dkms autoinstall': + path => ['/usr/bin', '/usr/sbin'], + onlyif => 'dkms status | grep -v -q \'nvidia.*installed\'', + timeout => 0, + require => [ + Package['kernel-devel'], + Package['dkms'] + ] + } + $kmod_require = [Exec['dkms autoinstall']] + } else { + $kmod_require = [] } kmod::load { [ @@ -44,7 +51,7 @@ 'nvidia_modeset', 'nvidia_uvm' ]: - require => Exec['dkms autoinstall'] + require => $kmod_require } file { '/usr/lib64/nvidia': From 54ea3ec09b6868be6071040bae945724a111cccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 23 Feb 2021 11:08:50 -0500 Subject: [PATCH 18/19] Refactor nvidia symlink creation Avoid creation of broken symlink when dealing with VGPU drivers. --- site/profile/facts.d/nvidia_driver_vers.sh | 17 +---- site/profile/manifests/gpu.pp | 84 +++++++++++++++++----- 2 files changed, 70 insertions(+), 31 deletions(-) diff --git a/site/profile/facts.d/nvidia_driver_vers.sh b/site/profile/facts.d/nvidia_driver_vers.sh index 0f804a112..cc36a85b7 100755 --- a/site/profile/facts.d/nvidia_driver_vers.sh +++ b/site/profile/facts.d/nvidia_driver_vers.sh @@ -1,16 +1,5 @@ #!/bin/sh -PROCESSOR=$(uname -p) -VERSION="$(source /etc/os-release; echo $VERSION_ID)" -PACKAGE="cuda-drivers" -PACKAGE_REGEX="${PACKAGE}-\([0-9.]\{1,\}\)[-0-9]*\.${PROCESSOR}" -DRIVER_VERSION=$(test -f /usr/sbin/dkms && /usr/sbin/dkms status | grep -m 1 -Po 'nvidia, \K(\d+.\d+[\.]\d*)') -# If that didn't work let's try nvidia-smi -if [ -z $DRIVER_VERSION ]; then - DRIVER_VERSION=$(if [ -x "$(command -v nvidia-smi)" ]; then nvidia-smi --query-gpu=driver_version --format=csv,noheader; fi) +if [ -e /proc/driver/nvidia ]; then + DRIVER_VERSION=$(grep -m 1 -Po 'NVRM version:.* \K(\d+\.\d+\.\d+)' /proc/driver/nvidia/version) fi -if [ -z $DRIVER_VERSION ]; then - BASE_URL="http://developer.download.nvidia.com/compute/cuda/repos" - CUDA_REPO_GZ=$(curl -s ${BASE_URL}/rhel${VERSION}/${PROCESSOR}/repodata/repomd.xml | sed '2 s/xmlns=".*"//g' | xmllint --xpath 'string(/repomd/data[@type="primary"]/location/@href)' -) - DRIVER_VERSION=$(curl -s ${BASE_URL}/rhel${VERSION}/${PROCESSOR}/${CUDA_REPO_GZ} | gunzip | sed -n "s/^.*\"${PACKAGE_REGEX}\.rpm\".*$/\1/p" | sort -V | tail -n1) -fi -echo "{ 'nvidia_driver_version' : '${DRIVER_VERSION}' }" +echo "{ 'nvidia_driver_version' : '${DRIVER_VERSION}' }" \ No newline at end of file diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index 0a458ad55..cac364024 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -58,24 +58,32 @@ ensure => directory } - $driver_ver = $::facts['nvidia_driver_version'] $nvidia_libs = [ - "libnvidia-ml.so.${driver_ver}", 'libnvidia-ml.so.1', 'libnvidia-fbc.so.1', - "libnvidia-fbc.so.${driver_ver}", 'libnvidia-ifr.so.1', "libnvidia-ifr.so.${driver_ver}", - 'libcuda.so', 'libcuda.so.1', "libcuda.so.${driver_ver}", "libnvcuvid.so.${driver_ver}", - 'libnvcuvid.so.1', "libnvidia-compiler.so.${driver_ver}", 'libnvidia-encode.so.1', - "libnvidia-encode.so.${driver_ver}", "libnvidia-fatbinaryloader.so.${driver_ver}", - 'libnvidia-opencl.so.1', "libnvidia-opencl.so.${driver_ver}", 'libnvidia-opticalflow.so.1', - "libnvidia-opticalflow.so.${driver_ver}", 'libnvidia-ptxjitcompiler.so.1', "libnvidia-ptxjitcompiler.so.${driver_ver}", - 'libnvcuvid.so', 'libnvidia-cfg.so', 'libnvidia-encode.so', - 'libnvidia-fbc.so', 'libnvidia-ifr.so', 'libnvidia-ml.so', - 'libnvidia-ptxjitcompiler.so', 'libEGL_nvidia.so.0', "libEGL_nvidia.so.${driver_ver}", - 'libGLESv1_CM_nvidia.so.1', "libGLESv1_CM_nvidia.so.${driver_ver}", 'libGLESv2_nvidia.so.2', - "libGLESv2_nvidia.so.${driver_ver}", 'libGLX_indirect.so.0', 'libGLX_nvidia.so.0', - "libGLX_nvidia.so.${driver_ver}", "libnvidia-cbl.so.${driver_ver}", 'libnvidia-cfg.so.1', - "libnvidia-cfg.so.${driver_ver}", "libnvidia-eglcore.so.${driver_ver}", "libnvidia-glcore.so.${driver_ver}", - "libnvidia-glsi.so.${driver_ver}", "libnvidia-glvkspirv.so.${driver_ver}", "libnvidia-rtcore.so.${driver_ver}", - "libnvidia-tls.so.${driver_ver}", 'libnvoptix.so.1', "libnvoptix.so.${driver_ver}"] + 'libcuda.so.1', + 'libcuda.so', + 'libEGL_nvidia.so.0', + 'libGLESv1_CM_nvidia.so.1', + 'libGLESv2_nvidia.so.2', + 'libGLX_indirect.so.0', + 'libGLX_nvidia.so.0', + 'libnvcuvid.so.1', + 'libnvcuvid.so', + 'libnvidia-cfg.so.1', + 'libnvidia-cfg.so', + 'libnvidia-encode.so.1', + 'libnvidia-encode.so', + 'libnvidia-fbc.so.1', + 'libnvidia-fbc.so', + 'libnvidia-ifr.so.1', + 'libnvidia-ifr.so', + 'libnvidia-ml.so.1', + 'libnvidia-ml.so', + 'libnvidia-opencl.so.1', + 'libnvidia-opticalflow.so.1', + 'libnvidia-ptxjitcompiler.so.1', + 'libnvidia-ptxjitcompiler.so', + 'libnvoptix.so.1', + ] $nvidia_libs.each |String $lib| { file { "/usr/lib64/nvidia/${lib}": @@ -84,6 +92,48 @@ seltype => 'lib_t' } } + + # WARNING : since the fact is computed before Puppet agent run, + # on a clean host, the symbolic links to the NVIDIA libraries + # that include the version number will be created on the + # second Puppet run only. + $driver_vers = $::facts['nvidia_driver_version'] + if $driver_vers != '' { + $nvidia_libs_vers = [ + "libcuda.so.${driver_vers}", + "libEGL_nvidia.so.${driver_vers}", + "libGLESv1_CM_nvidia.so.${driver_vers}", + "libGLESv2_nvidia.so.${driver_vers}", + "libGLX_nvidia.so.${driver_vers}", + "libnvcuvid.so.${driver_vers}", + "libnvidia-cbl.so.${driver_vers}", + "libnvidia-cfg.so.${driver_vers}", + "libnvidia-compiler.so.${driver_vers}", + "libnvidia-eglcore.so.${driver_vers}", + "libnvidia-encode.so.${driver_vers}", + "libnvidia-fatbinaryloader.so.${driver_vers}", + "libnvidia-fbc.so.${driver_vers}", + "libnvidia-glcore.so.${driver_vers}", + "libnvidia-glsi.so.${driver_vers}", + "libnvidia-glvkspirv.so.${driver_vers}", + "libnvidia-ifr.so.${driver_vers}", + "libnvidia-ml.so.${driver_vers}", + "libnvidia-opencl.so.${driver_vers}", + "libnvidia-opticalflow.so.${driver_vers}", + "libnvidia-ptxjitcompiler.so.${driver_vers}", + "libnvidia-rtcore.so.${driver_vers}", + "libnvidia-tls.so.${driver_vers}", + "libnvoptix.so.${driver_vers}" + ] + + $nvidia_libs_vers.each |String $lib| { + file { "/usr/lib64/nvidia/${lib}": + ensure => link, + target => "/usr/lib64/${lib}", + seltype => 'lib_t' + } + } + } } class profile::gpu::install::passthrough(Array[String] $packages) { From 9e8bbc918cad40193680d439e76be187fa4efdb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Fortin?= Date: Tue, 23 Feb 2021 14:19:49 -0500 Subject: [PATCH 19/19] Make gpu symlink location configurable --- data/software_stack/computecanada.yaml | 2 + data/software_stack/eessi.yaml | 2 + site/profile/manifests/gpu.pp | 146 +++++++++++++------------ 3 files changed, 79 insertions(+), 71 deletions(-) diff --git a/data/software_stack/computecanada.yaml b/data/software_stack/computecanada.yaml index 040ec6213..57fe1b01a 100644 --- a/data/software_stack/computecanada.yaml +++ b/data/software_stack/computecanada.yaml @@ -22,3 +22,5 @@ profile::squid::server::cvmfs_acl_regex: - '^(cvmfs-.*\.computecanada\.net)$' - '^(.*-cvmfs\.openhtc\.io)$' - '^(cvmfs-.*\.genap\.ca)$' + +profile::gpu::install::lib_symlink_path: '/usr/lib64/nvidia' diff --git a/data/software_stack/eessi.yaml b/data/software_stack/eessi.yaml index fba8f6860..ee1586ed4 100644 --- a/data/software_stack/eessi.yaml +++ b/data/software_stack/eessi.yaml @@ -24,3 +24,5 @@ profile::cvmfs::client::lmod_default_modules: profile::squid::server::cvmfs_acl_regex: - '^(.*\.eessi-hpc\.org)$' + +profile::gpu::install::lib_symlink_path: '/opt/eessi/lib' diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index cac364024..a911e3207 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -15,7 +15,9 @@ } } -class profile::gpu::install { +class profile::gpu::install ( + String $lib_symlink_path = undef +) { ensure_resource('file', '/etc/nvidia', {'ensure' => 'directory' }) ensure_packages(['kernel-devel'], {ensure => 'installed'}) ensure_packages(['dkms'], { @@ -53,86 +55,88 @@ ]: require => $kmod_require } - - file { '/usr/lib64/nvidia': - ensure => directory - } - - $nvidia_libs = [ - 'libcuda.so.1', - 'libcuda.so', - 'libEGL_nvidia.so.0', - 'libGLESv1_CM_nvidia.so.1', - 'libGLESv2_nvidia.so.2', - 'libGLX_indirect.so.0', - 'libGLX_nvidia.so.0', - 'libnvcuvid.so.1', - 'libnvcuvid.so', - 'libnvidia-cfg.so.1', - 'libnvidia-cfg.so', - 'libnvidia-encode.so.1', - 'libnvidia-encode.so', - 'libnvidia-fbc.so.1', - 'libnvidia-fbc.so', - 'libnvidia-ifr.so.1', - 'libnvidia-ifr.so', - 'libnvidia-ml.so.1', - 'libnvidia-ml.so', - 'libnvidia-opencl.so.1', - 'libnvidia-opticalflow.so.1', - 'libnvidia-ptxjitcompiler.so.1', - 'libnvidia-ptxjitcompiler.so', - 'libnvoptix.so.1', - ] - - $nvidia_libs.each |String $lib| { - file { "/usr/lib64/nvidia/${lib}": - ensure => link, - target => "/usr/lib64/${lib}", - seltype => 'lib_t' + if $lib_symlink_path { + $lib_symlink_path_split = split($lib_symlink_path, '/') + $lib_symlink_path_split[1,-1].each |Integer $index, String $value| { + ensure_resource('file', join($lib_symlink_path_split[0, $index+2], '/'), {'ensure' => 'directory'}) } - } - # WARNING : since the fact is computed before Puppet agent run, - # on a clean host, the symbolic links to the NVIDIA libraries - # that include the version number will be created on the - # second Puppet run only. - $driver_vers = $::facts['nvidia_driver_version'] - if $driver_vers != '' { - $nvidia_libs_vers = [ - "libcuda.so.${driver_vers}", - "libEGL_nvidia.so.${driver_vers}", - "libGLESv1_CM_nvidia.so.${driver_vers}", - "libGLESv2_nvidia.so.${driver_vers}", - "libGLX_nvidia.so.${driver_vers}", - "libnvcuvid.so.${driver_vers}", - "libnvidia-cbl.so.${driver_vers}", - "libnvidia-cfg.so.${driver_vers}", - "libnvidia-compiler.so.${driver_vers}", - "libnvidia-eglcore.so.${driver_vers}", - "libnvidia-encode.so.${driver_vers}", - "libnvidia-fatbinaryloader.so.${driver_vers}", - "libnvidia-fbc.so.${driver_vers}", - "libnvidia-glcore.so.${driver_vers}", - "libnvidia-glsi.so.${driver_vers}", - "libnvidia-glvkspirv.so.${driver_vers}", - "libnvidia-ifr.so.${driver_vers}", - "libnvidia-ml.so.${driver_vers}", - "libnvidia-opencl.so.${driver_vers}", - "libnvidia-opticalflow.so.${driver_vers}", - "libnvidia-ptxjitcompiler.so.${driver_vers}", - "libnvidia-rtcore.so.${driver_vers}", - "libnvidia-tls.so.${driver_vers}", - "libnvoptix.so.${driver_vers}" + $nvidia_libs = [ + 'libcuda.so.1', + 'libcuda.so', + 'libEGL_nvidia.so.0', + 'libGLESv1_CM_nvidia.so.1', + 'libGLESv2_nvidia.so.2', + 'libGLX_indirect.so.0', + 'libGLX_nvidia.so.0', + 'libnvcuvid.so.1', + 'libnvcuvid.so', + 'libnvidia-cfg.so.1', + 'libnvidia-cfg.so', + 'libnvidia-encode.so.1', + 'libnvidia-encode.so', + 'libnvidia-fbc.so.1', + 'libnvidia-fbc.so', + 'libnvidia-ifr.so.1', + 'libnvidia-ifr.so', + 'libnvidia-ml.so.1', + 'libnvidia-ml.so', + 'libnvidia-opencl.so.1', + 'libnvidia-opticalflow.so.1', + 'libnvidia-ptxjitcompiler.so.1', + 'libnvidia-ptxjitcompiler.so', + 'libnvoptix.so.1', ] - $nvidia_libs_vers.each |String $lib| { + $nvidia_libs.each |String $lib| { file { "/usr/lib64/nvidia/${lib}": ensure => link, target => "/usr/lib64/${lib}", seltype => 'lib_t' } } + + # WARNING : since the fact is computed before Puppet agent run, + # on a clean host, the symbolic links to the NVIDIA libraries + # that include the version number will be created on the + # second Puppet run only. + $driver_vers = $::facts['nvidia_driver_version'] + if $driver_vers != '' { + $nvidia_libs_vers = [ + "libcuda.so.${driver_vers}", + "libEGL_nvidia.so.${driver_vers}", + "libGLESv1_CM_nvidia.so.${driver_vers}", + "libGLESv2_nvidia.so.${driver_vers}", + "libGLX_nvidia.so.${driver_vers}", + "libnvcuvid.so.${driver_vers}", + "libnvidia-cbl.so.${driver_vers}", + "libnvidia-cfg.so.${driver_vers}", + "libnvidia-compiler.so.${driver_vers}", + "libnvidia-eglcore.so.${driver_vers}", + "libnvidia-encode.so.${driver_vers}", + "libnvidia-fatbinaryloader.so.${driver_vers}", + "libnvidia-fbc.so.${driver_vers}", + "libnvidia-glcore.so.${driver_vers}", + "libnvidia-glsi.so.${driver_vers}", + "libnvidia-glvkspirv.so.${driver_vers}", + "libnvidia-ifr.so.${driver_vers}", + "libnvidia-ml.so.${driver_vers}", + "libnvidia-opencl.so.${driver_vers}", + "libnvidia-opticalflow.so.${driver_vers}", + "libnvidia-ptxjitcompiler.so.${driver_vers}", + "libnvidia-rtcore.so.${driver_vers}", + "libnvidia-tls.so.${driver_vers}", + "libnvoptix.so.${driver_vers}" + ] + + $nvidia_libs_vers.each |String $lib| { + file { "/usr/lib64/nvidia/${lib}": + ensure => link, + target => "/usr/lib64/${lib}", + seltype => 'lib_t' + } + } + } } }