From 4ff57ba237c9d6775b213ef25e4aed0d6031eaf3 Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Thu, 2 Mar 2023 17:31:18 +0000 Subject: [PATCH 1/4] cleanup the code * Make the code consistent between classes. * Improve docstrings, descriptions, and variable names. * Pin ops to 1.5.4 for compatibility with python3.6.x. --- dispatch | 2 +- metadata.yaml | 5 +- requirements.txt | 2 +- src/nvidia_ops_manager.py | 125 +++++++++++++++++--------------------- 4 files changed, 61 insertions(+), 73 deletions(-) diff --git a/dispatch b/dispatch index d04334c..c04f605 100755 --- a/dispatch +++ b/dispatch @@ -1,5 +1,5 @@ #!/bin/bash -# This hook installs the centos dependencies needed to run the charm code. +# Install the centos dependencies needed to run the charm code. set -e diff --git a/metadata.yaml b/metadata.yaml index 2d63edd..a3ab7b4 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -10,9 +10,8 @@ maintainers: - OmniVector Solutions description: | - The Nvidia Operator Charm will install nvidia drivers - to the underlying operating system upon being related to a primary - charm via the juju-info integration. + This charm will install nvidia drivers for centos7 and ubuntu + 20.04 & 22.04 operating systems. 
tags: - hpc diff --git a/requirements.txt b/requirements.txt index 73c1f1a..00acfe9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -ops +ops==1.5.4 requests diff --git a/src/nvidia_ops_manager.py b/src/nvidia_ops_manager.py index 4290af8..fe9638b 100644 --- a/src/nvidia_ops_manager.py +++ b/src/nvidia_ops_manager.py @@ -3,7 +3,6 @@ import tempfile from pathlib import Path from subprocess import CalledProcessError, check_output, run -from typing import List import requests @@ -11,7 +10,9 @@ def os_release(): """Return /etc/os-release as a dict.""" os_release_data = Path("/etc/os-release").read_text() - os_release_list = [item.split("=") for item in os_release_data.strip().split("\n")] + os_release_list = [ + item.split("=") for item in os_release_data.strip().split("\n") if item != "" + ] return {k: v.strip('"') for k, v in os_release_list} @@ -65,17 +66,11 @@ class NvidiaOpsManagerUbuntu(NvidiaOpsManagerBase): OS_RELEASE = os_release() - def __init__(self): + def __init__(self, driver_package: str = "cuda-drivers"): + self._driver_package = driver_package self._id = self.OS_RELEASE["ID"] self._version_id = self.OS_RELEASE["VERSION_ID"].replace(".", "") self._distribution = f"{self._id}{self._version_id}" - self._cuda_keyring_url = ( - "https://developer.download.nvidia.com/compute/cuda/" - f"repos/{self._distribution}/{self._arch}/cuda-keyring_1.0-1_all.deb" - ) - self._cuda_sources_list = Path( - f"/etc/apt/sources.list.d/cuda-{self._distribution}-{self._arch}.list" - ) def _install_kernel_headers(self) -> None: """Install the kernel headers.""" @@ -86,13 +81,17 @@ def _install_kernel_headers(self) -> None: def _install_cuda_keyring(self) -> None: """Install the cuda keyring .deb.""" + # Grab the cuda-keyring.deb from the url. 
try: - r = requests.get(self._cuda_keyring_url) + r = requests.get( + "https://developer.download.nvidia.com/compute/cuda/" + f"repos/{self._distribution}/{self._arch}/cuda-keyring_1.0-1_all.deb" + ) except requests.exceptions.HTTPError: raise NvidiaDriverOpsError( f"Error downloading cuda keyring from {self._cuda_keyring_url}" ) - + # Write the cuda-keyring.deb to a tmp file and install it with dpkg. with tempfile.TemporaryDirectory() as tmpdir: cuda_keyring_deb = f"{tmpdir}/cuda-keyring.deb" Path(cuda_keyring_deb).write_bytes(r.content) @@ -100,15 +99,16 @@ def _install_cuda_keyring(self) -> None: run(["dpkg", "-i", cuda_keyring_deb]) except CalledProcessError: raise NvidiaDriverOpsError("Error installing cuda keyring .deb.") - try: - run(["apt-get", "update"]) - except CalledProcessError: - raise NvidiaDriverOpsError("Error running `apt-get update`.") + # Run apt-get update + try: + run(["apt-get", "update"]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error running `apt-get update`.") def _install_cuda_drivers(self) -> None: """Install the cuda drivers.""" try: - run(["apt-get", "install", "-y", "cuda-drivers"]) + run(["apt-get", "install", "-y", self._driver_package]) except CalledProcessError: raise NvidiaDriverOpsError("Error installing cuda drivers.") @@ -121,11 +121,11 @@ def install(self) -> None: def remove(self) -> None: """Remove cuda drivers from the os.""" try: - run(["apt-get", "-y", "remove", "--purge", "cuda-drivers"]) + run(["apt-get", "-y", "remove", "--purge", self._driver_package]) except CalledProcessError: raise NvidiaDriverOpsError("Error removing cuda-drivers.") - self._cuda_sources_list.unlink() + Path(f"/etc/apt/sources.list.d/cuda-{self._distribution}-{self._arch}.list").unlink() try: run(["apt-get", "update"]) @@ -135,7 +135,7 @@ def remove(self) -> None: def version(self) -> str: """Return the cuda-drivers package version.""" try: - p = check_output(["apt-cache", "policy", "cuda-drivers"]) + p = 
check_output(["apt-cache", "policy", self._driver_package]) except CalledProcessError: raise NvidiaDriverOpsError("Error running `apt-cache policy cuda-drivers.") @@ -153,7 +153,17 @@ class NvidiaOpsManagerCentos(NvidiaOpsManagerBase): def __init__(self, driver_package): """Initialize class level variables.""" - self.PACKAGE_DEPS = [ + self._driver_package = driver_package + self._nvidia_driver_repo_filepath = Path("/etc/yum.repos.d/cuda-rhel7.repo") + + def install(self) -> None: + """Install nvidia drivers. + + Install the Nvidia drivers as defined in the Nvidia documentation: + https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#centos7 + """ + # Install Nvidia driver dependencies. + deps = [ "tar", "bzip2", "make", @@ -169,57 +179,47 @@ def __init__(self, driver_package): "bind-utils", "wget", ] - self.EPEL_RELEASE_REPO = ( - "https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm" - ) - self.NVIDIA_DRIVER_PACKAGE = driver_package - self.NVIDIA_DRIVER_REPO_FILEPATH = Path("/etc/yum.repos.d/cuda-rhel7.repo") + try: + run(["yum", "install", "-y"] + deps) + except CalledProcessError: + raise NvidiaDriverOpsError("Error installing driver dependencies.") - @property - def _nvidia_developer_repo(self) -> str: - """Generate and return the Nvidia developer repo url.""" - return ( + # Grab the repo file and write it to the /etc/yum.repos.d/. + nvidia_developer_repo = ( "http://developer.download.nvidia.com/compute/cuda/repos/rhel7/" f"{self._arch}/cuda-rhel7.repo" ) - - @property - def _kernel_packages(self) -> List: - """Return the appropriate kernel devel and header packages for the current kernel.""" - return [f"kernel-devel-{self._uname_r}", f"kernel-headers-{self._uname_r}"] - - def install(self) -> None: - """Install nvidia drivers. 
- - Install the Nvidia drivers as defined in the Nvidia documentation: - https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#centos7 - """ - # Install Nvidia driver dependencies. try: - run(["yum", "install", "-y"] + self.PACKAGE_DEPS) - except CalledProcessError: - raise NvidiaDriverOpsError("Error installing driver dependencies.") - # Grab the correct repo file and write it to the /etc/yum.repos.d/. - try: - req = requests.get(self._nvidia_developer_repo) + req = requests.get(nvidia_developer_repo) except requests.exceptions.HTTPError: raise NvidiaDriverOpsError( - f"Error getting nvidia_developer_repository from {self._nvidia_developer_repo}." + f"Error getting nvidia_developer_repository from {nvidia_developer_repo}." ) - self.NVIDIA_DRIVER_REPO_FILEPATH.write_text(req.text) + self._nvidia_driver_repo_filepath.write_text(req.text) + # Add the devel kernel and kernel headers. try: - run(["yum", "install", "-y"] + self._kernel_packages) + run( + [ + "yum", + "install", + "-y", + f"kernel-devel-{self._uname_r}", + f"kernel-headers-{self._uname_r}", + ] + ) except CalledProcessError: raise NvidiaDriverOpsError("Error installing devel kernel headers.") + # Expire the cache and update repos. try: run(["yum", "clean", "expire-cache"]) except CalledProcessError: raise NvidiaDriverOpsError("Error flushing the cache.") + # Install nvidia-driver package.. try: - run(["yum", "install", "-y", self.NVIDIA_DRIVER_PACKAGE]) + run(["yum", "install", "-y", self._driver_package]) except CalledProcessError: raise NvidiaDriverOpsError("Error installing nvidia drivers.") @@ -227,28 +227,17 @@ def remove(self) -> None: """Remove nvidia drivers from the system.""" # Remove nvidia-driver package.. 
try: - run(["yum", "erase", "-y", self.NVIDIA_DRIVER_PACKAGE]) + run(["yum", "erase", "-y", self._driver_package]) except CalledProcessError: raise NvidiaDriverOpsError("Error removing nvidia drivers from the system.") # Remove the drivers repo - if self.NVIDIA_DRIVER_REPO_FILEPATH.exists(): - self.NVIDIA_DRIVER_REPO_FILEPATH.unlink() + if self._nvidia_driver_repo_filepath.exists(): + self._nvidia_driver_repo_filepath.unlink() # Expire the cache and update repos. try: run(["yum", "clean", "expire-cache"]) except CalledProcessError: raise NvidiaDriverOpsError("Error flushing the cache.") - # Remove the devel kernel and kernel headers. - try: - run(["yum", "erase", "-y"] + self._kernel_packages) - except CalledProcessError: - raise NvidiaDriverOpsError("Error removing devel kernel headers.") - # Remove Nvidia driver dependencies. - for i in self.PACKAGE_DEPS: - try: - run(["yum", "erase", "-y", i]) - except CalledProcessError: - raise NvidiaDriverOpsError(f"Error removing {i}.") def version(self): """Return the version of nvidia-driver-latest-dkms.""" @@ -260,7 +249,7 @@ def version(self): "-q", "--queryformat", "'%{VERSION}'", - self.NVIDIA_DRIVER_PACKAGE, + self._driver_package, ] ) except CalledProcessError: From db2bcbbbefea2929184f9ba9fb97a263b07b1e13 Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Thu, 2 Mar 2023 17:48:13 +0000 Subject: [PATCH 2/4] create separate configs for centos and ubuntu driver names --- config.yaml | 11 ++++++++--- src/charm.py | 8 ++++++-- src/nvidia_ops_manager.py | 8 ++++---- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/config.yaml b/config.yaml index cd71b55..eee88db 100644 --- a/config.yaml +++ b/config.yaml @@ -1,6 +1,11 @@ options: - driver-package: + centos-driver-package: type: string - default: "nvidia-driver-latest-dkms" + default: "" description: | - Driver package to be installed (centos only). + Driver package to be installed on centos7. Default is `nvidia-driver-latest-dkms`. 
+ ubuntu-driver-package: + type: string + default: "" + description: | + Driver package to be installed on ubuntu. Default is `cuda-drivers`. diff --git a/src/charm.py b/src/charm.py index bd0f364..5de6d15 100755 --- a/src/charm.py +++ b/src/charm.py @@ -23,9 +23,13 @@ def __init__(self, *args): super().__init__(*args) if os_release()["ID"] == "ubuntu": - self._nvidia_ops_manager = NvidiaOpsManagerUbuntu() + self._nvidia_ops_manager = NvidiaOpsManagerUbuntu( + self.config.get("ubuntu-driver-package") + ) else: - self._nvidia_ops_manager = NvidiaOpsManagerCentos(self.config.get("driver-package")) + self._nvidia_ops_manager = NvidiaOpsManagerCentos( + self.config.get("centos-driver-package") + ) event_handler_bindings = { self.on.install: self._on_install, diff --git a/src/nvidia_ops_manager.py b/src/nvidia_ops_manager.py index fe9638b..3394f95 100644 --- a/src/nvidia_ops_manager.py +++ b/src/nvidia_ops_manager.py @@ -66,8 +66,8 @@ class NvidiaOpsManagerUbuntu(NvidiaOpsManagerBase): OS_RELEASE = os_release() - def __init__(self, driver_package: str = "cuda-drivers"): - self._driver_package = driver_package + def __init__(self, driver_package: str = ""): + self._driver_package = driver_package if driver_package else "cuda-drivers" self._id = self.OS_RELEASE["ID"] self._version_id = self.OS_RELEASE["VERSION_ID"].replace(".", "") self._distribution = f"{self._id}{self._version_id}" @@ -151,9 +151,9 @@ def version(self) -> str: class NvidiaOpsManagerCentos(NvidiaOpsManagerBase): """NvidiaOpsManager for Centos7.""" - def __init__(self, driver_package): + def __init__(self, driver_package: str = ""): """Initialize class level variables.""" - self._driver_package = driver_package + self._driver_package = driver_package if driver_package else "nvidia-driver-latest-dkms" self._nvidia_driver_repo_filepath = Path("/etc/yum.repos.d/cuda-rhel7.repo") def install(self) -> None: From e6c3fdff505eaf3cfc92f2d0aaa7b619d6f39d34 Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Thu, 2 Mar 
2023 18:15:09 +0000 Subject: [PATCH 3/4] add driver-package defaults to config.yaml --- config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.yaml b/config.yaml index eee88db..8882572 100644 --- a/config.yaml +++ b/config.yaml @@ -1,11 +1,11 @@ options: centos-driver-package: type: string - default: "" + default: "nvidia-driver-latest-dkms" description: | Driver package to be installed on centos7. Default is `nvidia-driver-latest-dkms`. ubuntu-driver-package: type: string - default: "" + default: "cuda-drivers" description: | Driver package to be installed on ubuntu. Default is `cuda-drivers`. From 9d85779fafe27034d6b0243a32f48865f23ca4e9 Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Thu, 2 Mar 2023 20:33:29 +0000 Subject: [PATCH 4/4] remove duplicate specification of driver package --- src/nvidia_ops_manager.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nvidia_ops_manager.py b/src/nvidia_ops_manager.py index 3394f95..6516029 100644 --- a/src/nvidia_ops_manager.py +++ b/src/nvidia_ops_manager.py @@ -66,8 +66,8 @@ class NvidiaOpsManagerUbuntu(NvidiaOpsManagerBase): OS_RELEASE = os_release() - def __init__(self, driver_package: str = ""): - self._driver_package = driver_package if driver_package else "cuda-drivers" + def __init__(self, driver_package: str): + self._driver_package = driver_package self._id = self.OS_RELEASE["ID"] self._version_id = self.OS_RELEASE["VERSION_ID"].replace(".", "") self._distribution = f"{self._id}{self._version_id}" @@ -151,9 +151,9 @@ def version(self) -> str: class NvidiaOpsManagerCentos(NvidiaOpsManagerBase): """NvidiaOpsManager for Centos7.""" - def __init__(self, driver_package: str = ""): + def __init__(self, driver_package: str): """Initialize class level variables.""" - self._driver_package = driver_package if driver_package else "nvidia-driver-latest-dkms" + self._driver_package = driver_package self._nvidia_driver_repo_filepath = 
Path("/etc/yum.repos.d/cuda-rhel7.repo") def install(self) -> None: