Skip to content

Commit

Permalink
Merge pull request #3 from jamesbeedy/cleanups
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesbeedy authored Mar 2, 2023
2 parents 6a8a755 + 9d85779 commit 3c399ed
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 78 deletions.
9 changes: 7 additions & 2 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
options:
driver-package:
centos-driver-package:
type: string
default: "nvidia-driver-latest-dkms"
description: |
Driver package to be installed (centos only).
Driver package to be installed on centos7. Default is `nvidia-driver-latest-dkms`.
ubuntu-driver-package:
type: string
default: "cuda-drivers"
description: |
Driver package to be installed on ubuntu. Default is `cuda-drivers`.
2 changes: 1 addition & 1 deletion dispatch
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# This hook installs the centos dependencies needed to run the charm code.
# Install the centos dependencies needed to run the charm code.

set -e

Expand Down
5 changes: 2 additions & 3 deletions metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ maintainers:
- OmniVector Solutions <[email protected]>

description: |
The Nvidia Operator Charm will install nvidia drivers
to the underlying operating system upon being related to a primary
charm via the juju-info integration.
This charm will install nvidia drivers for centos7 and ubuntu
20.04 & 22.04 operating systems.
tags:
- hpc
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ops
ops==1.5.4
requests
8 changes: 6 additions & 2 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,13 @@ def __init__(self, *args):
super().__init__(*args)

if os_release()["ID"] == "ubuntu":
self._nvidia_ops_manager = NvidiaOpsManagerUbuntu()
self._nvidia_ops_manager = NvidiaOpsManagerUbuntu(
self.config.get("ubuntu-driver-package")
)
else:
self._nvidia_ops_manager = NvidiaOpsManagerCentos(self.config.get("driver-package"))
self._nvidia_ops_manager = NvidiaOpsManagerCentos(
self.config.get("centos-driver-package")
)

event_handler_bindings = {
self.on.install: self._on_install,
Expand Down
127 changes: 58 additions & 69 deletions src/nvidia_ops_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
import tempfile
from pathlib import Path
from subprocess import CalledProcessError, check_output, run
from typing import List

import requests


def os_release(path="/etc/os-release"):
    """Return the contents of an os-release file as a dict.

    Args:
        path: Location of the os-release file. Defaults to
            ``/etc/os-release`` so existing zero-argument callers are
            unaffected.

    Returns:
        A dict mapping each ``KEY`` to its value with surrounding quotes
        removed. Blank lines, comment lines and lines without an ``=`` are
        ignored.
    """
    release = {}
    for raw_line in Path(path).read_text().splitlines():
        line = raw_line.strip()
        # Blank lines and "#" comments carry no assignment.
        if not line or line.startswith("#"):
            continue
        # Split on the first "=" only: values such as URLs may contain "=".
        key, sep, value = line.partition("=")
        if not sep:
            continue
        release[key] = value.strip("\"'")
    return release


Expand Down Expand Up @@ -65,17 +66,11 @@ class NvidiaOpsManagerUbuntu(NvidiaOpsManagerBase):

OS_RELEASE = os_release()

def __init__(self, driver_package: str = "cuda-drivers"):
    """Initialize the Ubuntu ops manager.

    Args:
        driver_package: Name of the apt package that provides the driver.
            Defaults to ``cuda-drivers`` so that callers constructing the
            manager without arguments keep working.
    """
    self._driver_package = driver_package
    self._id = self.OS_RELEASE["ID"]
    # VERSION_ID has its dots removed (e.g. "22.04" -> "2204") before it
    # is joined with the ID to form the distribution name.
    self._version_id = self.OS_RELEASE["VERSION_ID"].replace(".", "")
    self._distribution = f"{self._id}{self._version_id}"
    # NOTE(review): self._arch is not set here; presumably it is provided
    # by the base class -- confirm.
    self._cuda_keyring_url = (
        "https://developer.download.nvidia.com/compute/cuda/"
        f"repos/{self._distribution}/{self._arch}/cuda-keyring_1.0-1_all.deb"
    )
    self._cuda_sources_list = Path(
        f"/etc/apt/sources.list.d/cuda-{self._distribution}-{self._arch}.list"
    )

def _install_kernel_headers(self) -> None:
"""Install the kernel headers."""
Expand All @@ -86,29 +81,34 @@ def _install_kernel_headers(self) -> None:

def _install_cuda_keyring(self) -> None:
    """Download and install the cuda keyring .deb, then refresh the apt cache.

    Raises:
        NvidiaDriverOpsError: If the download fails or returns an HTTP
            error status, or if ``dpkg -i`` or ``apt-get update`` exits
            non-zero.
    """
    cuda_keyring_url = (
        "https://developer.download.nvidia.com/compute/cuda/"
        f"repos/{self._distribution}/{self._arch}/cuda-keyring_1.0-1_all.deb"
    )

    # Grab the cuda-keyring.deb from the url.
    try:
        r = requests.get(cuda_keyring_url, timeout=60)
        # requests only raises on 4xx/5xx responses when asked to.
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise NvidiaDriverOpsError(
            f"Error downloading cuda keyring from {cuda_keyring_url}"
        ) from e

    # Write the cuda-keyring.deb to a tmp file and install it with dpkg.
    with tempfile.TemporaryDirectory() as tmpdir:
        cuda_keyring_deb = f"{tmpdir}/cuda-keyring.deb"
        Path(cuda_keyring_deb).write_bytes(r.content)
        try:
            # check=True is required for CalledProcessError to be raised.
            run(["dpkg", "-i", cuda_keyring_deb], check=True)
        except CalledProcessError as e:
            raise NvidiaDriverOpsError("Error installing cuda keyring .deb.") from e

    # Run apt-get update
    try:
        run(["apt-get", "update"], check=True)
    except CalledProcessError as e:
        raise NvidiaDriverOpsError("Error running `apt-get update`.") from e

def _install_cuda_drivers(self) -> None:
    """Install the configured cuda driver package with apt-get.

    Raises:
        NvidiaDriverOpsError: If ``apt-get install`` exits non-zero.
    """
    try:
        # check=True is required for CalledProcessError to be raised.
        run(["apt-get", "install", "-y", self._driver_package], check=True)
    except CalledProcessError as e:
        raise NvidiaDriverOpsError("Error installing cuda drivers.") from e

Expand All @@ -121,11 +121,11 @@ def install(self) -> None:
def remove(self) -> None:
"""Remove cuda drivers from the os."""
try:
run(["apt-get", "-y", "remove", "--purge", "cuda-drivers"])
run(["apt-get", "-y", "remove", "--purge", self._driver_package])
except CalledProcessError:
raise NvidiaDriverOpsError("Error removing cuda-drivers.")

self._cuda_sources_list.unlink()
Path(f"/etc/apt/sources.list.d/cuda-{self._distribution}-{self._arch}.list").unlink()

try:
run(["apt-get", "update"])
Expand All @@ -135,7 +135,7 @@ def remove(self) -> None:
def version(self) -> str:
"""Return the cuda-drivers package version."""
try:
p = check_output(["apt-cache", "policy", "cuda-drivers"])
p = check_output(["apt-cache", "policy", self._driver_package])
except CalledProcessError:
raise NvidiaDriverOpsError("Error running `apt-cache policy cuda-drivers.")

Expand All @@ -151,9 +151,19 @@ def version(self) -> str:
class NvidiaOpsManagerCentos(NvidiaOpsManagerBase):
"""NvidiaOpsManager for Centos7."""

def __init__(self, driver_package: str):
    """Initialize instance level variables.

    Args:
        driver_package: Name of the yum package that provides the driver.
    """
    self._driver_package = driver_package
    # Repo file written by install() and deleted again by remove().
    self._nvidia_driver_repo_filepath = Path("/etc/yum.repos.d/cuda-rhel7.repo")

def install(self) -> None:
"""Install nvidia drivers.
Install the Nvidia drivers as defined in the Nvidia documentation:
https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#centos7
"""
# Install Nvidia driver dependencies.
deps = [
"tar",
"bzip2",
"make",
Expand All @@ -169,86 +179,65 @@ def __init__(self, driver_package):
"bind-utils",
"wget",
]
self.EPEL_RELEASE_REPO = (
"https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm"
)
self.NVIDIA_DRIVER_PACKAGE = driver_package
self.NVIDIA_DRIVER_REPO_FILEPATH = Path("/etc/yum.repos.d/cuda-rhel7.repo")
try:
run(["yum", "install", "-y"] + deps)
except CalledProcessError:
raise NvidiaDriverOpsError("Error installing driver dependencies.")

@property
def _nvidia_developer_repo(self) -> str:
"""Generate and return the Nvidia developer repo url."""
return (
# Grab the repo file and write it to the /etc/yum.repos.d/.
nvidia_developer_repo = (
"http://developer.download.nvidia.com/compute/cuda/repos/rhel7/"
f"{self._arch}/cuda-rhel7.repo"
)

@property
def _kernel_packages(self) -> List:
"""Return the appropriate kernel devel and header packages for the current kernel."""
return [f"kernel-devel-{self._uname_r}", f"kernel-headers-{self._uname_r}"]

def install(self) -> None:
"""Install nvidia drivers.
Install the Nvidia drivers as defined in the Nvidia documentation:
https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#centos7
"""
# Install Nvidia driver dependencies.
try:
run(["yum", "install", "-y"] + self.PACKAGE_DEPS)
except CalledProcessError:
raise NvidiaDriverOpsError("Error installing driver dependencies.")
# Grab the correct repo file and write it to the /etc/yum.repos.d/.
try:
req = requests.get(self._nvidia_developer_repo)
req = requests.get(nvidia_developer_repo)
except requests.exceptions.HTTPError:
raise NvidiaDriverOpsError(
f"Error getting nvidia_developer_repository from {self._nvidia_developer_repo}."
f"Error getting nvidia_developer_repository from {nvidia_developer_repo}."
)
self.NVIDIA_DRIVER_REPO_FILEPATH.write_text(req.text)
self._nvidia_driver_repo_filepath.write_text(req.text)

# Add the devel kernel and kernel headers.
try:
run(["yum", "install", "-y"] + self._kernel_packages)
run(
[
"yum",
"install",
"-y",
f"kernel-devel-{self._uname_r}",
f"kernel-headers-{self._uname_r}",
]
)
except CalledProcessError:
raise NvidiaDriverOpsError("Error installing devel kernel headers.")

# Expire the cache and update repos.
try:
run(["yum", "clean", "expire-cache"])
except CalledProcessError:
raise NvidiaDriverOpsError("Error flushing the cache.")

# Install nvidia-driver package..
try:
run(["yum", "install", "-y", self.NVIDIA_DRIVER_PACKAGE])
run(["yum", "install", "-y", self._driver_package])
except CalledProcessError:
raise NvidiaDriverOpsError("Error installing nvidia drivers.")

def remove(self) -> None:
    """Remove nvidia drivers from the system.

    Erases the configured driver package, deletes the cuda repo file if it
    exists and expires the yum cache. Kernel packages and generic build
    tools are left installed, since other software on the host may depend
    on them.

    Raises:
        NvidiaDriverOpsError: If ``yum erase`` or ``yum clean`` exits
            non-zero.
    """
    # Remove nvidia-driver package.
    try:
        # check=True is required for CalledProcessError to be raised.
        run(["yum", "erase", "-y", self._driver_package], check=True)
    except CalledProcessError as e:
        raise NvidiaDriverOpsError(
            "Error removing nvidia drivers from the system."
        ) from e

    # Remove the drivers repo
    if self._nvidia_driver_repo_filepath.exists():
        self._nvidia_driver_repo_filepath.unlink()

    # Expire the cache and update repos.
    try:
        run(["yum", "clean", "expire-cache"], check=True)
    except CalledProcessError as e:
        raise NvidiaDriverOpsError("Error flushing the cache.") from e

def version(self):
"""Return the version of nvidia-driver-latest-dkms."""
Expand All @@ -260,7 +249,7 @@ def version(self):
"-q",
"--queryformat",
"'%{VERSION}'",
self.NVIDIA_DRIVER_PACKAGE,
self._driver_package,
]
)
except CalledProcessError:
Expand Down

0 comments on commit 3c399ed

Please sign in to comment.