From 623d050d667c9b6f5acc8e820400eacbe57914a8 Mon Sep 17 00:00:00 2001 From: mkjpryor Date: Tue, 9 Jan 2024 11:56:00 +0000 Subject: [PATCH] Update nvidia-gpu-operator addon to v23.9.1 --- manifests.yaml | 619 ++++++++++++++++++++++ skopeo-manifests/nvidia-gpu-operator.yaml | 32 ++ 2 files changed, 651 insertions(+) create mode 100644 manifests.yaml create mode 100644 skopeo-manifests/nvidia-gpu-operator.yaml diff --git a/manifests.yaml b/manifests.yaml new file mode 100644 index 000000000..2d5a2947d --- /dev/null +++ b/manifests.yaml @@ -0,0 +1,619 @@ +--- +# Source: gpu-operator/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: gpu-operator + labels: + app.kubernetes.io/name: gpu-operator + helm.sh/chart: gpu-operator-v23.9.1 + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/version: "v23.9.1" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: "gpu-operator" +--- +# Source: gpu-operator/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: gpu-operator + labels: + app.kubernetes.io/name: gpu-operator + helm.sh/chart: gpu-operator-v23.9.1 + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/version: "v23.9.1" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: "gpu-operator" +rules: +- apiGroups: + - config.openshift.io + resources: + - proxies + verbs: + - get +- apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + - rolebindings + - clusterroles + - clusterrolebindings + verbs: + - '*' +- apiGroups: + - "" + resources: + - pods + - services + - endpoints + - persistentvolumeclaims + - events + - configmaps + - secrets + - serviceaccounts + - nodes + verbs: + - '*' +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list + - create + - watch + - update + - patch +- apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - '*' +- apiGroups: + - apps + resources: + - controllerrevisions + verbs: + - 'get' + - 'list' + - 'watch' +- apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + - prometheusrules + verbs: + - get + - list + - create + - watch + - update + - delete +- apiGroups: + - nvidia.com + resources: + - '*' + verbs: + - '*' +- apiGroups: + - scheduling.k8s.io + resources: + - priorityclasses + verbs: + - get + - list + - watch + - create +- apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - '*' +- apiGroups: + - policy + resources: + - podsecuritypolicies + verbs: + - use + resourceNames: + - gpu-operator-restricted +- apiGroups: + - policy + resources: + - podsecuritypolicies + verbs: + - create + - get + - update + - list + - delete +- apiGroups: + - config.openshift.io + resources: + - clusterversions + verbs: + - get + - list + - watch +- apiGroups: + - "" + - coordination.k8s.io + resources: + - configmaps + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - node.k8s.io + resources: + - runtimeclasses + verbs: + - get + - list + - create + - update + - watch + - delete +- apiGroups: + - image.openshift.io + resources: + - imagestreams + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - pods + - pods/eviction + verbs: + - get + - list + - watch + - create + - delete + - update + - patch +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - create + - update + - patch +- apiGroups: + - 
apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - update + - patch + - create +--- +# Source: gpu-operator/templates/rolebinding.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: gpu-operator + labels: + app.kubernetes.io/name: gpu-operator + helm.sh/chart: gpu-operator-v23.9.1 + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/version: "v23.9.1" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: "gpu-operator" +subjects: +- kind: ServiceAccount + name: gpu-operator + namespace: default +- kind: ServiceAccount + name: node-feature-discovery + namespace: default +roleRef: + kind: ClusterRole + name: gpu-operator + apiGroup: rbac.authorization.k8s.io +--- +# Source: gpu-operator/templates/operator.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gpu-operator + labels: + app.kubernetes.io/name: gpu-operator + helm.sh/chart: gpu-operator-v23.9.1 + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/version: "v23.9.1" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: "gpu-operator" + nvidia.com/gpu-driver-upgrade-drain.skip: "true" +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: "gpu-operator" + app: "gpu-operator" + template: + metadata: + labels: + app.kubernetes.io/name: gpu-operator + helm.sh/chart: gpu-operator-v23.9.1 + app.kubernetes.io/instance: gpu-operator + app.kubernetes.io/version: "v23.9.1" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: "gpu-operator" + app: "gpu-operator" + nvidia.com/gpu-driver-upgrade-drain.skip: "true" + annotations: + openshift.io/scc: restricted-readonly + spec: + serviceAccountName: gpu-operator + priorityClassName: system-node-critical + containers: + - name: gpu-operator + image: nvcr.io/nvidia/gpu-operator:v23.9.1 + imagePullPolicy: IfNotPresent + command: ["gpu-operator"] + args: + - --leader-elect + - --zap-time-encoding=epoch + - --zap-log-level=info + env: + - name: WATCH_NAMESPACE + value: "" + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: "DRIVER_MANAGER_IMAGE" + value: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.6.2" + volumeMounts: + - name: host-os-release + mountPath: "/host-etc/os-release" + readOnly: true + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 500m + memory: 350Mi + requests: + cpu: 200m + memory: 100Mi + ports: + - name: metrics + containerPort: 8080 + volumes: + - name: host-os-release + hostPath: + path: "/etc/os-release" + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/master + operator: In + values: + - "" + weight: 1 + - preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: In + values: + - "" + weight: 1 + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Equal + value: "" + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Equal + value: "" +--- +# Source: gpu-operator/templates/clusterpolicy.yaml +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: cluster-policy + labels: + app.kubernetes.io/name: gpu-operator + helm.sh/chart: gpu-operator-v23.9.1 + 
app.kubernetes.io/instance: gpu-operator
+    app.kubernetes.io/version: "v23.9.1"
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/component: "gpu-operator"
+spec:
+  operator:
+    defaultRuntime: docker
+    runtimeClass: nvidia
+    initContainer:
+      repository: nvcr.io/nvidia
+      image: cuda
+      version: "12.3.1-base-ubi8"
+      imagePullPolicy: IfNotPresent
+  daemonsets:
+    labels:
+      helm.sh/chart: gpu-operator-v23.9.1
+      app.kubernetes.io/managed-by: gpu-operator
+    tolerations:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+    priorityClassName: system-node-critical
+    updateStrategy: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: "1"
+  validator:
+    repository: nvcr.io/nvidia/cloud-native
+    image: gpu-operator-validator
+    version: "v23.9.1"
+    imagePullPolicy: IfNotPresent
+    plugin:
+      env:
+        - name: WITH_WORKLOAD
+          value: "false"
+
+  mig:
+    strategy: single
+  psp:
+    enabled: false
+  psa:
+    enabled: false
+  cdi:
+    enabled: false
+    default: false
+  driver:
+    enabled: true
+    useNvidiaDriverCRD: false
+    useOpenKernelModules: false
+    usePrecompiled: false
+    repository: nvcr.io/nvidia
+    image: driver
+    version: "535.129.03"
+    imagePullPolicy: IfNotPresent
+    startupProbe:
+      failureThreshold: 120
+      initialDelaySeconds: 60
+      periodSeconds: 10
+      timeoutSeconds: 60
+    rdma:
+      enabled: false
+      useHostMofed: false
+    manager:
+      repository: nvcr.io/nvidia/cloud-native
+      image: k8s-driver-manager
+      version: "v0.6.5"
+      imagePullPolicy: IfNotPresent
+      env:
+        - name: ENABLE_GPU_POD_EVICTION
+          value: "true"
+        - name: ENABLE_AUTO_DRAIN
+          value: "false"
+        - name: DRAIN_USE_FORCE
+          value: "false"
+        - name: DRAIN_POD_SELECTOR_LABEL
+          value: ""
+        - name: DRAIN_TIMEOUT_SECONDS
+          value: "0s"
+        - name: DRAIN_DELETE_EMPTYDIR_DATA
+          value: "false"
+    repoConfig:
+      configMapName: ""
+    certConfig:
+      name: ""
+    licensingConfig:
+      configMapName: ""
+      nlsEnabled: true
+    virtualTopology:
+      config: ""
+    kernelModuleConfig:
+      name: ""
+    upgradePolicy:
+      autoUpgrade: true
+      maxParallelUpgrades: 1
+      maxUnavailable: 25%
+      waitForCompletion:
+        timeoutSeconds: 0
+      podDeletion:
+        force: false
+        timeoutSeconds: 300
+        deleteEmptyDir: false
+      drain:
+        enable: false
+        force: false
+        timeoutSeconds: 300
+        deleteEmptyDir: false
+  vgpuManager:
+    enabled: false
+    image: vgpu-manager
+    imagePullPolicy: IfNotPresent
+    driverManager:
+      repository: nvcr.io/nvidia/cloud-native
+      image: k8s-driver-manager
+      version: "v0.6.4"
+      imagePullPolicy: IfNotPresent
+      env:
+        - name: ENABLE_GPU_POD_EVICTION
+          value: "false"
+        - name: ENABLE_AUTO_DRAIN
+          value: "false"
+  kataManager:
+    enabled: false
+    config:
+      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
+      runtimeClasses:
+        - artifacts:
+            pullSecret: ""
+            url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03
+          name: kata-qemu-nvidia-gpu
+          nodeSelector: {}
+        - artifacts:
+            pullSecret: ""
+            url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp
+          name: kata-qemu-nvidia-gpu-snp
+          nodeSelector:
+            nvidia.com/cc.capable: "true"
+    repository: nvcr.io/nvidia/cloud-native
+    image: k8s-kata-manager
+    version: "v0.1.2"
+    imagePullPolicy: IfNotPresent
+  vfioManager:
+    enabled: true
+    repository: nvcr.io/nvidia
+    image: cuda
+    version: "12.3.1-base-ubi8"
+    imagePullPolicy: IfNotPresent
+    driverManager:
+      repository: nvcr.io/nvidia/cloud-native
+      image: k8s-driver-manager
+      version: "v0.6.2"
+      imagePullPolicy: IfNotPresent
+      env:
+        - name: ENABLE_GPU_POD_EVICTION
+          value: "false"
+        - name: ENABLE_AUTO_DRAIN
+          value: "false"
+  vgpuDeviceManager:
+    enabled: true
+    repository: nvcr.io/nvidia/cloud-native
+    image: vgpu-device-manager
+    version: "v0.2.4"
+    imagePullPolicy: IfNotPresent
+    config:
+      default: default
+      name: ""
+  ccManager:
+    enabled: false
+    defaultMode: "off"
+    repository: nvcr.io/nvidia/cloud-native
+    image: k8s-cc-manager
+    version: "v0.1.1"
+    imagePullPolicy: IfNotPresent
+    env:
+      []
+  toolkit:
+    enabled: true
+    repository: nvcr.io/nvidia/k8s
+    image: container-toolkit
+    version: "v1.14.3-ubuntu20.04"
+    imagePullPolicy: IfNotPresent
+    installDir: /usr/local/nvidia
+  devicePlugin:
+    enabled: true
+    repository: nvcr.io/nvidia
+    image: k8s-device-plugin
+    version: "v0.14.3-ubi8"
+    imagePullPolicy: IfNotPresent
+    env:
+      - name: PASS_DEVICE_SPECS
+        value: "true"
+      - name: FAIL_ON_INIT_ERROR
+        value: "true"
+      - name: DEVICE_LIST_STRATEGY
+        value: envvar
+      - name: DEVICE_ID_STRATEGY
+        value: uuid
+      - name: NVIDIA_VISIBLE_DEVICES
+        value: all
+      - name: NVIDIA_DRIVER_CAPABILITIES
+        value: all
+  dcgm:
+    enabled: false
+    repository: nvcr.io/nvidia/cloud-native
+    image: dcgm
+    version: "3.3.0-1-ubuntu22.04"
+    imagePullPolicy: IfNotPresent
+    hostPort: 5555
+  dcgmExporter:
+    enabled: true
+    repository: nvcr.io/nvidia/k8s
+    image: dcgm-exporter
+    version: "3.3.0-3.2.0-ubuntu22.04"
+    imagePullPolicy: IfNotPresent
+    env:
+      - name: DCGM_EXPORTER_LISTEN
+        value: ":9400"
+      - name: DCGM_EXPORTER_KUBERNETES
+        value: "true"
+      - name: DCGM_EXPORTER_COLLECTORS
+        value: /etc/dcgm-exporter/dcp-metrics-included.csv
+    serviceMonitor:
+      additionalLabels: {}
+      enabled: false
+      honorLabels: false
+      interval: 15s
+      relabelings: []
+  gfd:
+    enabled: true
+    repository: nvcr.io/nvidia
+    image: gpu-feature-discovery
+    version: "v0.8.2-ubi8"
+    imagePullPolicy: IfNotPresent
+    env:
+      - name: GFD_SLEEP_INTERVAL
+        value: "60s"
+      - name: GFD_FAIL_ON_INIT_ERROR
+        value: "true"
+  migManager:
+    enabled: true
+    repository: nvcr.io/nvidia/cloud-native
+    image: k8s-mig-manager
+    version: "v0.5.5-ubuntu20.04"
+    imagePullPolicy: IfNotPresent
+    env:
+      - name: WITH_REBOOT
+        value: "false"
+    config:
+      default: all-disabled
+      name: default-mig-parted-config
+    gpuClientsConfig:
+      name: ""
+  nodeStatusExporter:
+    enabled: false
+    repository: nvcr.io/nvidia/cloud-native
+    image: gpu-operator-validator
+    version: "v23.9.1"
+    imagePullPolicy: IfNotPresent
+  sandboxWorkloads:
+    enabled: false
+    defaultWorkload: container
+  sandboxDevicePlugin:
+    enabled: true
+    repository: nvcr.io/nvidia
+    image: kubevirt-gpu-device-plugin
+    version: "v1.2.4"
+    imagePullPolicy: IfNotPresent
diff --git a/skopeo-manifests/nvidia-gpu-operator.yaml b/skopeo-manifests/nvidia-gpu-operator.yaml
new file mode 100644
index 000000000..7b6068b6c
--- /dev/null
+++ b/skopeo-manifests/nvidia-gpu-operator.yaml
@@ -0,0 +1,32 @@
+nvcr.io:
+  images:
+    nvidia/cloud-native/dcgm:
+      - 3.3.0-1-ubuntu22.04
+    nvidia/cloud-native/gpu-operator-validator:
+      - v23.9.1
+    nvidia/cloud-native/k8s-cc-manager:
+      - v0.1.1
+    nvidia/cloud-native/k8s-driver-manager:
+      - v0.6.2
+      - v0.6.4
+      - v0.6.5
+    nvidia/cloud-native/k8s-kata-manager:
+      - v0.1.2
+    nvidia/cloud-native/k8s-mig-manager:
+      - v0.5.5-ubuntu20.04
+    nvidia/cloud-native/vgpu-device-manager:
+      - v0.2.4
+    nvidia/cuda:
+      - 12.3.1-base-ubi8
+    nvidia/gpu-feature-discovery:
+      - v0.8.2-ubi8
+    nvidia/gpu-operator:
+      - v23.9.1
+    nvidia/k8s-device-plugin:
+      - v0.14.3-ubi8
+    nvidia/k8s/container-toolkit:
+      - v1.14.3-ubuntu20.04
+    nvidia/k8s/dcgm-exporter:
+      - 3.3.0-3.2.0-ubuntu22.04
+    nvidia/kubevirt-gpu-device-plugin:
+      - v1.2.4
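
For reference, skopeo-manifests/nvidia-gpu-operator.yaml above is a registry -> image -> tags map. A minimal sketch (not part of the patch) of how such a manifest could drive image mirroring with skopeo copy, assuming PyYAML and the skopeo CLI are installed; MIRROR_REGISTRY and the script name are illustrative:

# mirror_images.py -- hypothetical helper, not shipped by this repo.
# Reads the skopeo manifest and copies each listed image/tag to a mirror.
import subprocess
import yaml  # assumes PyYAML

MIRROR_REGISTRY = "registry.example.com"  # hypothetical destination registry

with open("skopeo-manifests/nvidia-gpu-operator.yaml") as f:
    manifest = yaml.safe_load(f)

for registry, spec in manifest.items():
    for image, tags in spec["images"].items():
        for tag in tags:
            src = f"docker://{registry}/{image}:{tag}"
            dst = f"docker://{MIRROR_REGISTRY}/{image}:{tag}"
            # skopeo copies between registries without needing a local daemon
            subprocess.run(["skopeo", "copy", src, dst], check=True)

Keeping every k8s-driver-manager tag referenced by the ClusterPolicy (v0.6.2, v0.6.4 and v0.6.5) in the manifest means an air-gapped mirror built from it can satisfy the driver, vGPU manager and VFIO manager components without reaching nvcr.io.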