Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

maintain the er object to support GPU sharing #10

Open
wants to merge 3 commits into
base: release-1.1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
519 changes: 499 additions & 20 deletions Gopkg.lock

Large diffs are not rendered by default.

16 changes: 10 additions & 6 deletions Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,20 @@
version = "1.4.7"

[[constraint]]
branch = "master"
name = "golang.org/x/net"
name = "google.golang.org/grpc"
version = "1.27.1"

[[constraint]]
name = "google.golang.org/grpc"
version = "1.24.0"
name = "k8s.io/client-go"
version = "0.17.3"

[[constraint]]
name = "k8s.io/kubelet"
version = "0.17.3"

[[constraint]]
name = "k8s.io/kubernetes"
version = "1.16.0"
name = "github.com/caicloud/clientset"
branch = "release-2.11"

[prune]
go-tests = true
Expand Down
9 changes: 9 additions & 0 deletions OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
approvers:
- pendoragon
- angao
- zw0610

reviewers:
- pendoragon
- angao
- zw0610
30 changes: 25 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,21 @@ import (
"syscall"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
resourcev1beta1 "github.com/caicloud/clientset/customclient/typed/resource/v1beta1"
"github.com/caicloud/clientset/kubernetes"
"github.com/fsnotify/fsnotify"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"k8s.io/client-go/rest"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

func getAllPlugins() []*NvidiaDevicePlugin {
func getAllPlugins(clientset resourcev1beta1.ExtendedResourceInterface) []*NvidiaDevicePlugin {
return []*NvidiaDevicePlugin{
NewNvidiaDevicePlugin(
clientset,
"nvidia.com/gpu",
NewGpuDeviceManager(),
"NVIDIA_VISIBLE_DEVICES",
pluginapi.DevicePluginPath + "nvidia.sock"),
pluginapi.DevicePluginPath+"nvidia.sock"),
}
}

Expand All @@ -59,8 +63,10 @@ func main() {
log.Println("Starting OS watcher.")
sigs := newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

clientset := createClientset()

log.Println("Retreiving plugins.")
plugins := getAllPlugins()
plugins := getAllPlugins(clientset)

restart:
// Loop through all plugins, idempotently stopping them, and then starting
Expand All @@ -72,7 +78,8 @@ restart:
p.Stop()

// Just continue if there are no devices to serve for plugin p.
if len(p.Devices()) == 0 {
devs, _ := p.Devices()
if len(devs) == 0 {
continue
}

Expand Down Expand Up @@ -131,3 +138,16 @@ events:
}
}
}

// createClientset returns a client for ExtendedResource objects, built from
// the in-cluster REST configuration. It logs and panics if either the
// configuration or the clientset cannot be created.
func createClientset() resourcev1beta1.ExtendedResourceInterface {
	cfg, cfgErr := rest.InClusterConfig()
	if cfgErr != nil {
		log.Panicf("unable to create kubeconfig: %+v", cfgErr)
	}

	client, clientErr := kubernetes.NewForConfig(cfg)
	if clientErr != nil {
		log.Panicf("unable to create clientset: %+v", clientErr)
	}

	// Narrow the full clientset down to the ExtendedResource API.
	resourceAPI := client.Custom().ResourceV1beta1()
	return resourceAPI.ExtendedResources()
}
35 changes: 35 additions & 0 deletions nvidia-device-plugin-rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# ClusterRole granting create/get/list/update/delete on the
# "extendedresources" resource in the resource.caicloud.io API group.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: nvidia-device-plugin-cr
rules:
- apiGroups:
- "resource.caicloud.io"
resources:
- extendedresources
verbs:
- create
- get
- list
- update
- delete
---
# ServiceAccount in kube-system; the DaemonSet in nvidia-device-plugin.yml
# names it in its serviceAccount field.
apiVersion: v1
kind: ServiceAccount
metadata:
name: nvidia-device-plugin-sa
namespace: kube-system
---
# Binds the nvidia-device-plugin-cr ClusterRole to the
# nvidia-device-plugin-sa ServiceAccount in kube-system.
# ClusterRoleBinding is a cluster-scoped kind, so its metadata carries no
# namespace; only the ServiceAccount subject below is namespaced.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: nvidia-device-plugin-crb
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-device-plugin-cr
subjects:
- kind: ServiceAccount
  name: nvidia-device-plugin-sa
  namespace: kube-system
24 changes: 9 additions & 15 deletions nvidia-device-plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
name: nvidia-device-plugin-caicloud
namespace: kube-system
spec:
selector:
Expand All @@ -25,29 +25,23 @@ spec:
type: RollingUpdate
template:
metadata:
# This annotation is deprecated. Kept here for backward compatibility
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: nvidia-device-plugin-ds
spec:
tolerations:
# This toleration is deprecated. Kept here for backward compatibility
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
- key: CriticalAddonsOnly
operator: Exists
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
serviceAccount: nvidia-device-plugin-sa
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
containers:
- image: nvidia/k8s-device-plugin:1.0.0-beta4
name: nvidia-device-plugin-ctr
- image: cargo.dev.caicloud.xyz/release/nvidia-device-plugin-cus:1.1.0
name: nvidia-device-plugin
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
Expand Down
15 changes: 8 additions & 7 deletions nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@ import (
"strings"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"

pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

const (
Expand All @@ -32,11 +31,11 @@ const (
)

type ResourceManager interface {
Devices() []*pluginapi.Device
Devices() ([]*pluginapi.Device, []*nvml.Device)
CheckHealth(stop <-chan interface{}, devices []*pluginapi.Device, unhealthy chan<- *pluginapi.Device)
}

type GpuDeviceManager struct {}
type GpuDeviceManager struct{}

func check(err error) {
if err != nil {
Expand All @@ -48,18 +47,20 @@ func NewGpuDeviceManager() *GpuDeviceManager {
return &GpuDeviceManager{}
}

func (g *GpuDeviceManager) Devices() []*pluginapi.Device {
func (g *GpuDeviceManager) Devices() ([]*pluginapi.Device, []*nvml.Device) {
n, err := nvml.GetDeviceCount()
check(err)

var devs []*pluginapi.Device
var nvmlDevs []*nvml.Device
for i := uint(0); i < n; i++ {
d, err := nvml.NewDeviceLite(i)
d, err := nvml.NewDevice(i)
check(err)
devs = append(devs, buildPluginDevice(d))
nvmlDevs = append(nvmlDevs, d)
}

return devs
return devs, nvmlDevs
}

func (g *GpuDeviceManager) CheckHealth(stop <-chan interface{}, devices []*pluginapi.Device, unhealthy chan<- *pluginapi.Device) {
Expand Down
Loading