feat: maintain the ExtendedResource to support GPU sharing
angao committed Mar 2, 2020
1 parent 0dd599e commit e2bcd01
Showing 5 changed files with 280 additions and 45 deletions.
30 changes: 25 additions & 5 deletions main.go
@@ -22,17 +22,21 @@ import (
"syscall"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
resourcev1beta1 "github.com/caicloud/clientset/customclient/typed/resource/v1beta1"
"github.com/caicloud/clientset/kubernetes"
"github.com/fsnotify/fsnotify"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"k8s.io/client-go/rest"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

func getAllPlugins() []*NvidiaDevicePlugin {
func getAllPlugins(clientset resourcev1beta1.ExtendedResourceInterface) []*NvidiaDevicePlugin {
return []*NvidiaDevicePlugin{
NewNvidiaDevicePlugin(
clientset,
"nvidia.com/gpu",
NewGpuDeviceManager(),
"NVIDIA_VISIBLE_DEVICES",
pluginapi.DevicePluginPath + "nvidia.sock"),
pluginapi.DevicePluginPath+"nvidia.sock"),
}
}

@@ -59,8 +63,10 @@ func main() {
log.Println("Starting OS watcher.")
sigs := newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

clientset := createClientset()

log.Println("Retreiving plugins.")
plugins := getAllPlugins()
plugins := getAllPlugins(clientset)

restart:
// Loop through all plugins, idempotently stopping them, and then starting
@@ -72,7 +78,8 @@
p.Stop()

// Just continue if there are no devices to serve for plugin p.
if len(p.Devices()) == 0 {
devs, _ := p.Devices()
if len(devs) == 0 {
continue
}

@@ -131,3 +138,16 @@ events:
}
}
}

func createClientset() resourcev1beta1.ExtendedResourceInterface {
config, err := rest.InClusterConfig()
if err != nil {
log.Panicf("unable to create kubeconfig: %+v", err)
}

clientset, err := kubernetes.NewForConfig(config)
if err != nil {
log.Panicf("unable to create clientset: %+v", err)
}
return clientset.Custom().ResourceV1beta1().ExtendedResources()
}
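
Note: createClientset only works in-cluster. For local testing outside a cluster, rest.InClusterConfig returns an error; a minimal sketch of a kubeconfig fallback (not part of this commit; the helper name and kubeconfig parameter are hypothetical, and it additionally needs k8s.io/client-go/tools/clientcmd) could look like this:

// createClientsetWithFallback prefers the in-cluster config and falls back to
// a kubeconfig file so the plugin can be exercised outside a cluster.
func createClientsetWithFallback(kubeconfig string) resourcev1beta1.ExtendedResourceInterface {
	config, err := rest.InClusterConfig()
	if err != nil {
		// Not running in a pod: build the config from the given kubeconfig path.
		config, err = clientcmd.BuildConfigFromFlags("", kubeconfig)
		if err != nil {
			log.Panicf("unable to create kubeconfig: %+v", err)
		}
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		log.Panicf("unable to create clientset: %+v", err)
	}
	return clientset.Custom().ResourceV1beta1().ExtendedResources()
}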
35 changes: 35 additions & 0 deletions nvidia-device-plugin-rbac.yaml
@@ -0,0 +1,35 @@
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: nvidia-device-plugin-cr
rules:
- apiGroups:
  - "resource.caicloud.io"
  resources:
  - extendedresources
  verbs:
  - create
  - get
  - list
  - update
  - delete
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nvidia-device-plugin-sa
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: nvidia-device-plugin-crb
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-device-plugin-cr
subjects:
- kind: ServiceAccount
  name: nvidia-device-plugin-sa
  namespace: kube-system
24 changes: 9 additions & 15 deletions nvidia-device-plugin.yml
@@ -15,7 +15,7 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
name: nvidia-device-plugin-caicloud
namespace: kube-system
spec:
selector:
@@ -25,29 +25,23 @@ spec:
type: RollingUpdate
template:
metadata:
# This annotation is deprecated. Kept here for backward compatibility
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: nvidia-device-plugin-ds
spec:
tolerations:
# This toleration is deprecated. Kept here for backward compatibility
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
- key: CriticalAddonsOnly
operator: Exists
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
serviceAccount: nvidia-device-plugin-sa
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
containers:
- image: nvidia/k8s-device-plugin:1.0.0-beta4
name: nvidia-device-plugin-ctr
- image: cargo.dev.caicloud.xyz/release/nvidia-device-plugin-cus:1.1.0
name: nvidia-device-plugin
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
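
The NODE_NAME variable is injected through the downward API (spec.nodeName) so the plugin knows which node it serves and can maintain that node's ExtendedResource objects. A minimal sketch of reading it on the plugin side (the helper is illustrative and not part of this diff; it needs the standard fmt and os packages):

// nodeName returns the node this plugin instance runs on, as injected by the
// DaemonSet through the downward API (spec.nodeName -> NODE_NAME).
func nodeName() (string, error) {
	name := os.Getenv("NODE_NAME")
	if name == "" {
		return "", fmt.Errorf("NODE_NAME is not set; check the DaemonSet env configuration")
	}
	return name, nil
}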
15 changes: 8 additions & 7 deletions nvidia.go
@@ -22,8 +22,7 @@ import (
"strings"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"

pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

const (
@@ -32,11 +31,11 @@
)

type ResourceManager interface {
Devices() []*pluginapi.Device
Devices() ([]*pluginapi.Device, []*nvml.Device)
CheckHealth(stop <-chan interface{}, devices []*pluginapi.Device, unhealthy chan<- *pluginapi.Device)
}

type GpuDeviceManager struct {}
type GpuDeviceManager struct{}

func check(err error) {
if err != nil {
@@ -48,18 +47,20 @@
return &GpuDeviceManager{}
}

func (g *GpuDeviceManager) Devices() []*pluginapi.Device {
func (g *GpuDeviceManager) Devices() ([]*pluginapi.Device, []*nvml.Device) {
n, err := nvml.GetDeviceCount()
check(err)

var devs []*pluginapi.Device
var nvmlDevs []*nvml.Device
for i := uint(0); i < n; i++ {
d, err := nvml.NewDeviceLite(i)
d, err := nvml.NewDevice(i)
check(err)
devs = append(devs, buildPluginDevice(d))
nvmlDevs = append(nvmlDevs, d)
}

return devs
return devs, nvmlDevs
}

func (g *GpuDeviceManager) CheckHealth(stop <-chan interface{}, devices []*pluginapi.Device, unhealthy chan<- *pluginapi.Device) {
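
Devices now returns the nvml.Device handles alongside the kubelet plugin devices, so callers can read per-GPU attributes (for example total memory) when building shared-GPU extended resources. A hedged sketch of consuming the paired slices; the helper is hypothetical, and the Memory field (*uint64, in MiB) is assumed from the gpu-monitoring-tools bindings:

// memoryByDeviceID maps each kubelet device ID to the GPU's total memory in MiB.
// Illustrative only; relies on devs and nvmlDevs being appended in lockstep.
func memoryByDeviceID(m ResourceManager) map[string]uint64 {
	devs, nvmlDevs := m.Devices()
	out := make(map[string]uint64, len(devs))
	for i, d := range devs {
		if nvmlDevs[i].Memory != nil { // Memory is assumed to be *uint64 in the bindings
			out[d.ID] = *nvmlDevs[i].Memory
		}
	}
	return out
}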
