diff --git a/.golangci.yml b/.golangci.yml index d9cab6c7a..283d4a089 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -34,3 +34,8 @@ issues: linters: - errcheck text: config.Delete + # RENDERD refers to the Render Device and not the past tense of render. + - path: .*.go + linters: + - misspell + text: "`RENDERD` is a misspelling of `RENDERED`" diff --git a/internal/discover/cache.go b/internal/discover/cache.go new file mode 100644 index 000000000..e9e4b615b --- /dev/null +++ b/internal/discover/cache.go @@ -0,0 +1,80 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package discover + +import "sync" + +type cache struct { + d Discover + + sync.Mutex + devices []Device + hooks []Hook + mounts []Mount +} + +var _ Discover = (*cache)(nil) + +// WithCache decorates the specified disoverer with a cache. +func WithCache(d Discover) Discover { + if d == nil { + return None{} + } + return &cache{d: d} +} + +func (c *cache) Devices() ([]Device, error) { + c.Lock() + defer c.Unlock() + + if c.devices == nil { + devices, err := c.d.Devices() + if err != nil { + return nil, err + } + c.devices = devices + } + return c.devices, nil +} + +func (c *cache) Hooks() ([]Hook, error) { + c.Lock() + defer c.Unlock() + + if c.hooks == nil { + hooks, err := c.d.Hooks() + if err != nil { + return nil, err + } + c.hooks = hooks + } + return c.hooks, nil +} + +func (c *cache) Mounts() ([]Mount, error) { + c.Lock() + defer c.Unlock() + + if c.mounts == nil { + mounts, err := c.d.Mounts() + if err != nil { + return nil, err + } + c.mounts = mounts + } + return c.mounts, nil +} diff --git a/internal/discover/first-valid.go b/internal/discover/first-valid.go new file mode 100644 index 000000000..36de92048 --- /dev/null +++ b/internal/discover/first-valid.go @@ -0,0 +1,72 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package discover + +import "errors" + +type firstOf []Discover + +// FirstValid returns a discoverer that returns the first non-error result from a list of discoverers. +func FirstValid(discoverers ...Discover) Discover { + var f firstOf + for _, d := range discoverers { + if d == nil { + continue + } + f = append(f, d) + } + return f +} + +func (f firstOf) Devices() ([]Device, error) { + var errs error + for _, d := range f { + devices, err := d.Devices() + if err != nil { + errs = errors.Join(errs, err) + continue + } + return devices, nil + } + return nil, errs +} + +func (f firstOf) Hooks() ([]Hook, error) { + var errs error + for _, d := range f { + hooks, err := d.Hooks() + if err != nil { + errs = errors.Join(errs, err) + continue + } + return hooks, nil + } + return nil, errs +} + +func (f firstOf) Mounts() ([]Mount, error) { + var errs error + for _, d := range f { + mounts, err := d.Mounts() + if err != nil { + errs = errors.Join(errs, err) + continue + } + return mounts, nil + } + return nil, nil +} diff --git a/internal/platform-support/dgpu/dgpu.go b/internal/platform-support/dgpu/dgpu.go index b79f6bd42..535f4f1ea 100644 --- a/internal/platform-support/dgpu/dgpu.go +++ b/internal/platform-support/dgpu/dgpu.go @@ -17,6 +17,8 @@ package dgpu import ( + "errors" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" @@ -25,22 +27,79 @@ import ( ) // NewForDevice creates a discoverer for the specified Device. +// nvsandboxutils is used for discovery if specified, otherwise NVML is used. func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) { o := new(opts...) + o.isMigDevice = false + + var discoverers []discover.Discover + var errs error + nvsandboxutilsDiscoverer, err := o.newNvsandboxutilsDGPUDiscoverer(d) + if err != nil { + // TODO: Log a warning + errs = errors.Join(errs, err) + } else if nvsandboxutilsDiscoverer != nil { + discoverers = append(discoverers, nvsandboxutilsDiscoverer) + } + + nvmlDiscoverer, err := o.newNvmlDGPUDiscoverer(&toRequiredInfo{d}) + if err != nil { + // TODO: Log a warning + errs = errors.Join(errs, err) + } else if nvmlDiscoverer != nil { + discoverers = append(discoverers, nvmlDiscoverer) + } + + if len(discoverers) == 0 { + return nil, errs + } - return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d}) + return discover.WithCache( + discover.FirstValid( + discoverers..., + ), + ), nil } -// NewForDevice creates a discoverer for the specified device and its associated MIG device. +// NewForMigDevice creates a discoverer for the specified device and its associated MIG device. +// nvsandboxutils is used for discovery if specified, otherwise NVML is used. func NewForMigDevice(d device.Device, mig device.MigDevice, opts ...Option) (discover.Discover, error) { o := new(opts...) + o.isMigDevice = true - return o.newNvmlMigDiscoverer( + var discoverers []discover.Discover + var errs error + nvsandboxutilsDiscoverer, err := o.newNvsandboxutilsDGPUDiscoverer(mig) + if err != nil { + // TODO: Log a warning + errs = errors.Join(errs, err) + } else if nvsandboxutilsDiscoverer != nil { + discoverers = append(discoverers, nvsandboxutilsDiscoverer) + } + + nvmlDiscoverer, err := o.newNvmlMigDiscoverer( &toRequiredMigInfo{ MigDevice: mig, parent: &toRequiredInfo{d}, }, ) + if err != nil { + // TODO: Log a warning + errs = errors.Join(errs, err) + } else if nvmlDiscoverer != nil { + discoverers = append(discoverers, nvmlDiscoverer) + } + + if len(discoverers) == 0 { + return nil, errs + } + + return discover.WithCache( + discover.FirstValid( + discoverers..., + ), + ), nil + } func new(opts ...Option) *options { diff --git a/internal/platform-support/dgpu/nvsandboxutils.go b/internal/platform-support/dgpu/nvsandboxutils.go new file mode 100644 index 000000000..7022deab9 --- /dev/null +++ b/internal/platform-support/dgpu/nvsandboxutils.go @@ -0,0 +1,131 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "fmt" + "path/filepath" + "strings" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" +) + +type nvsandboxutilsDGPU struct { + lib nvsandboxutils.Interface + uuid string + devRoot string + isMig bool + nvidiaCDIHookPath string + deviceLinks []string +} + +var _ discover.Discover = (*nvsandboxutilsDGPU)(nil) + +type UUIDer interface { + GetUUID() (string, nvml.Return) +} + +func (o *options) newNvsandboxutilsDGPUDiscoverer(d UUIDer) (discover.Discover, error) { + if o.nvsandboxutilslib == nil { + return nil, nil + } + + uuid, nvmlRet := d.GetUUID() + if nvmlRet != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device UUID: %w", nvmlRet) + } + + nvd := nvsandboxutilsDGPU{ + lib: o.nvsandboxutilslib, + uuid: uuid, + devRoot: strings.TrimSuffix(filepath.Clean(o.devRoot), "/dev"), + isMig: o.isMigDevice, + nvidiaCDIHookPath: o.nvidiaCDIHookPath, + } + + return &nvd, nil +} + +func (d *nvsandboxutilsDGPU) Devices() ([]discover.Device, error) { + gpuFileInfos, ret := d.lib.GetGpuResource(d.uuid) + if ret != nvsandboxutils.SUCCESS { + return nil, fmt.Errorf("failed to get GPU resource: %w", ret) + } + + var devices []discover.Device + for _, info := range gpuFileInfos { + switch { + case info.SubType == nvsandboxutils.NV_DEV_DRI_CARD, info.SubType == nvsandboxutils.NV_DEV_DRI_RENDERD: + if d.isMig { + continue + } + fallthrough + case info.SubType == nvsandboxutils.NV_DEV_NVIDIA, info.SubType == nvsandboxutils.NV_DEV_NVIDIA_CAPS_NVIDIA_CAP: + containerPath := info.Path + if d.devRoot != "/" { + containerPath = strings.TrimPrefix(containerPath, d.devRoot) + } + + // TODO: Extend discover.Device with additional information. + device := discover.Device{ + HostPath: info.Path, + Path: containerPath, + } + devices = append(devices, device) + case info.SubType == nvsandboxutils.NV_DEV_DRI_CARD_SYMLINK, info.SubType == nvsandboxutils.NV_DEV_DRI_RENDERD_SYMLINK: + if d.isMig { + continue + } + if info.Flags == nvsandboxutils.NV_FILE_FLAG_CONTENT { + targetPath, ret := d.lib.GetFileContent(info.Path) + if ret != nvsandboxutils.SUCCESS { + return nil, fmt.Errorf("failed to get symlink: %w", ret) + } + d.deviceLinks = append(d.deviceLinks, fmt.Sprintf("%v::%v", targetPath, info.Path)) + } + } + } + + return devices, nil +} + +// Hooks returns a hook to create the by-path symlinks for the discovered devices. +func (d *nvsandboxutilsDGPU) Hooks() ([]discover.Hook, error) { + if len(d.deviceLinks) == 0 { + return nil, nil + } + + var args []string + for _, l := range d.deviceLinks { + args = append(args, "--link", l) + } + + hook := discover.CreateNvidiaCDIHook( + d.nvidiaCDIHookPath, + "create-symlinks", + args..., + ) + + return []discover.Hook{hook}, nil +} + +func (d *nvsandboxutilsDGPU) Mounts() ([]discover.Mount, error) { + return nil, nil +} diff --git a/internal/platform-support/dgpu/nvsandboxutils_test.go b/internal/platform-support/dgpu/nvsandboxutils_test.go new file mode 100644 index 000000000..fb9df1878 --- /dev/null +++ b/internal/platform-support/dgpu/nvsandboxutils_test.go @@ -0,0 +1,174 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "testing" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvml/pkg/nvml" + mocknvml "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + testlog "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" + mocknvsandboxutils "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils/mock" +) + +func TestNewNvsandboxutilsDGPUDiscoverer(t *testing.T) { + logger, _ := testlog.NewNullLogger() + + nvmllib := &mocknvml.Interface{} + devicelib := device.New( + nvmllib, + ) + + testCases := []struct { + description string + devRoot string + device nvml.Device + nvsandboxutils nvsandboxutils.Interface + expectedError error + expectedDevices []discover.Device + expectedHooks []discover.Hook + expectedMounts []discover.Mount + }{ + { + description: "detects host devices", + device: &mocknvml.Device{ + GetUUIDFunc: func() (string, nvml.Return) { + return "GPU-1234", nvml.SUCCESS + }, + }, + nvsandboxutils: &mocknvsandboxutils.Interface{ + GetGpuResourceFunc: func(s string) ([]nvsandboxutils.GpuFileInfo, nvsandboxutils.Ret) { + infos := []nvsandboxutils.GpuFileInfo{ + { + Path: "/dev/nvidia0", + Type: nvsandboxutils.NV_DEV, + }, + { + Path: "/dev/nvidiactl", + Type: nvsandboxutils.NV_DEV, + }, + { + Path: "/dev/nvidia-uvm", + Type: nvsandboxutils.NV_DEV, + }, + { + Path: "/dev/nvidia-uvm-tools", + Type: nvsandboxutils.NV_DEV, + }, + } + return infos, nvsandboxutils.SUCCESS + }, + }, + expectedDevices: []discover.Device{ + { + Path: "/dev/nvidia0", + HostPath: "/dev/nvidia0", + }, + { + Path: "/dev/nvidiactl", + HostPath: "/dev/nvidiactl", + }, + { + Path: "/dev/nvidia-uvm", + HostPath: "/dev/nvidia-uvm", + }, + { + Path: "/dev/nvidia-uvm-tools", + HostPath: "/dev/nvidia-uvm-tools", + }, + }, + }, + { + description: "detects container devices", + devRoot: "/some/root", + device: &mocknvml.Device{ + GetUUIDFunc: func() (string, nvml.Return) { + return "GPU-1234", nvml.SUCCESS + }, + }, + nvsandboxutils: &mocknvsandboxutils.Interface{ + GetGpuResourceFunc: func(s string) ([]nvsandboxutils.GpuFileInfo, nvsandboxutils.Ret) { + infos := []nvsandboxutils.GpuFileInfo{ + { + Path: "/some/root/dev/nvidia0", + Type: nvsandboxutils.NV_DEV, + }, + { + Path: "/some/root/dev/nvidiactl", + Type: nvsandboxutils.NV_DEV, + }, + { + Path: "/some/root/dev/nvidia-uvm", + Type: nvsandboxutils.NV_DEV, + }, + { + Path: "/some/root/dev/nvidia-uvm-tools", + Type: nvsandboxutils.NV_DEV, + }, + } + return infos, nvsandboxutils.SUCCESS + }, + }, + expectedDevices: []discover.Device{ + { + Path: "/dev/nvidia0", + HostPath: "/some/root/dev/nvidia0", + }, + { + Path: "/dev/nvidiactl", + HostPath: "/some/root/dev/nvidiactl", + }, + { + Path: "/dev/nvidia-uvm", + HostPath: "/some/root/dev/nvidia-uvm", + }, + { + Path: "/dev/nvidia-uvm-tools", + HostPath: "/some/root/dev/nvidia-uvm-tools", + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + o := &options{ + logger: logger, + devRoot: tc.devRoot, + nvsandboxutilslib: tc.nvsandboxutils, + } + + device, err := devicelib.NewDevice(tc.device) + require.NoError(t, err) + + d, err := o.newNvsandboxutilsDGPUDiscoverer(device) + require.ErrorIs(t, err, tc.expectedError) + + devices, _ := d.Devices() + require.EqualValues(t, tc.expectedDevices, devices) + hooks, _ := d.Hooks() + require.EqualValues(t, tc.expectedHooks, hooks) + mounts, _ := d.Mounts() + require.EqualValues(t, tc.expectedMounts, mounts) + }) + } +} diff --git a/internal/platform-support/dgpu/options.go b/internal/platform-support/dgpu/options.go index 41e4d7a93..2fd1c01bd 100644 --- a/internal/platform-support/dgpu/options.go +++ b/internal/platform-support/dgpu/options.go @@ -19,6 +19,7 @@ package dgpu import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" + "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" ) type options struct { @@ -26,10 +27,13 @@ type options struct { devRoot string nvidiaCDIHookPath string + isMigDevice bool // migCaps stores the MIG capabilities for the system. // If MIG is not available, this is nil. migCaps nvcaps.MigCaps migCapsError error + + nvsandboxutilslib nvsandboxutils.Interface } type Option func(*options) @@ -61,3 +65,10 @@ func WithMIGCaps(migCaps nvcaps.MigCaps) Option { l.migCaps = migCaps } } + +// WithNvsandboxuitilsLib sets the nvsandboxutils library implementation. +func WithNvsandboxuitilsLib(nvsandboxutilslib nvsandboxutils.Interface) Option { + return func(l *options) { + l.nvsandboxutilslib = nvsandboxutilslib + } +} diff --git a/pkg/nvcdi/full-gpu-nvml.go b/pkg/nvcdi/full-gpu-nvml.go index 881d102d2..003515ca8 100644 --- a/pkg/nvcdi/full-gpu-nvml.go +++ b/pkg/nvcdi/full-gpu-nvml.go @@ -72,6 +72,7 @@ func (l *nvmllib) newFullGPUDiscoverer(d device.Device) (discover.Discover, erro dgpu.WithDevRoot(l.devRoot), dgpu.WithLogger(l.logger), dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath), + dgpu.WithNvsandboxuitilsLib(l.nvsandboxutilslib), ) if err != nil { return nil, fmt.Errorf("failed to create device discoverer: %v", err) diff --git a/pkg/nvcdi/mig-device-nvml.go b/pkg/nvcdi/mig-device-nvml.go index 91fe879c2..5c1a504c2 100644 --- a/pkg/nvcdi/mig-device-nvml.go +++ b/pkg/nvcdi/mig-device-nvml.go @@ -55,6 +55,7 @@ func (l *nvmllib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) dgpu.WithDevRoot(l.devRoot), dgpu.WithLogger(l.logger), dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath), + dgpu.WithNvsandboxuitilsLib(l.nvsandboxutilslib), ) if err != nil { return nil, fmt.Errorf("failed to create device discoverer: %v", err)