From 158b4f01ea095c5a0a25cb55575bcf6ea59ce3a1 Mon Sep 17 00:00:00 2001 From: Alexander Maslennikov Date: Tue, 5 Nov 2024 17:30:44 +0100 Subject: [PATCH] fix: validate if nv config update applied after reboot Signed-off-by: Alexander Maslennikov --- cmd/nic-configuration-daemon/main.go | 5 +- internal/controller/nicdevice_controller.go | 54 +++++++++++- .../controller/nicdevice_controller_test.go | 84 ++++++++++++++++++- pkg/consts/consts.go | 3 + pkg/host/mocks/HostUtils.go | 30 +++++++ pkg/host/utils.go | 17 ++++ 6 files changed, 188 insertions(+), 5 deletions(-) diff --git a/cmd/nic-configuration-daemon/main.go b/cmd/nic-configuration-daemon/main.go index f02c775..5ee80c5 100644 --- a/cmd/nic-configuration-daemon/main.go +++ b/cmd/nic-configuration-daemon/main.go @@ -68,8 +68,10 @@ func main() { } } + eventRecorder := mgr.GetEventRecorderFor("NicDeviceReconciler") + hostUtils := host.NewHostUtils() - hostManager := host.NewHostManager(nodeName, hostUtils, mgr.GetEventRecorderFor("NicDeviceReconciler")) + hostManager := host.NewHostManager(nodeName, hostUtils, eventRecorder) maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace) if err := initNicFwMap(namespace); err != nil { @@ -90,6 +92,7 @@ func main() { NamespaceName: namespace, HostManager: hostManager, MaintenanceManager: maintenanceManager, + EventRecorder: eventRecorder, } err = nicDeviceReconciler.SetupWithManager(mgr, true) if err != nil { diff --git a/internal/controller/nicdevice_controller.go b/internal/controller/nicdevice_controller.go index adee848..b427f1e 100644 --- a/internal/controller/nicdevice_controller.go +++ b/internal/controller/nicdevice_controller.go @@ -17,9 +17,13 @@ package controller import ( "context" + "errors" "sync" "time" + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/record" + maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -55,7 +59,10 @@ type NicDeviceReconciler struct { NamespaceName string HostManager host.HostManager + HostUtils host.HostUtils MaintenanceManager maintenance.MaintenanceManager + + EventRecorder record.EventRecorder } type nicDeviceConfigurationStatuses []*nicDeviceConfigurationStatus @@ -432,16 +439,57 @@ func (r *NicDeviceReconciler) handleSpecValidation(ctx context.Context, statuses } } } + + status.nvConfigUpdateRequired = nvConfigUpdateRequired + status.rebootRequired = rebootRequired + if nvConfigUpdateRequired { log.Log.V(2).Info("update started for device", "device", status.device.Name) err = r.updateDeviceStatusCondition(ctx, status.device, consts.UpdateStartedReason, metav1.ConditionTrue, "") if err != nil { status.lastStageError = err } - } + } else if rebootRequired { + // There might be a case where FW config didn't apply after a reboot because of some error in FW. In this case + // we don't want the node to be kept in a reboot loop (FW configured -> reboot -> Config was not applied -> FW configured -> etc.). + // To break the reboot loop, we should compare the last time the status was changed to PendingReboot to the node's uptime. + // If the node started after the status was changed, we assume the node was rebooted and the config couldn't apply. + // In this case, we indicate the error to the user with the status change and emit an error event. + statusCondition := meta.FindStatusCondition(status.device.Status.Conditions, consts.ConfigUpdateInProgressCondition) + if statusCondition == nil { + return + } - status.nvConfigUpdateRequired = nvConfigUpdateRequired - status.rebootRequired = rebootRequired + switch statusCondition.Reason { + case consts.PendingRebootReason: + // We need to determine, whether a reboot has happened since the PendingReboot status has been set + uptime, err := r.HostUtils.GetHostUptimeSeconds() + if err != nil { + status.lastStageError = err + return + } + + sinceStatusUpdate := time.Since(statusCondition.LastTransitionTime.Time) + + // If more time has passed since boot than since the status update, the reboot hasn't happened yet + if uptime > sinceStatusUpdate { + return + } + + log.Log.Info("nv config failed to update after reboot for device", "device", status.device.Name) + r.EventRecorder.Event(status.device, v1.EventTypeWarning, consts.FirmwareError, consts.FwConfigNotAppliedAfterRebootErrorMsg) + err = r.updateDeviceStatusCondition(ctx, status.device, consts.FirmwareError, metav1.ConditionFalse, consts.FwConfigNotAppliedAfterRebootErrorMsg) + if err != nil { + status.lastStageError = err + } + + fallthrough + case consts.FirmwareError: + status.nvConfigUpdateRequired = false + status.rebootRequired = false + status.lastStageError = errors.New(consts.FwConfigNotAppliedAfterRebootErrorMsg) + } + } }(i) } diff --git a/internal/controller/nicdevice_controller_test.go b/internal/controller/nicdevice_controller_test.go index ccd9a0e..e222447 100644 --- a/internal/controller/nicdevice_controller_test.go +++ b/internal/controller/nicdevice_controller_test.go @@ -21,6 +21,8 @@ import ( "sync" "time" + "k8s.io/apimachinery/pkg/api/meta" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/stretchr/testify/mock" @@ -52,6 +54,7 @@ var _ = Describe("NicDeviceReconciler", func() { reconciler *NicDeviceReconciler hostManager *hostMocks.HostManager maintenanceManager *maintenanceMocks.MaintenanceManager + hostUtils *hostMocks.HostUtils nodeName = "test-node" deviceName = "test-device" ctx context.Context @@ -99,6 +102,7 @@ var _ = Describe("NicDeviceReconciler", func() { deviceDiscoveryReconcileTime = 1 * time.Second hostManager = &hostMocks.HostManager{} maintenanceManager = &maintenanceMocks.MaintenanceManager{} + hostUtils = &hostMocks.HostUtils{} reconciler = &NicDeviceReconciler{ Client: mgr.GetClient(), @@ -107,6 +111,8 @@ var _ = Describe("NicDeviceReconciler", func() { NamespaceName: namespaceName, HostManager: hostManager, MaintenanceManager: maintenanceManager, + HostUtils: hostUtils, + EventRecorder: mgr.GetEventRecorderFor("testReconciler"), } Expect(reconciler.SetupWithManager(mgr, false)).To(Succeed()) }) @@ -212,7 +218,7 @@ var _ = Describe("NicDeviceReconciler", func() { }) Describe("reconcile a single device", func() { - var createDevice = func(setLastSpecAnnotation bool) { + var createDevice = func(setLastSpecAnnotation bool) *v1alpha1.NicDevice { device := &v1alpha1.NicDevice{ ObjectMeta: metav1.ObjectMeta{Name: deviceName, Namespace: namespaceName}, Spec: v1alpha1.NicDeviceSpec{ @@ -245,6 +251,8 @@ var _ = Describe("NicDeviceReconciler", func() { }}, } Expect(k8sClient.Status().Update(ctx, device)).To(Succeed()) + + return device } It("Should not reconcile device not from its node", func() { @@ -512,6 +520,80 @@ var _ = Describe("NicDeviceReconciler", func() { hostManager.AssertExpectations(GinkgoT()) maintenanceManager.AssertExpectations(GinkgoT()) }) + It("Should not request another reboot if nv config failed to apply after the first one", func() { + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(false, true, nil) + hostUtils.On("GetHostUptimeSeconds").Return(time.Second*0, nil) + + device := createDevice(false) + + cond := metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + ObservedGeneration: device.Generation, + Reason: consts.PendingRebootReason, + Message: "", + } + meta.SetStatusCondition(&device.Status.Conditions, cond) + Expect(k8sClient.Status().Update(ctx, device)).To(Succeed()) + + startManager() + + Eventually(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: consts.FirmwareError, + Message: consts.FwConfigNotAppliedAfterRebootErrorMsg, + })) + + hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceRuntimeSpec", mock.Anything) + maintenanceManager.AssertNotCalled(GinkgoT(), "ScheduleMaintenance", mock.Anything) + maintenanceManager.AssertNotCalled(GinkgoT(), "MaintenanceAllowed", mock.Anything) + hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceNvSpec", mock.Anything, mock.Anything) + maintenanceManager.AssertNotCalled(GinkgoT(), "Reboot") + maintenanceManager.AssertNotCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything) + hostManager.AssertExpectations(GinkgoT()) + maintenanceManager.AssertExpectations(GinkgoT()) + }) + + It("Should not fail on an not applied nv config when reboot hasn't happened yet", func() { + hostManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(false, true, nil) + hostUtils.On("GetHostUptimeSeconds").Return(time.Second*1000, nil) + + device := createDevice(false) + + cond := metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + ObservedGeneration: device.Generation, + Reason: consts.PendingRebootReason, + Message: "", + } + meta.SetStatusCondition(&device.Status.Conditions, cond) + Expect(k8sClient.Status().Update(ctx, device)).To(Succeed()) + + startManager() + + Consistently(func() []metav1.Condition { + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + return device.Status.Conditions + }, timeout).ShouldNot(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionFalse, + Reason: consts.FirmwareError, + Message: consts.FwConfigNotAppliedAfterRebootErrorMsg, + })) + + hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceRuntimeSpec", mock.Anything) + hostManager.AssertNotCalled(GinkgoT(), "ApplyDeviceNvSpec", mock.Anything, mock.Anything) + maintenanceManager.AssertNotCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything) + hostManager.AssertExpectations(GinkgoT()) + maintenanceManager.AssertExpectations(GinkgoT()) + }) }) Describe("reconcile multiple devices", func() { var ( diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 1d4f84c..f740f2a 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -30,6 +30,7 @@ const ( RuntimeConfigUpdateFailedReason = "RuntimeConfigUpdateFailed" UpdateSuccessfulReason = "UpdateSuccessful" SpecValidationFailed = "SpecValidationFailed" + FirmwareError = "FirmwareError" DeviceConfigSpecEmptyReason = "DeviceConfigSpecEmpty" DeviceFwMatchReason = "DeviceFirmwareConfigMatch" @@ -79,4 +80,6 @@ const ( SupportedNicFirmwareConfigmap = "supported-nic-firmware" Mlx5ModuleVersionPath = "/sys/bus/pci/drivers/mlx5_core/module/version" + + FwConfigNotAppliedAfterRebootErrorMsg = "firmware configuration failed to apply after reboot" ) diff --git a/pkg/host/mocks/HostUtils.go b/pkg/host/mocks/HostUtils.go index d4a870a..c011ee8 100644 --- a/pkg/host/mocks/HostUtils.go +++ b/pkg/host/mocks/HostUtils.go @@ -9,6 +9,8 @@ import ( pci "github.com/jaypipes/ghw/pkg/pci" + time "time" + types "github.com/Mellanox/nic-configuration-operator/pkg/types" ) @@ -52,6 +54,34 @@ func (_m *HostUtils) GetFirmwareVersionAndPSID(pciAddr string) (string, string, return r0, r1, r2 } +// GetHostUptimeSeconds provides a mock function with given fields: +func (_m *HostUtils) GetHostUptimeSeconds() (time.Duration, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetHostUptimeSeconds") + } + + var r0 time.Duration + var r1 error + if rf, ok := ret.Get(0).(func() (time.Duration, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() time.Duration); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(time.Duration) + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // GetInterfaceName provides a mock function with given fields: pciAddr func (_m *HostUtils) GetInterfaceName(pciAddr string) string { ret := _m.Called(pciAddr) diff --git a/pkg/host/utils.go b/pkg/host/utils.go index 455d5f7..702656b 100644 --- a/pkg/host/utils.go +++ b/pkg/host/utils.go @@ -25,6 +25,7 @@ import ( "strconv" "strings" "syscall" + "time" "github.com/Mellanox/rdmamap" "github.com/jaypipes/ghw" @@ -80,6 +81,8 @@ type HostUtils interface { ScheduleReboot() error // GetOfedVersion retrieves installed OFED version GetOfedVersion() string + // GetHostUptimeSeconds returns the host uptime in seconds + GetHostUptimeSeconds() (time.Duration, error) } type hostUtils struct { @@ -630,6 +633,20 @@ func (h *hostUtils) GetOfedVersion() string { return version } +// GetHostUptimeSeconds returns the host uptime in seconds +func (h *hostUtils) GetHostUptimeSeconds() (time.Duration, error) { + log.Log.V(2).Info("HostUtils.GetHostUptimeSeconds()") + output, err := os.ReadFile("/proc/uptime") + if err != nil { + log.Log.Error(err, "HostUtils.GetHostUptimeSeconds(): failed to read the system's uptime") + return 0, err + } + uptimeStr := strings.Split(string(output), " ")[0] + uptimeSeconds, _ := strconv.ParseFloat(uptimeStr, 64) + + return time.Duration(uptimeSeconds) * time.Second, nil +} + func NewHostUtils() HostUtils { return &hostUtils{execInterface: execUtils.New()} }