diff --git a/cmd/nic-configuration-daemon/main.go b/cmd/nic-configuration-daemon/main.go index 0affeb3..f02c775 100644 --- a/cmd/nic-configuration-daemon/main.go +++ b/cmd/nic-configuration-daemon/main.go @@ -69,7 +69,7 @@ func main() { } hostUtils := host.NewHostUtils() - hostManager := host.NewHostManager(nodeName, hostUtils) + hostManager := host.NewHostManager(nodeName, hostUtils, mgr.GetEventRecorderFor("NicDeviceReconciler")) maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace) if err := initNicFwMap(namespace); err != nil { diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index e176daf..1d4f84c 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -52,6 +52,7 @@ const ( NvParamTrue = "1" NvParamLinkTypeInfiniband = "1" NvParamLinkTypeEthernet = "2" + NvParamZero = "0" SriovEnabledParam = "SRIOV_EN" SriovNumOfVfsParam = "NUM_OF_VFS" diff --git a/pkg/host/configvalidation.go b/pkg/host/configvalidation.go index 38d4164..80f268d 100644 --- a/pkg/host/configvalidation.go +++ b/pkg/host/configvalidation.go @@ -16,7 +16,10 @@ limitations under the License. package host import ( + "errors" "fmt" + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/record" "reflect" "slices" "strconv" @@ -51,7 +54,8 @@ type configValidation interface { } type configValidationImpl struct { - utils HostUtils + utils HostUtils + eventRecorder record.EventRecorder } func nvParamLinkTypeFromName(linkType string) string { @@ -82,7 +86,6 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate( template := device.Spec.Configuration.Template secondPortPresent := len(device.Status.Ports) > 1 - pciAddr := device.Status.Ports[0].PCI desiredParameters[consts.SriovEnabledParam] = consts.NvParamFalse desiredParameters[consts.SriovNumOfVfsParam] = "0" @@ -118,25 +121,35 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate( if template.PciPerformanceOptimized.MaxAccOutRead != 0 { desiredParameters[consts.MaxAccOutReadParam] = strconv.Itoa(template.PciPerformanceOptimized.MaxAccOutRead) } else { - // If not specified, use the best parameters for specific PCI gen - pciLinkSpeed, err := v.utils.GetPCILinkSpeed(pciAddr) - if err != nil { - log.Log.Error(err, "failed to get PCI link speed", "pciAddr", pciAddr) - return desiredParameters, err - } - // According to the PRM for Gen5 we need to set to enable the auto managing mode. - // However, we are observing that FW doesn't load after reboot in this case. - // Using max values for generations until fix is available. - if pciLinkSpeed == 16 { // Gen4 - desiredParameters[consts.MaxAccOutReadParam] = "64" - } else if pciLinkSpeed > 16 { // Gen5 - desiredParameters[consts.MaxAccOutReadParam] = "128" + // MAX_ACC_OUT_READ parameter is hidden if ADVANCED_PCI_SETTINGS is disabled + if v.AdvancedPCISettingsEnabled(query) { + values, found := query.DefaultConfig[consts.MaxAccOutReadParam] + if !found { + err := types.IncorrectSpecError( + "Device does not support pci performance nv config parameters") + log.Log.Error(err, "incorrect spec", "device", device.Name, "parameter", consts.MaxAccOutReadParam) + return desiredParameters, err + } + + maxAccOutReadParamDefaultValue := values[len(values)-1] + + // According to the PRM, setting MAX_ACC_OUT_READ to zero enables the auto mode, + // which applies the best suitable optimizations. + // However, there is a bug in certain FW versions, where the zero value is not available. + // In this case, until the fix is available, skipping this parameter and emitting a warning + if maxAccOutReadParamDefaultValue == consts.NvParamZero { + applyDefaultNvConfigValueIfExists(consts.MaxAccOutReadParam, desiredParameters, query) + } else { + warning := fmt.Sprintf("%s nv config parameter does not work properly on this version of FW, skipping it", consts.MaxAccOutReadParam) + if v.eventRecorder != nil { + v.eventRecorder.Event(device, v1.EventTypeWarning, "FirmwareError", warning) + } + log.Log.Error(errors.New(warning), "device", device.Name, "fw version", device.Status.FirmwareVersion) + } } } // maxReadRequest is applied as runtime configuration - } else { - applyDefaultNvConfigValueIfExists(consts.MaxAccOutReadParam, desiredParameters, query) } if template.RoceOptimized != nil && template.RoceOptimized.Enabled { @@ -158,15 +171,6 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate( } // qos settings are applied as runtime configuration - } else { - applyDefaultNvConfigValueIfExists(consts.RoceCcPrioMaskP1Param, desiredParameters, query) - applyDefaultNvConfigValueIfExists(consts.CnpDscpP1Param, desiredParameters, query) - applyDefaultNvConfigValueIfExists(consts.Cnp802pPrioP1Param, desiredParameters, query) - if secondPortPresent { - applyDefaultNvConfigValueIfExists(consts.RoceCcPrioMaskP2Param, desiredParameters, query) - applyDefaultNvConfigValueIfExists(consts.CnpDscpP2Param, desiredParameters, query) - applyDefaultNvConfigValueIfExists(consts.Cnp802pPrioP2Param, desiredParameters, query) - } } if template.GpuDirectOptimized != nil && template.GpuDirectOptimized.Enabled { @@ -183,8 +187,6 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate( log.Log.Error(err, "incorrect spec", "device", device.Name) return desiredParameters, err } - } else { - applyDefaultNvConfigValueIfExists(consts.AtsEnabledParam, desiredParameters, query) } for _, rawParam := range template.RawNvConfig { @@ -333,6 +335,6 @@ func (v *configValidationImpl) CalculateDesiredRuntimeConfig(device *v1alpha1.Ni return maxReadRequestSize, trust, pfc } -func newConfigValidation(utils HostUtils) configValidation { - return &configValidationImpl{utils: utils} +func newConfigValidation(utils HostUtils, eventRecorder record.EventRecorder) configValidation { + return &configValidationImpl{utils: utils, eventRecorder: eventRecorder} } diff --git a/pkg/host/configvalidation_test.go b/pkg/host/configvalidation_test.go index 586130d..945342c 100644 --- a/pkg/host/configvalidation_test.go +++ b/pkg/host/configvalidation_test.go @@ -43,7 +43,7 @@ var _ = Describe("ConfigValidationImpl", func() { }) Describe("ConstructNvParamMapFromTemplate", func() { - It("should return default values if optional config is disabled", func() { + It("should return only linkType and sriovEnabled values if optional config is disabled", func() { device := &v1alpha1.NicDevice{ Spec: v1alpha1.NicDeviceSpec{ Configuration: &v1alpha1.NicDeviceConfigurationSpec{ @@ -62,14 +62,15 @@ var _ = Describe("ConfigValidationImpl", func() { } defaultValues := map[string][]string{ - consts.MaxAccOutReadParam: {"testMaxAccOutRead"}, - consts.RoceCcPrioMaskP1Param: {"testRoceCcP1"}, - consts.CnpDscpP1Param: {"testDscpP1"}, - consts.Cnp802pPrioP1Param: {"test802PrioP1"}, - consts.RoceCcPrioMaskP2Param: {"testRoceCcP2"}, - consts.CnpDscpP2Param: {"testDscpP2"}, - consts.Cnp802pPrioP2Param: {"test802PrioP2"}, - consts.AtsEnabledParam: {"testAts"}, + consts.MaxAccOutReadParam: {"testMaxAccOutRead"}, + consts.RoceCcPrioMaskP1Param: {"testRoceCcP1"}, + consts.CnpDscpP1Param: {"testDscpP1"}, + consts.Cnp802pPrioP1Param: {"test802PrioP1"}, + consts.RoceCcPrioMaskP2Param: {"testRoceCcP2"}, + consts.CnpDscpP2Param: {"testDscpP2"}, + consts.Cnp802pPrioP2Param: {"test802PrioP2"}, + consts.AtsEnabledParam: {"testAts"}, + consts.AdvancedPCISettingsParam: {consts.NvParamTrue}, } query := types.NewNvConfigQuery() query.DefaultConfig = defaultValues @@ -78,62 +79,67 @@ var _ = Describe("ConfigValidationImpl", func() { Expect(err).NotTo(HaveOccurred()) Expect(nvParams).To(HaveKeyWithValue(consts.SriovEnabledParam, consts.NvParamFalse)) Expect(nvParams).To(HaveKeyWithValue(consts.SriovNumOfVfsParam, "0")) - Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "testMaxAccOutRead")) - Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "testRoceCcP1")) - Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "testDscpP1")) - Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "test802PrioP1")) - Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP2Param, "testRoceCcP2")) - Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP2Param, "testDscpP2")) - Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "test802PrioP2")) - Expect(nvParams).To(HaveKeyWithValue(consts.AtsEnabledParam, "testAts")) + Expect(nvParams).NotTo(HaveKeyWithValue(consts.MaxAccOutReadParam, "testMaxAccOutRead")) + Expect(nvParams).NotTo(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "testRoceCcP1")) + Expect(nvParams).NotTo(HaveKeyWithValue(consts.CnpDscpP1Param, "testDscpP1")) + Expect(nvParams).NotTo(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "test802PrioP1")) + Expect(nvParams).NotTo(HaveKeyWithValue(consts.RoceCcPrioMaskP2Param, "testRoceCcP2")) + Expect(nvParams).NotTo(HaveKeyWithValue(consts.CnpDscpP2Param, "testDscpP2")) + Expect(nvParams).NotTo(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "test802PrioP2")) + Expect(nvParams).NotTo(HaveKeyWithValue(consts.AtsEnabledParam, "testAts")) }) - It("should omit parameters for the second port if device is single port", func() { + It("should construct the correct nvparam map with optional optimizations enabled", func() { + mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) + device := &v1alpha1.NicDevice{ Spec: v1alpha1.NicDeviceSpec{ Configuration: &v1alpha1.NicDeviceConfigurationSpec{ Template: &v1alpha1.ConfigurationTemplateSpec{ NumVfs: 0, LinkType: consts.Ethernet, + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + MaxAccOutRead: 1337, + MaxReadRequest: 1339, + }, + GpuDirectOptimized: &v1alpha1.GpuDirectOptimizedSpec{ + Enabled: true, + Env: consts.EnvBaremetal, + }, + RoceOptimized: &v1alpha1.RoceOptimizedSpec{ + Enabled: true, + Qos: &v1alpha1.QosSpec{ + Trust: "testTrust", + PFC: "testPFC", + }, + }, }, }, }, Status: v1alpha1.NicDeviceStatus{ Ports: []v1alpha1.NicDevicePortSpec{ {PCI: "0000:03:00.0"}, + {PCI: "0000:03:00.1"}, }, }, } - defaultValues := map[string][]string{ - consts.MaxAccOutReadParam: {"testMaxAccOutRead"}, - consts.RoceCcPrioMaskP1Param: {"testRoceCcP1"}, - consts.CnpDscpP1Param: {"testDscpP1"}, - consts.Cnp802pPrioP1Param: {"test802PrioP1"}, - consts.RoceCcPrioMaskP2Param: {"testRoceCcP2"}, - consts.CnpDscpP2Param: {"testDscpP2"}, - consts.Cnp802pPrioP2Param: {"test802PrioP2"}, - consts.AtsEnabledParam: {"testAts"}, - } + query := types.NewNvConfigQuery() - query.DefaultConfig = defaultValues nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query) Expect(err).NotTo(HaveOccurred()) - Expect(nvParams).To(HaveKeyWithValue(consts.SriovEnabledParam, consts.NvParamFalse)) - Expect(nvParams).To(HaveKeyWithValue(consts.SriovNumOfVfsParam, "0")) - Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "testMaxAccOutRead")) - Expect(nvParams).To(HaveKeyWithValue(consts.AtsEnabledParam, "testAts")) - Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "testRoceCcP1")) - Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "testDscpP1")) - Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "test802PrioP1")) - Expect(nvParams).To(Not(HaveKey(consts.RoceCcPrioMaskP2Param))) - Expect(nvParams).To(Not(HaveKey(consts.CnpDscpP2Param))) - Expect(nvParams).To(Not(HaveKey(consts.Cnp802pPrioP2Param))) + Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "1337")) + Expect(nvParams).To(HaveKeyWithValue(consts.AtsEnabledParam, "0")) + Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "255")) + Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "4")) + Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "6")) + Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP2Param, "255")) + Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP2Param, "4")) + Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "6")) }) - It("should construct the correct nvparam map with optional optimizations enabled", func() { - mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) - + It("should omit parameters for the second port if device is single port", func() { device := &v1alpha1.NicDevice{ Spec: v1alpha1.NicDeviceSpec{ Configuration: &v1alpha1.NicDeviceConfigurationSpec{ @@ -162,12 +168,22 @@ var _ = Describe("ConfigValidationImpl", func() { Status: v1alpha1.NicDeviceStatus{ Ports: []v1alpha1.NicDevicePortSpec{ {PCI: "0000:03:00.0"}, - {PCI: "0000:03:00.1"}, }, }, } + defaultValues := map[string][]string{ + consts.MaxAccOutReadParam: {"0"}, + consts.RoceCcPrioMaskP1Param: {"testRoceCcP1"}, + consts.CnpDscpP1Param: {"testDscpP1"}, + consts.Cnp802pPrioP1Param: {"test802PrioP1"}, + consts.RoceCcPrioMaskP2Param: {"testRoceCcP2"}, + consts.CnpDscpP2Param: {"testDscpP2"}, + consts.Cnp802pPrioP2Param: {"test802PrioP2"}, + consts.AtsEnabledParam: {"testAts"}, + } query := types.NewNvConfigQuery() + query.DefaultConfig = defaultValues nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query) Expect(err).NotTo(HaveOccurred()) @@ -176,12 +192,12 @@ var _ = Describe("ConfigValidationImpl", func() { Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "255")) Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "4")) Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "6")) - Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP2Param, "255")) - Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP2Param, "4")) - Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "6")) + Expect(nvParams).ToNot(HaveKeyWithValue(consts.RoceCcPrioMaskP2Param, "255")) + Expect(nvParams).ToNot(HaveKeyWithValue(consts.CnpDscpP2Param, "4")) + Expect(nvParams).ToNot(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "6")) }) - It("should return the correct MaxAccOutRead param for PCIgen4", func() { + It("should skip the MaxAccOutRead if the default is not 0", func() { mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) device := &v1alpha1.NicDevice{ @@ -203,15 +219,19 @@ var _ = Describe("ConfigValidationImpl", func() { }, } + defaultValues := map[string][]string{ + consts.MaxAccOutReadParam: {"notZero"}, + } query := types.NewNvConfigQuery() + query.DefaultConfig = defaultValues nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query) Expect(err).NotTo(HaveOccurred()) - Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "64")) + Expect(nvParams).NotTo(HaveKeyWithValue(consts.MaxAccOutReadParam, consts.NvParamZero)) }) - It("should return the correct MaxAccOutRead param for PCIgen5", func() { - mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(64, nil) + It("should apply MaxAccOutRead if the default is 0", func() { + mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) device := &v1alpha1.NicDevice{ Spec: v1alpha1.NicDeviceSpec{ @@ -232,12 +252,51 @@ var _ = Describe("ConfigValidationImpl", func() { }, } + defaultValues := map[string][]string{ + consts.MaxAccOutReadParam: {consts.NvParamZero}, + } + currentValues := map[string][]string{ + consts.AdvancedPCISettingsParam: {consts.NvParamTrue}, + } query := types.NewNvConfigQuery() + query.DefaultConfig = defaultValues + query.CurrentConfig = currentValues nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query) Expect(err).NotTo(HaveOccurred()) - Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "128")) + Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, consts.NvParamZero)) }) + + It("should not apply MaxAccOutRead if the default is unavailable", func() { + mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) + + device := &v1alpha1.NicDevice{ + Spec: v1alpha1.NicDeviceSpec{ + Configuration: &v1alpha1.NicDeviceConfigurationSpec{ + Template: &v1alpha1.ConfigurationTemplateSpec{ + NumVfs: 0, + LinkType: consts.Ethernet, + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + }, + }, + }, + }, + Status: v1alpha1.NicDeviceStatus{ + Ports: []v1alpha1.NicDevicePortSpec{ + {PCI: "0000:03:00.0"}, + }, + }, + } + + // MAX_ACC_OUT_READ param is unavailable if ADVANCED_PCI_SETTINGS is disabled + query := types.NewNvConfigQuery() + + nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query) + Expect(err).NotTo(HaveOccurred()) + Expect(nvParams).ToNot(HaveKeyWithValue(consts.MaxAccOutReadParam, consts.NvParamZero)) + }) + It("should return an error when GpuOptimized is enabled without PciPerformanceOptimized", func() { mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil) @@ -483,6 +542,9 @@ var _ = Describe("ConfigValidationImpl", func() { Template: &v1alpha1.ConfigurationTemplateSpec{ NumVfs: 0, LinkType: consts.Ethernet, + PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{ + Enabled: true, + }, }, }, }, @@ -495,30 +557,20 @@ var _ = Describe("ConfigValidationImpl", func() { } defaultValues := map[string][]string{ - consts.MaxAccOutReadParam: {"testMaxAccOutRead", "1"}, - consts.RoceCcPrioMaskP1Param: {"testRoceCcP1", "2"}, - consts.CnpDscpP1Param: {"testDscpP1", "3"}, - consts.Cnp802pPrioP1Param: {"test802PrioP1", "4"}, - consts.RoceCcPrioMaskP2Param: {"testRoceCcP2", "5"}, - consts.CnpDscpP2Param: {"testDscpP2", "6"}, - consts.Cnp802pPrioP2Param: {"test802PrioP2", "7"}, - consts.AtsEnabledParam: {"testAts", "8"}, + consts.MaxAccOutReadParam: {"testMaxAccOutRead", "0"}, + } + currentValues := map[string][]string{ + consts.AdvancedPCISettingsParam: {"testAdvancedPCISettings", "1"}, } query := types.NewNvConfigQuery() query.DefaultConfig = defaultValues + query.CurrentConfig = currentValues nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query) Expect(err).NotTo(HaveOccurred()) Expect(nvParams).To(HaveKeyWithValue(consts.SriovEnabledParam, consts.NvParamFalse)) Expect(nvParams).To(HaveKeyWithValue(consts.SriovNumOfVfsParam, "0")) - Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "1")) - Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "2")) - Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "3")) - Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "4")) - Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP2Param, "5")) - Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP2Param, "6")) - Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "7")) - Expect(nvParams).To(HaveKeyWithValue(consts.AtsEnabledParam, "8")) + Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "0")) }) }) diff --git a/pkg/host/host.go b/pkg/host/host.go index 231d4be..7309b32 100644 --- a/pkg/host/host.go +++ b/pkg/host/host.go @@ -18,6 +18,7 @@ package host import ( "context" "fmt" + "k8s.io/client-go/tools/record" "slices" "strconv" "strings" @@ -327,6 +328,6 @@ func (h hostManager) DiscoverOfedVersion() (string, error) { return h.hostUtils.GetOfedVersion() } -func NewHostManager(nodeName string, hostUtils HostUtils) HostManager { - return hostManager{nodeName: nodeName, hostUtils: hostUtils, configValidation: newConfigValidation(hostUtils)} +func NewHostManager(nodeName string, hostUtils HostUtils, eventRecorder record.EventRecorder) HostManager { + return hostManager{nodeName: nodeName, hostUtils: hostUtils, configValidation: newConfigValidation(hostUtils, eventRecorder)} }