Skip to content

Commit

Permalink
fix: Skip MAX_ACC_OUT_READ on the broken FW versions
Browse files Browse the repository at this point in the history
According to the PRM, setting MAX_ACC_OUT_READ to zero enables the auto mode,
which applies the best suitable optimizations.
However, there is a bug in certain FW versions, where the zero value is not available.
In this case, until the fix is available, skipping this parameter and emitting a warning

Signed-off-by: Alexander Maslennikov <[email protected]>
  • Loading branch information
almaslennikov committed Nov 1, 2024
1 parent de1a58e commit 579080d
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 45 deletions.
2 changes: 1 addition & 1 deletion cmd/nic-configuration-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func main() {
}

hostUtils := host.NewHostUtils()
hostManager := host.NewHostManager(nodeName, hostUtils)
hostManager := host.NewHostManager(nodeName, hostUtils, mgr.GetEventRecorderFor("NicDeviceReconciler"))
maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace)

if err := initNicFwMap(namespace); err != nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
NvParamTrue = "1"
NvParamLinkTypeInfiniband = "1"
NvParamLinkTypeEthernet = "2"
NvParamZero = "0"

SriovEnabledParam = "SRIOV_EN"
SriovNumOfVfsParam = "NUM_OF_VFS"
Expand Down
48 changes: 32 additions & 16 deletions pkg/host/configvalidation.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ limitations under the License.
package host

import (
"errors"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
"reflect"
"slices"
"strconv"
Expand Down Expand Up @@ -51,7 +54,8 @@ type configValidation interface {
}

type configValidationImpl struct {
utils HostUtils
utils HostUtils
eventRecorder record.EventRecorder
}

func nvParamLinkTypeFromName(linkType string) string {
Expand Down Expand Up @@ -82,7 +86,6 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate(

template := device.Spec.Configuration.Template
secondPortPresent := len(device.Status.Ports) > 1
pciAddr := device.Status.Ports[0].PCI

desiredParameters[consts.SriovEnabledParam] = consts.NvParamFalse
desiredParameters[consts.SriovNumOfVfsParam] = "0"
Expand Down Expand Up @@ -118,22 +121,35 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate(
if template.PciPerformanceOptimized.MaxAccOutRead != 0 {
desiredParameters[consts.MaxAccOutReadParam] = strconv.Itoa(template.PciPerformanceOptimized.MaxAccOutRead)
} else {
// If not specified, use the best parameters for specific PCI gen
pciLinkSpeed, err := v.utils.GetPCILinkSpeed(pciAddr)
if err != nil {
log.Log.Error(err, "failed to get PCI link speed", "pciAddr", pciAddr)
return desiredParameters, err
}
if pciLinkSpeed == 16 { // Gen4
desiredParameters[consts.MaxAccOutReadParam] = "44"
} else if pciLinkSpeed > 16 { // Gen5
desiredParameters[consts.MaxAccOutReadParam] = "0"
// MAX_ACC_OUT_READ parameter is hidden if ADVANCED_PCI_SETTINGS is disabled
if v.AdvancedPCISettingsEnabled(query) {
values, found := query.DefaultConfig[consts.MaxAccOutReadParam]
if !found {
err := types.IncorrectSpecError(
"Device does not support pci performance nv config parameters")
log.Log.Error(err, "incorrect spec", "device", device.Name, "parameter", consts.MaxAccOutReadParam)
return desiredParameters, err
}

maxAccOutReadParamDefaultValue := values[len(values)-1]

// According to the PRM, setting MAX_ACC_OUT_READ to zero enables the auto mode,
// which applies the best suitable optimizations.
// However, there is a bug in certain FW versions, where the zero value is not available.
// In this case, until the fix is available, skipping this parameter and emitting a warning
if maxAccOutReadParamDefaultValue == consts.NvParamZero {
applyDefaultNvConfigValueIfExists(consts.MaxAccOutReadParam, desiredParameters, query)
} else {
warning := fmt.Sprintf("%s nv config parameter does not work properly on this version of FW, skipping it", consts.MaxAccOutReadParam)
if v.eventRecorder != nil {
v.eventRecorder.Event(device, v1.EventTypeWarning, "FirmwareError", warning)
}
log.Log.Error(errors.New(warning), "device", device.Name, "fw version", device.Status.FirmwareVersion)
}
}
}

// maxReadRequest is applied as runtime configuration
} else {
applyDefaultNvConfigValueIfExists(consts.MaxAccOutReadParam, desiredParameters, query)
}

if template.RoceOptimized != nil && template.RoceOptimized.Enabled {
Expand Down Expand Up @@ -330,6 +346,6 @@ func (v *configValidationImpl) CalculateDesiredRuntimeConfig(device *v1alpha1.Ni
return maxReadRequestSize, trust, pfc
}

func newConfigValidation(utils HostUtils) configValidation {
return &configValidationImpl{utils: utils}
func newConfigValidation(utils HostUtils, eventRecorder record.EventRecorder) configValidation {
return &configValidationImpl{utils: utils, eventRecorder: eventRecorder}
}
83 changes: 57 additions & 26 deletions pkg/host/configvalidation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ var _ = Describe("ConfigValidationImpl", func() {
}

defaultValues := map[string][]string{
consts.MaxAccOutReadParam: {"testMaxAccOutRead"},
consts.RoceCcPrioMaskP1Param: {"testRoceCcP1"},
consts.CnpDscpP1Param: {"testDscpP1"},
consts.Cnp802pPrioP1Param: {"test802PrioP1"},
Expand All @@ -73,12 +72,10 @@ var _ = Describe("ConfigValidationImpl", func() {
}
query := types.NewNvConfigQuery()
query.DefaultConfig = defaultValues

nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query)
Expect(err).NotTo(HaveOccurred())
Expect(nvParams).To(HaveKeyWithValue(consts.SriovEnabledParam, consts.NvParamFalse))
Expect(nvParams).To(HaveKeyWithValue(consts.SriovNumOfVfsParam, "0"))
Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "testMaxAccOutRead"))
Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "testRoceCcP1"))
Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "testDscpP1"))
Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "test802PrioP1"))
Expand All @@ -105,7 +102,6 @@ var _ = Describe("ConfigValidationImpl", func() {
},
}
defaultValues := map[string][]string{
consts.MaxAccOutReadParam: {"testMaxAccOutRead"},
consts.RoceCcPrioMaskP1Param: {"testRoceCcP1"},
consts.CnpDscpP1Param: {"testDscpP1"},
consts.Cnp802pPrioP1Param: {"test802PrioP1"},
Expand All @@ -121,7 +117,6 @@ var _ = Describe("ConfigValidationImpl", func() {
Expect(err).NotTo(HaveOccurred())
Expect(nvParams).To(HaveKeyWithValue(consts.SriovEnabledParam, consts.NvParamFalse))
Expect(nvParams).To(HaveKeyWithValue(consts.SriovNumOfVfsParam, "0"))
Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "testMaxAccOutRead"))
Expect(nvParams).To(HaveKeyWithValue(consts.AtsEnabledParam, "testAts"))
Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "testRoceCcP1"))
Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "testDscpP1"))
Expand Down Expand Up @@ -181,7 +176,7 @@ var _ = Describe("ConfigValidationImpl", func() {
Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "6"))
})

It("should return the correct MaxAccOutRead param for PCIgen4", func() {
It("should skip the MaxAccOutRead if the default is not 0", func() {
mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil)

device := &v1alpha1.NicDevice{
Expand All @@ -203,15 +198,19 @@ var _ = Describe("ConfigValidationImpl", func() {
},
}

defaultValues := map[string][]string{
consts.MaxAccOutReadParam: {"notZero"},
}
query := types.NewNvConfigQuery()
query.DefaultConfig = defaultValues

nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query)
Expect(err).NotTo(HaveOccurred())
Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "44"))
Expect(nvParams).NotTo(HaveKeyWithValue(consts.MaxAccOutReadParam, consts.NvParamZero))
})

It("should return the correct MaxAccOutRead param for PCIgen5", func() {
mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(64, nil)
It("should apply MaxAccOutRead if the default is 0", func() {
mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil)

device := &v1alpha1.NicDevice{
Spec: v1alpha1.NicDeviceSpec{
Expand All @@ -232,12 +231,51 @@ var _ = Describe("ConfigValidationImpl", func() {
},
}

defaultValues := map[string][]string{
consts.MaxAccOutReadParam: {consts.NvParamZero},
}
currentValues := map[string][]string{
consts.AdvancedPCISettingsParam: {consts.NvParamTrue},
}
query := types.NewNvConfigQuery()
query.DefaultConfig = defaultValues
query.CurrentConfig = currentValues

nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query)
Expect(err).NotTo(HaveOccurred())
Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "0"))
Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, consts.NvParamZero))
})

It("should not apply MaxAccOutRead if the default is unavailable", func() {
mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil)

device := &v1alpha1.NicDevice{
Spec: v1alpha1.NicDeviceSpec{
Configuration: &v1alpha1.NicDeviceConfigurationSpec{
Template: &v1alpha1.ConfigurationTemplateSpec{
NumVfs: 0,
LinkType: consts.Ethernet,
PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{
Enabled: true,
},
},
},
},
Status: v1alpha1.NicDeviceStatus{
Ports: []v1alpha1.NicDevicePortSpec{
{PCI: "0000:03:00.0"},
},
},
}

// MAX_ACC_OUT_READ param is unavailable if ADVANCED_PCI_SETTINGS is disabled
query := types.NewNvConfigQuery()

nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query)
Expect(err).NotTo(HaveOccurred())
Expect(nvParams).ToNot(HaveKeyWithValue(consts.MaxAccOutReadParam, consts.NvParamZero))
})

It("should return an error when GpuOptimized is enabled without PciPerformanceOptimized", func() {
mockHostUtils.On("GetPCILinkSpeed", mock.Anything).Return(16, nil)

Expand Down Expand Up @@ -483,6 +521,9 @@ var _ = Describe("ConfigValidationImpl", func() {
Template: &v1alpha1.ConfigurationTemplateSpec{
NumVfs: 0,
LinkType: consts.Ethernet,
PciPerformanceOptimized: &v1alpha1.PciPerformanceOptimizedSpec{
Enabled: true,
},
},
},
},
Expand All @@ -495,30 +536,20 @@ var _ = Describe("ConfigValidationImpl", func() {
}

defaultValues := map[string][]string{
consts.MaxAccOutReadParam: {"testMaxAccOutRead", "1"},
consts.RoceCcPrioMaskP1Param: {"testRoceCcP1", "2"},
consts.CnpDscpP1Param: {"testDscpP1", "3"},
consts.Cnp802pPrioP1Param: {"test802PrioP1", "4"},
consts.RoceCcPrioMaskP2Param: {"testRoceCcP2", "5"},
consts.CnpDscpP2Param: {"testDscpP2", "6"},
consts.Cnp802pPrioP2Param: {"test802PrioP2", "7"},
consts.AtsEnabledParam: {"testAts", "8"},
consts.MaxAccOutReadParam: {"testMaxAccOutRead", "0"},
}
currentValues := map[string][]string{
consts.AdvancedPCISettingsParam: {"testAdvancedPCISettings", "1"},
}
query := types.NewNvConfigQuery()
query.DefaultConfig = defaultValues
query.CurrentConfig = currentValues

nvParams, err := validator.ConstructNvParamMapFromTemplate(device, query)
Expect(err).NotTo(HaveOccurred())
Expect(nvParams).To(HaveKeyWithValue(consts.SriovEnabledParam, consts.NvParamFalse))
Expect(nvParams).To(HaveKeyWithValue(consts.SriovNumOfVfsParam, "0"))
Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "1"))
Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP1Param, "2"))
Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP1Param, "3"))
Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP1Param, "4"))
Expect(nvParams).To(HaveKeyWithValue(consts.RoceCcPrioMaskP2Param, "5"))
Expect(nvParams).To(HaveKeyWithValue(consts.CnpDscpP2Param, "6"))
Expect(nvParams).To(HaveKeyWithValue(consts.Cnp802pPrioP2Param, "7"))
Expect(nvParams).To(HaveKeyWithValue(consts.AtsEnabledParam, "8"))
Expect(nvParams).To(HaveKeyWithValue(consts.MaxAccOutReadParam, "0"))
})
})

Expand Down
5 changes: 3 additions & 2 deletions pkg/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package host
import (
"context"
"fmt"
"k8s.io/client-go/tools/record"
"slices"
"strconv"
"strings"
Expand Down Expand Up @@ -327,6 +328,6 @@ func (h hostManager) DiscoverOfedVersion() (string, error) {
return h.hostUtils.GetOfedVersion()
}

func NewHostManager(nodeName string, hostUtils HostUtils) HostManager {
return hostManager{nodeName: nodeName, hostUtils: hostUtils, configValidation: newConfigValidation(hostUtils)}
func NewHostManager(nodeName string, hostUtils HostUtils, eventRecorder record.EventRecorder) HostManager {
return hostManager{nodeName: nodeName, hostUtils: hostUtils, configValidation: newConfigValidation(hostUtils, eventRecorder)}
}

0 comments on commit 579080d

Please sign in to comment.