Skip to content

Commit

Permalink
fix: Skip MAX_ACC_OUT_READ on the broken FW, not set the default valu…
Browse files Browse the repository at this point in the history
…es if optimizations are disabled

1) According to the PRM, setting MAX_ACC_OUT_READ to zero enables the auto mode,
which applies the best suitable optimizations.
However, there is a bug in certain FW versions, where the zero value is not available.
In this case, until the fix is available, skipping this parameter and emitting a warning

2) Don't set the default values for PCI, RoCE and GPU optimizations
if they are not enabled in spec

Signed-off-by: Alexander Maslennikov <[email protected]>
  • Loading branch information
almaslennikov committed Nov 1, 2024
1 parent 0181723 commit f60c301
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 100 deletions.
2 changes: 1 addition & 1 deletion cmd/nic-configuration-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func main() {
}

hostUtils := host.NewHostUtils()
hostManager := host.NewHostManager(nodeName, hostUtils)
hostManager := host.NewHostManager(nodeName, hostUtils, mgr.GetEventRecorderFor("NicDeviceReconciler"))
maintenanceManager := maintenance.New(mgr.GetClient(), hostUtils, nodeName, namespace)

if err := initNicFwMap(namespace); err != nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
NvParamTrue = "1"
NvParamLinkTypeInfiniband = "1"
NvParamLinkTypeEthernet = "2"
NvParamZero = "0"

SriovEnabledParam = "SRIOV_EN"
SriovNumOfVfsParam = "NUM_OF_VFS"
Expand Down
62 changes: 32 additions & 30 deletions pkg/host/configvalidation.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ limitations under the License.
package host

import (
"errors"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
"reflect"
"slices"
"strconv"
Expand Down Expand Up @@ -51,7 +54,8 @@ type configValidation interface {
}

type configValidationImpl struct {
utils HostUtils
utils HostUtils
eventRecorder record.EventRecorder
}

func nvParamLinkTypeFromName(linkType string) string {
Expand Down Expand Up @@ -82,7 +86,6 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate(

template := device.Spec.Configuration.Template
secondPortPresent := len(device.Status.Ports) > 1
pciAddr := device.Status.Ports[0].PCI

desiredParameters[consts.SriovEnabledParam] = consts.NvParamFalse
desiredParameters[consts.SriovNumOfVfsParam] = "0"
Expand Down Expand Up @@ -118,25 +121,35 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate(
if template.PciPerformanceOptimized.MaxAccOutRead != 0 {
desiredParameters[consts.MaxAccOutReadParam] = strconv.Itoa(template.PciPerformanceOptimized.MaxAccOutRead)
} else {
// If not specified, use the best parameters for specific PCI gen
pciLinkSpeed, err := v.utils.GetPCILinkSpeed(pciAddr)
if err != nil {
log.Log.Error(err, "failed to get PCI link speed", "pciAddr", pciAddr)
return desiredParameters, err
}
// According to the PRM for Gen5 we need to set to enable the auto managing mode.
// However, we are observing that FW doesn't load after reboot in this case.
// Using max values for generations until fix is available.
if pciLinkSpeed == 16 { // Gen4
desiredParameters[consts.MaxAccOutReadParam] = "64"
} else if pciLinkSpeed > 16 { // Gen5
desiredParameters[consts.MaxAccOutReadParam] = "128"
// MAX_ACC_OUT_READ parameter is hidden if ADVANCED_PCI_SETTINGS is disabled
if v.AdvancedPCISettingsEnabled(query) {
values, found := query.DefaultConfig[consts.MaxAccOutReadParam]
if !found {
err := types.IncorrectSpecError(
"Device does not support pci performance nv config parameters")
log.Log.Error(err, "incorrect spec", "device", device.Name, "parameter", consts.MaxAccOutReadParam)
return desiredParameters, err
}

maxAccOutReadParamDefaultValue := values[len(values)-1]

// According to the PRM, setting MAX_ACC_OUT_READ to zero enables the auto mode,
// which applies the best suitable optimizations.
// However, there is a bug in certain FW versions, where the zero value is not available.
// In this case, until the fix is available, skipping this parameter and emitting a warning
if maxAccOutReadParamDefaultValue == consts.NvParamZero {
applyDefaultNvConfigValueIfExists(consts.MaxAccOutReadParam, desiredParameters, query)
} else {
warning := fmt.Sprintf("%s nv config parameter does not work properly on this version of FW, skipping it", consts.MaxAccOutReadParam)
if v.eventRecorder != nil {
v.eventRecorder.Event(device, v1.EventTypeWarning, "FirmwareError", warning)
}
log.Log.Error(errors.New(warning), "device", device.Name, "fw version", device.Status.FirmwareVersion)
}
}
}

// maxReadRequest is applied as runtime configuration
} else {
applyDefaultNvConfigValueIfExists(consts.MaxAccOutReadParam, desiredParameters, query)
}

if template.RoceOptimized != nil && template.RoceOptimized.Enabled {
Expand All @@ -158,15 +171,6 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate(
}

// qos settings are applied as runtime configuration
} else {
applyDefaultNvConfigValueIfExists(consts.RoceCcPrioMaskP1Param, desiredParameters, query)
applyDefaultNvConfigValueIfExists(consts.CnpDscpP1Param, desiredParameters, query)
applyDefaultNvConfigValueIfExists(consts.Cnp802pPrioP1Param, desiredParameters, query)
if secondPortPresent {
applyDefaultNvConfigValueIfExists(consts.RoceCcPrioMaskP2Param, desiredParameters, query)
applyDefaultNvConfigValueIfExists(consts.CnpDscpP2Param, desiredParameters, query)
applyDefaultNvConfigValueIfExists(consts.Cnp802pPrioP2Param, desiredParameters, query)
}
}

if template.GpuDirectOptimized != nil && template.GpuDirectOptimized.Enabled {
Expand All @@ -183,8 +187,6 @@ func (v *configValidationImpl) ConstructNvParamMapFromTemplate(
log.Log.Error(err, "incorrect spec", "device", device.Name)
return desiredParameters, err
}
} else {
applyDefaultNvConfigValueIfExists(consts.AtsEnabledParam, desiredParameters, query)
}

for _, rawParam := range template.RawNvConfig {
Expand Down Expand Up @@ -333,6 +335,6 @@ func (v *configValidationImpl) CalculateDesiredRuntimeConfig(device *v1alpha1.Ni
return maxReadRequestSize, trust, pfc
}

func newConfigValidation(utils HostUtils) configValidation {
return &configValidationImpl{utils: utils}
func newConfigValidation(utils HostUtils, eventRecorder record.EventRecorder) configValidation {
return &configValidationImpl{utils: utils, eventRecorder: eventRecorder}
}
Loading

0 comments on commit f60c301

Please sign in to comment.