Skip to content

Commit

Permalink
fix(*): empty cpuset.mems will lead to failure of tmo
Browse files Browse the repository at this point in the history
Signed-off-by: linzhecheng <[email protected]>
  • Loading branch information
cheney-lin committed Nov 27, 2024
1 parent 19aa317 commit c909ef8
Show file tree
Hide file tree
Showing 7 changed files with 326 additions and 89 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -736,13 +736,9 @@ func (p *DynamicPolicy) handleAdvisorMemoryOffloading(_ *config.Configuration,
}
}

cpuSetStats, err := cgroupmgr.GetCPUSetWithAbsolutePath(absCGPath)
_, mems, err := cgroupmgr.GetEffectiveCPUSetWithAbsolutePath(absCGPath)
if err != nil {
return fmt.Errorf("GetCPUSetWithAbsolutePath failed with error: %v", err)
}
mems, err := machine.Parse(cpuSetStats.Mems)
if err != nil {
return fmt.Errorf("parse cpuSetStats failed with error: %v", err)
return fmt.Errorf("GetEffectiveCPUSetWithAbsolutePath failed with error: %v", err)
}

// start an asynchronous work to execute memory offloading
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,11 @@ func (p *nodeMetricsReporterPlugin) getGroupUsage(pods []*v1.Pod, qosLevel strin
resourceMetric.CPU = aggCPU
}

for numaID, resourceUsages := range numaUsages {
for numaID := 0; numaID < p.metaServer.NumNUMANodes; numaID++ {
resourceUsages, ok := numaUsages[numaID]
if !ok {
continue
}
resourceNUMAMetric := nodeapis.ResourceMetric{}

cpuUsage := resourceUsages[v1.ResourceCPU]
Expand Down
6 changes: 4 additions & 2 deletions pkg/util/cgroup/common/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,10 @@ type CPUStats struct {

// CPUSetStats get cgroup cpuset data
type CPUSetStats struct {
CPUs string
Mems string
CPUs string
EffectiveCPUs string
Mems string
EffectiveMems string
}

// MemoryMetrics get memory cgroup metrics
Expand Down
51 changes: 51 additions & 0 deletions pkg/util/cgroup/manager/cgroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,15 @@ import (
"fmt"
"io/fs"
"math"
"os"
"os/exec"
"path/filepath"
"strconv"
"syscall"
"time"

"golang.org/x/sys/unix"

"github.com/kubewharf/katalyst-core/pkg/consts"
"github.com/kubewharf/katalyst-core/pkg/metrics"
"github.com/kubewharf/katalyst-core/pkg/util/asyncworker"
Expand Down Expand Up @@ -514,3 +518,50 @@ func MemoryOffloadingWithAbsolutePath(ctx context.Context, absCgroupPath string,

return err
}

// IsCgroupPath reports whether path resides on a cgroup v1 or cgroup v2
// filesystem, judged by the filesystem type that statfs returns for it.
// A statfs failure is logged and reported as false.
func IsCgroupPath(path string) bool {
	fsInfo := syscall.Statfs_t{}
	if statErr := syscall.Statfs(path, &fsInfo); statErr != nil {
		general.ErrorS(statErr, "failed to Statfs", "path", path)
		return false
	}

	// Either cgroup hierarchy version qualifies.
	switch fsInfo.Type {
	case unix.CGROUP2_SUPER_MAGIC, unix.CGROUP_SUPER_MAGIC:
		return true
	default:
		return false
	}
}

// GetEffectiveCPUSetWithAbsolutePath returns the effective CPU set and the
// effective memory-node set (in that order) for the cgroup at absCgroupPath.
//
// Resolution falls back to the parent directory in two situations:
//   - the cpuset stats cannot be read because the files do not exist (e.g. the
//     cpuset controller is not enabled for this cgroup), in which case both
//     sets are taken from the parent;
//   - an effective mask parses to an empty set, in which case only that mask
//     is taken from the parent.
//
// The walk stops with an error once a directory is reached that is not on a
// cgroup filesystem. On error both returned sets are empty values.
func GetEffectiveCPUSetWithAbsolutePath(absCgroupPath string) (machine.CPUSet, machine.CPUSet, error) {
	if !IsCgroupPath(absCgroupPath) {
		return machine.CPUSet{}, machine.CPUSet{}, fmt.Errorf("path %s is not a cgroup", absCgroupPath)
	}

	cpusetStat, err := GetCPUSetWithAbsolutePath(absCgroupPath)
	if err != nil {
		// if controller is disabled, we should walk the parent's dir.
		if os.IsNotExist(err) {
			return GetEffectiveCPUSetWithAbsolutePath(filepath.Dir(absCgroupPath))
		}
		return machine.CPUSet{}, machine.CPUSet{}, err
	}

	cpus, err := machine.Parse(cpusetStat.EffectiveCPUs)
	if err != nil {
		return machine.CPUSet{}, machine.CPUSet{}, err
	}
	mems, err := machine.Parse(cpusetStat.EffectiveMems)
	if err != nil {
		return machine.CPUSet{}, machine.CPUSet{}, err
	}

	needCPUs, needMems := cpus.IsEmpty(), mems.IsEmpty()
	if !needCPUs && !needMems {
		return cpus, mems, nil
	}

	// if the cpus or mems is empty, they will inherit the parent's mask.
	// Resolve the parent exactly once and reuse the result for both masks;
	// recursing separately per mask would double the work at every level.
	parentCPUs, parentMems, err := GetEffectiveCPUSetWithAbsolutePath(filepath.Dir(absCgroupPath))
	if err != nil {
		return machine.CPUSet{}, machine.CPUSet{}, err
	}
	if needCPUs {
		cpus = parentCPUs
	}
	if needMems {
		mems = parentMems
	}
	return cpus, mems, nil
}
Loading

0 comments on commit c909ef8

Please sign in to comment.