Skip to content

Commit

Permalink
kill highest memory egress when OOM (#826)
Browse files Browse the repository at this point in the history
  • Loading branch information
frostbyte73 authored Dec 12, 2024
1 parent b966630 commit 5e2223b
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 27 deletions.
3 changes: 2 additions & 1 deletion pkg/config/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ type ServiceConfig struct {
}

type CPUCostConfig struct {
MaxCpuUtilization float64 `yaml:"max_cpu_utilization"` // maximum allowed CPU utilization when deciding to accept a request. Default to 80%.
MaxCpuUtilization float64 `yaml:"max_cpu_utilization"` // maximum allowed CPU utilization when deciding to accept a request. Default to 80%
MaxMemory int `yaml:"max_memory"` // maximum allowed memory usage in GB. 0 to disable
MaxConcurrentWeb int32 `yaml:"max_concurrent_web"` // maximum allowed chrome/x/pulse instances
RoomCompositeCpuCost float64 `yaml:"room_composite_cpu_cost"`
AudioRoomCompositeCpuCost float64 `yaml:"audio_room_composite_cpu_cost"`
Expand Down
4 changes: 4 additions & 0 deletions pkg/errors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,7 @@ func ErrTrackNotFound(trackID string) error {
// ErrCPUExhausted builds a psrpc error with the PermissionDenied code whose
// message reports usage, formatted to two decimals, as the number of cores used.
func ErrCPUExhausted(usage float64) error {
	const msgFormat = "CPU exhausted: %.2f cores used"

	cpuErr := psrpc.NewErrorf(psrpc.PermissionDenied, msgFormat, usage)
	return cpuErr
}

// ErrOOM builds a psrpc error with the PermissionDenied code whose message
// reports usage, formatted to two decimals, as the amount of memory used in GB.
func ErrOOM(usage float64) error {
	const msgFormat = "OOM: %.2f GB used"

	oomErr := psrpc.NewErrorf(psrpc.PermissionDenied, msgFormat, usage)
	return oomErr
}
4 changes: 2 additions & 2 deletions pkg/service/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,12 +193,12 @@ func (pm *ProcessManager) AbortProcess(egressID string, err error) {
}
}

func (pm *ProcessManager) KillProcess(egressID string, maxUsage float64) {
func (pm *ProcessManager) KillProcess(egressID string, err error) {
pm.mu.RLock()
defer pm.mu.RUnlock()

if h, ok := pm.activeHandlers[egressID]; ok {
err := errors.ErrCPUExhausted(maxUsage)

logger.Errorw("killing egress", err, "egressID", egressID)

now := time.Now().UnixNano()
Expand Down
64 changes: 40 additions & 24 deletions pkg/stats/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,14 @@ const (
cpuHoldDuration = time.Second * 30
defaultKillThreshold = 0.95
minKillDuration = 10
gb = 1024.0 * 1024.0 * 1024.0
)

// Service is the interface the stats package declares for the owning service:
// three boolean state queries plus a way to kill a single process.
type Service interface {
IsIdle() bool
IsDisabled() bool
IsTerminating() bool
// NOTE(review): this text is a rendered diff, so both the removed and the
// added KillProcess signature appear here. Presumably only the error-taking
// form below survives the commit — confirm against the real file.
KillProcess(string, float64)
// KillProcess is called by the monitor with an egress ID and the error
// describing why that egress is being killed.
KillProcess(string, error)
}

type Monitor struct {
Expand Down Expand Up @@ -384,8 +385,8 @@ func (m *Monitor) updateEgressStats(stats *hwstats.ProcStats) {
m.mu.Lock()
defer m.mu.Unlock()

maxUsage := 0.0
var maxEgress string
maxCPU := 0.0
var maxCPUEgress string
for pid, cpuUsage := range stats.Cpu {
procStats := m.procStats[pid]
if procStats == nil {
Expand All @@ -399,13 +400,38 @@ func (m *Monitor) updateEgressStats(stats *hwstats.ProcStats) {
procStats.maxCPU = cpuUsage
}

if cpuUsage > procStats.allowedCPU && cpuUsage > maxUsage {
maxUsage = cpuUsage
maxEgress = procStats.egressID
if cpuUsage > procStats.allowedCPU && cpuUsage > maxCPU {
maxCPU = cpuUsage
maxCPUEgress = procStats.egressID
}
}

cpuKillThreshold := defaultKillThreshold
if cpuKillThreshold <= m.cpuCostConfig.MaxCpuUtilization {
cpuKillThreshold = (1 + m.cpuCostConfig.MaxCpuUtilization) / 2
}

if load > cpuKillThreshold {
logger.Warnw("high cpu usage", nil,
"cpu", load,
"requests", m.requests.Load(),
)

if m.requests.Load() > 1 {
m.highCPUDuration++
if m.highCPUDuration >= minKillDuration {
m.svc.KillProcess(maxCPUEgress, errors.ErrCPUExhausted(maxCPU))
m.highCPUDuration = 0
}
}
}

totalMemory := 0
maxMemory := 0
var maxMemoryEgress string
for pid, memUsage := range stats.Memory {
totalMemory += memUsage

procStats := m.procStats[pid]
if procStats == nil {
continue
Expand All @@ -414,27 +440,17 @@ func (m *Monitor) updateEgressStats(stats *hwstats.ProcStats) {
if memUsage > procStats.maxMemory {
procStats.maxMemory = memUsage
}
if memUsage > maxMemory {
maxMemory = memUsage
maxMemoryEgress = procStats.egressID
}
}

killThreshold := defaultKillThreshold
if killThreshold <= m.cpuCostConfig.MaxCpuUtilization {
killThreshold = (1 + m.cpuCostConfig.MaxCpuUtilization) / 2
}

if load > killThreshold {
logger.Warnw("high cpu usage", nil,
"load", load,
if m.cpuCostConfig.MaxMemory > 0 && totalMemory > m.cpuCostConfig.MaxMemory*gb {
logger.Warnw("high memory usage", nil,
"memory", float64(totalMemory)/gb,
"requests", m.requests.Load(),
)

if m.requests.Load() > 1 {
m.highCPUDuration++
if m.highCPUDuration < minKillDuration {
return
}
m.svc.KillProcess(maxEgress, maxUsage)
}
m.svc.KillProcess(maxMemoryEgress, errors.ErrOOM(float64(maxMemory)/gb))
}

m.highCPUDuration = 0
}

0 comments on commit 5e2223b

Please sign in to comment.