From 88ef9710673e8a6d49de8d643caf37e4812a7eca Mon Sep 17 00:00:00 2001 From: David Colburn Date: Thu, 12 Dec 2024 11:53:29 -0800 Subject: [PATCH] kill highest memory egress when OOM --- pkg/config/service.go | 3 +- pkg/errors/errors.go | 4 +++ pkg/service/process.go | 4 +-- pkg/stats/monitor.go | 64 ++++++++++++++++++++++++++---------------- 4 files changed, 48 insertions(+), 27 deletions(-) diff --git a/pkg/config/service.go b/pkg/config/service.go index 4e7c21de..0982202d 100644 --- a/pkg/config/service.go +++ b/pkg/config/service.go @@ -57,7 +57,8 @@ type ServiceConfig struct { } type CPUCostConfig struct { - MaxCpuUtilization float64 `yaml:"max_cpu_utilization"` // maximum allowed CPU utilization when deciding to accept a request. Default to 80%. + MaxCpuUtilization float64 `yaml:"max_cpu_utilization"` // maximum allowed CPU utilization when deciding to accept a request. Default to 80% + MaxMemory int `yaml:"max_memory"` // maximum allowed memory usage in GB. 0 to disable MaxConcurrentWeb int32 `yaml:"max_concurrent_web"` // maximum allowed chrome/x/pulse instances RoomCompositeCpuCost float64 `yaml:"room_composite_cpu_cost"` AudioRoomCompositeCpuCost float64 `yaml:"audio_room_composite_cpu_cost"` diff --git a/pkg/errors/errors.go b/pkg/errors/errors.go index a1158b0e..324727e4 100644 --- a/pkg/errors/errors.go +++ b/pkg/errors/errors.go @@ -152,3 +152,7 @@ func ErrTrackNotFound(trackID string) error { func ErrCPUExhausted(usage float64) error { return psrpc.NewErrorf(psrpc.PermissionDenied, "CPU exhausted: %.2f cores used", usage) } + +func ErrOOM(usage float64) error { + return psrpc.NewErrorf(psrpc.PermissionDenied, "OOM: %.2f GB used", usage) +} diff --git a/pkg/service/process.go b/pkg/service/process.go index 9563f99f..023b7599 100644 --- a/pkg/service/process.go +++ b/pkg/service/process.go @@ -193,12 +193,12 @@ func (pm *ProcessManager) AbortProcess(egressID string, err error) { } } -func (pm *ProcessManager) KillProcess(egressID string, maxUsage float64) { +func (pm *ProcessManager) KillProcess(egressID string, err error) { pm.mu.RLock() defer pm.mu.RUnlock() if h, ok := pm.activeHandlers[egressID]; ok { - err := errors.ErrCPUExhausted(maxUsage) + logger.Errorw("killing egress", err, "egressID", egressID) now := time.Now().UnixNano() diff --git a/pkg/stats/monitor.go b/pkg/stats/monitor.go index 5ad81ac3..41fc1ef9 100644 --- a/pkg/stats/monitor.go +++ b/pkg/stats/monitor.go @@ -35,13 +35,14 @@ const ( cpuHoldDuration = time.Second * 30 defaultKillThreshold = 0.95 minKillDuration = 10 + gb = 1024.0 * 1024.0 * 1024.0 ) type Service interface { IsIdle() bool IsDisabled() bool IsTerminating() bool - KillProcess(string, float64) + KillProcess(string, error) } type Monitor struct { @@ -384,8 +385,8 @@ func (m *Monitor) updateEgressStats(stats *hwstats.ProcStats) { m.mu.Lock() defer m.mu.Unlock() - maxUsage := 0.0 - var maxEgress string + maxCPU := 0.0 + var maxCPUEgress string for pid, cpuUsage := range stats.Cpu { procStats := m.procStats[pid] if procStats == nil { @@ -399,13 +400,38 @@ func (m *Monitor) updateEgressStats(stats *hwstats.ProcStats) { procStats.maxCPU = cpuUsage } - if cpuUsage > procStats.allowedCPU && cpuUsage > maxUsage { - maxUsage = cpuUsage - maxEgress = procStats.egressID + if cpuUsage > procStats.allowedCPU && cpuUsage > maxCPU { + maxCPU = cpuUsage + maxCPUEgress = procStats.egressID } } + cpuKillThreshold := defaultKillThreshold + if cpuKillThreshold <= m.cpuCostConfig.MaxCpuUtilization { + cpuKillThreshold = (1 + m.cpuCostConfig.MaxCpuUtilization) / 2 + } + + if load > cpuKillThreshold { + logger.Warnw("high cpu usage", nil, + "cpu", load, + "requests", m.requests.Load(), + ) + + if m.requests.Load() > 1 { + m.highCPUDuration++ + if m.highCPUDuration >= minKillDuration { + m.svc.KillProcess(maxCPUEgress, errors.ErrCPUExhausted(maxCPU)) + m.highCPUDuration = 0 + } + } + } + + totalMemory := 0 + maxMemory := 0 + var maxMemoryEgress string for pid, memUsage := range stats.Memory { + totalMemory += memUsage + procStats := m.procStats[pid] if procStats == nil { continue @@ -414,27 +440,17 @@ func (m *Monitor) updateEgressStats(stats *hwstats.ProcStats) { if memUsage > procStats.maxMemory { procStats.maxMemory = memUsage } + if memUsage > maxMemory { + maxMemory = memUsage + maxMemoryEgress = procStats.egressID + } } - killThreshold := defaultKillThreshold - if killThreshold <= m.cpuCostConfig.MaxCpuUtilization { - killThreshold = (1 + m.cpuCostConfig.MaxCpuUtilization) / 2 - } - - if load > killThreshold { - logger.Warnw("high cpu usage", nil, - "load", load, + if m.cpuCostConfig.MaxMemory > 0 && totalMemory > m.cpuCostConfig.MaxMemory*gb { + logger.Warnw("high memory usage", nil, + "memory", float64(totalMemory)/gb, "requests", m.requests.Load(), ) - - if m.requests.Load() > 1 { - m.highCPUDuration++ - if m.highCPUDuration < minKillDuration { - return - } - m.svc.KillProcess(maxEgress, maxUsage) - } + m.svc.KillProcess(maxMemoryEgress, errors.ErrOOM(float64(maxMemory)/gb)) } - - m.highCPUDuration = 0 }