Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

process monitoring #529

Merged
merged 8 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@ require (
github.com/livekit/psrpc v0.5.2
github.com/mackerelio/go-osstat v0.2.4
github.com/maxbrunsfeld/counterfeiter/v6 v6.7.0
github.com/mitchellh/go-ps v1.0.0
github.com/pion/logging v0.2.2
github.com/pion/sdp/v3 v3.0.6
github.com/pion/webrtc/v3 v3.2.23
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.17.0
github.com/prometheus/procfs v0.11.1
github.com/redis/go-redis/v9 v9.3.0
github.com/stretchr/testify v1.8.4
github.com/twitchtv/twirp v8.1.3+incompatible
Expand Down Expand Up @@ -64,7 +66,6 @@ require (
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect
github.com/prometheus/common v0.44.0 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/rogpeppe/go-internal v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.15.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zk
github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
github.com/maxbrunsfeld/counterfeiter/v6 v6.7.0 h1:z0CfPybq3CxaJvrrpf7Gme1psZTqHhJxf83q6apkSpI=
github.com/maxbrunsfeld/counterfeiter/v6 v6.7.0/go.mod h1:RVP6/F85JyxTrbJxWIdKU2vlSvK48iCMnMXRkSz7xtg=
github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc=
github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg=
github.com/nats-io/nats.go v1.31.0 h1:/WFBHEc/dOKBF6qf1TZhrdEfTmOZ5JzdJ+Y3m6Y/p7E=
github.com/nats-io/nats.go v1.31.0/go.mod h1:di3Bm5MLsoB4Bx61CBTsxuarI36WbhAwOm8QrW39+i8=
github.com/nats-io/nkeys v0.4.6 h1:IzVe95ru2CT6ta874rt9saQRkWfe2nFj1NtvYSLqMzY=
Expand Down
432 changes: 262 additions & 170 deletions rpc/io.pb.go

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions rpc/io.proto
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ service IOInfo {
rpc UpdateEgress(livekit.EgressInfo) returns (google.protobuf.Empty);
rpc GetEgress(GetEgressRequest) returns (livekit.EgressInfo);
rpc ListEgress(livekit.ListEgressRequest) returns (livekit.ListEgressResponse);
rpc UpdateMetrics(UpdateMetricsRequest) returns (google.protobuf.Empty);

// ingress
rpc GetIngressInfo(GetIngressInfoRequest) returns (GetIngressInfoResponse);
Expand All @@ -42,6 +43,12 @@ message GetEgressRequest {
string egress_id = 1;
}

// UpdateMetricsRequest is the request message of the IOInfo.UpdateMetrics rpc.
message UpdateMetricsRequest {
// presumably the egress the reported metrics belong to; confirm against the sender
livekit.EgressInfo info = 1;
// NOTE(review): field number 2 is not used by this message.
// average cpu usage; units (cores vs. ratio) are not visible here, confirm against the sender
float avg_cpu_usage = 3;
// maximum cpu usage; same units as avg_cpu_usage presumably, confirm against the sender
float max_cpu_usage = 4;
}

// Query an ingress info from an ingress ID or stream key
message GetIngressInfoRequest {
string ingress_id = 1;
Expand Down
117 changes: 69 additions & 48 deletions rpc/io.psrpc.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

134 changes: 129 additions & 5 deletions utils/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
"time"

"github.com/frostbyte73/core"
"github.com/mitchellh/go-ps"
"github.com/prometheus/procfs"
"go.uber.org/atomic"

"github.com/livekit/protocol/logger"
Expand All @@ -35,12 +37,13 @@ type CPUStats struct {
idleCPUs atomic.Float64
platform platformCPUMonitor

updateCallback func(idle float64)
idleCallback func(idle float64)
procCallback func(idle float64, usage map[int]float64)
warningThrottle core.Throttle
closeChan chan struct{}
}

func NewCPUStats(updateCallback func(idle float64)) (*CPUStats, error) {
func NewCPUStats(idleUpdateCallback func(idle float64)) (*CPUStats, error) {
p, err := newPlatformCPUMonitor()
if err != nil {
return nil, err
Expand All @@ -49,7 +52,7 @@ func NewCPUStats(updateCallback func(idle float64)) (*CPUStats, error) {
c := &CPUStats{
platform: p,
warningThrottle: core.NewThrottle(time.Minute),
updateCallback: updateCallback,
idleCallback: idleUpdateCallback,
closeChan: make(chan struct{}),
}

Expand All @@ -58,6 +61,24 @@ func NewCPUStats(updateCallback func(idle float64)) (*CPUStats, error) {
return c, nil
}

// NewProcCPUStats creates a CPUStats that reports CPU usage per process.
//
// It builds the platform CPU monitor, then starts a goroutine running
// monitorProcCPULoad, which passes the idle value and a usage map keyed by PID
// to procUpdateCallback. An error is returned only when the platform monitor
// cannot be created; in that case no goroutine is started.
func NewProcCPUStats(procUpdateCallback func(idle float64, usage map[int]float64)) (*CPUStats, error) {
	monitor, err := newPlatformCPUMonitor()
	if err != nil {
		return nil, err
	}

	stats := &CPUStats{
		closeChan:       make(chan struct{}),
		procCallback:    procUpdateCallback,
		warningThrottle: core.NewThrottle(time.Minute),
		platform:        monitor,
	}
	go stats.monitorProcCPULoad()

	return stats, nil
}

// GetCPUIdle returns the idle CPU value most recently stored in idleCPUs by
// the monitoring goroutine.
func (c *CPUStats) GetCPUIdle() float64 {
	idle := c.idleCPUs.Load()
	return idle
}
Expand Down Expand Up @@ -92,9 +113,112 @@ func (c *CPUStats) monitorCPULoad() {
c.warningThrottle(func() { logger.Infow("high cpu load", "load", 1-idleRatio) })
}

if c.updateCallback != nil {
c.updateCallback(idle)
if c.idleCallback != nil {
c.idleCallback(idle)
}
}
}
}

func (c *CPUStats) monitorProcCPULoad() {
numCPU := c.platform.numCPU()

fs, err := procfs.NewFS(procfs.DefaultMountPoint)
if err != nil {
logger.Errorw("failed read proc fs", err)
return
}
hostCPU, err := getHostCPUCount(fs)
if err != nil {
logger.Errorw("failed to read pod cpu count", err)
return
}

self, err := fs.Self()
if err != nil {
logger.Errorw("failed to read self", err)
return
}

ticker := time.NewTicker(time.Second)
defer ticker.Stop()

var prevTotalTime float64
var prevStats map[int]procfs.ProcStat
for {
select {
case <-c.closeChan:
return
case <-ticker.C:
procStats := make(map[int]procfs.ProcStat)
procs, err := procfs.AllProcs()
if err != nil {
logger.Errorw("failed to read processes", err)
continue
}

total, err := fs.Stat()
frostbyte73 marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
logger.Errorw("failed to read stats", err)
continue
}

ppids := make(map[int]int)
for _, proc := range procs {
procStats[proc.PID], err = proc.Stat()
if err != nil {
logger.Errorw("failed to read proc stats", err)
continue
}
if proc.PID != self.PID {
ppids[proc.PID], err = getPPID(proc.PID)
if err != nil {
logger.Errorw("failed to get PPID", err)
continue
}
}
}

totalHostTime := total.CPUTotal.Idle + total.CPUTotal.Iowait +
total.CPUTotal.User + total.CPUTotal.Nice + total.CPUTotal.System +
total.CPUTotal.IRQ + total.CPUTotal.SoftIRQ + total.CPUTotal.Steal

usage := make(map[int]float64)
podUsage := 0.0
for pid, stat := range procStats {
// process usage as percent of total host cpu
procPercentUsage := float64(stat.UTime + stat.STime - prevStats[pid].UTime - prevStats[pid].STime)
if procPercentUsage == 0 {
continue
}

for ppids[pid] != self.PID && ppids[pid] != 0 {
// bundle usage up to first child of main go process
pid = ppids[pid]
}

procUsage := hostCPU * procPercentUsage / 100 / (totalHostTime - prevTotalTime)
usage[pid] += procUsage
podUsage += procUsage
}

idle := numCPU - podUsage
c.idleCPUs.Store(idle)

if c.procCallback != nil {
c.procCallback(idle, usage)
}

prevTotalTime = totalHostTime
prevStats = procStats
}
}
}

// getPPID returns the parent PID of the process identified by pid.
//
// ps.FindProcess reports a process that cannot be found as a nil Process
// together with a nil error. That case is handled explicitly, because calling
// PPid on the nil result would panic; it is reported as parent PID 0 with no
// error. Any lookup error is returned unchanged.
func getPPID(pid int) (int, error) {
	p, err := ps.FindProcess(pid)
	if err != nil {
		return 0, err
	}
	if p == nil {
		// No such process (e.g. it exited after being listed).
		return 0, nil
	}
	return p.PPid(), nil
}
Loading