Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/lag histogram #1203

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions api/v1/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,6 @@ func (summary ScrapeSummary) HasUpdates() bool {

}

func (summary *ScrapeSummary) UpdateLag(m map[string]float64) {
for k, v := range m {
summary.AddLag(k, v)
}
}

func (summary ConfigTypeScrapeSummary) String() string {
s := []string{}

Expand Down Expand Up @@ -255,12 +249,6 @@ func (t *ScrapeSummary) AddWarning(configType, warning string) {
(*t)[configType] = v
}

func (t *ScrapeSummary) AddLag(key string, lag float64) {
v := (*t)[key]
v.Lag = lag
(*t)[key] = v
}

type ChangeSummary struct {
Orphaned map[string]int `json:"orphaned,omitempty"`
Ignored map[string]int `json:"ignored,omitempty"`
Expand Down Expand Up @@ -332,7 +320,6 @@ type ConfigTypeScrapeSummary struct {
Unchanged int `json:"unchanged,omitempty"`
Change *ChangeSummary `json:"change,omitempty"`
Warnings []string `json:"warnings,omitempty"`
Lag float64 `json:"lag,omitempty"`
}

// +kubebuilder:object:generate=false
Expand Down
42 changes: 0 additions & 42 deletions db/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -719,51 +719,9 @@ func saveResults(ctx api.ScrapeContext, results []v1.ScrapeResult) (v1.ScrapeSum
ctx.Logger.V(4).Infof("No Update: %s", summary)
}

// Add lag time in summary
if lagMap, err := calculateScrapeLag(ctx); err != nil {
ctx.Errorf("error calculating lag: %v", err)
} else {
summary.UpdateLag(lagMap)
}
return summary, nil
}

func calculateScrapeLag(ctx api.ScrapeContext) (map[string]float64, error) {
if ctx.ScraperID() == "" {
return nil, nil
}

q := `
SELECT COALESCE(ROUND(EXTRACT(EPOCH FROM (MAX(inserted_at - created_at))), 2), 0) AS lag
FROM config_items
WHERE
scraper_id = ? AND
inserted_at >= last_scraped_time
`
var configItemLag float64
if err := ctx.DB().Raw(q, ctx.ScraperID()).Scan(&configItemLag).Error; err != nil {
return nil, fmt.Errorf("error querying config_item scrape lag: %w", dutydb.ErrorDetails(err))
}

q = `
SELECT COALESCE(ROUND(EXTRACT(EPOCH FROM (MAX(cc.inserted_at - cc.created_at))), 2), 0) AS lag
FROM config_changes cc
INNER JOIN config_items ci ON cc.config_id = ci.id
WHERE
ci.scraper_id = ? AND
cc.inserted_at >= ci.last_scraped_time
`
var configChangeLag float64
if err := ctx.DB().Raw(q, ctx.ScraperID()).Scan(&configChangeLag).Error; err != nil {
return nil, fmt.Errorf("error querying config_change scrape lag: %w", dutydb.ErrorDetails(err))
}

return map[string]float64{
"config_item_lag_seconds": configItemLag,
"config_change_lag_seconds": configChangeLag,
}, nil
}

var lastScrapedTimeMutex sync.Map

func updateLastScrapedTime(ctx api.ScrapeContext, ids []string) error {
Expand Down
7 changes: 0 additions & 7 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,8 @@ require (
github.com/asecurityteam/rolling v2.0.4+incompatible // indirect
github.com/aws/aws-sdk-go-v2/config v1.27.29 // indirect
github.com/aws/aws-sdk-go-v2/credentials v1.17.29 // indirect
github.com/bahlo/generic-list-go v0.2.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bmatcuk/doublestar/v4 v4.7.1 // indirect
github.com/buger/jsonparser v1.1.1 // indirect
github.com/cenkalti/backoff/v4 v4.2.1 // indirect
github.com/cert-manager/cert-manager v1.9.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
Expand Down Expand Up @@ -159,7 +157,6 @@ require (
github.com/hashicorp/hcl/v2 v2.21.0 // indirect
github.com/henvic/httpretty v0.1.3 // indirect
github.com/hirochachacha/go-smb2 v1.1.0 // indirect
github.com/invopop/jsonschema v0.12.0 // indirect
github.com/itchyny/gojq v0.12.16 // indirect
github.com/itchyny/timefmt-go v0.1.6 // indirect
github.com/jackc/pgerrcode v0.0.0-20240316143900-6e2875d9b438 // indirect
Expand Down Expand Up @@ -204,11 +201,7 @@ require (
github.com/vadimi/go-http-ntlm v1.0.3 // indirect
github.com/vadimi/go-http-ntlm/v2 v2.4.1 // indirect
github.com/vadimi/go-ntlm v1.2.1 // indirect
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
github.com/xanzy/ssh-agent v0.3.3 // indirect
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f // indirect
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect
github.com/xeipuuv/gojsonschema v1.2.0 // indirect
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
github.com/yuin/gopher-lua v1.1.1 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
Expand Down
14 changes: 0 additions & 14 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -833,8 +833,6 @@ github.com/aws/aws-sdk-go-v2/service/support v1.24.3 h1:Bbesu6YZvEYACyydELMwUTYY
github.com/aws/aws-sdk-go-v2/service/support v1.24.3/go.mod h1:NvXUhACskXZ2tiXzECpC/97xKzyY7/Wcc1ug5rla7kY=
github.com/aws/smithy-go v1.20.4 h1:2HK1zBdPgRbjFOHlfeQZfpC4r72MOb9bZkiFwggKO+4=
github.com/aws/smithy-go v1.20.4/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg=
github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk=
github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
Expand All @@ -851,8 +849,6 @@ github.com/bmatcuk/doublestar/v4 v4.7.1 h1:fdDeAqgT47acgwd9bd9HxJRDmc9UAmPpc+2m0
github.com/bmatcuk/doublestar/v4 v4.7.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc=
github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
github.com/boombuler/barcode v1.0.1/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
github.com/bwesterb/go-ristretto v1.2.3/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0=
github.com/cactus/go-statsd-client/statsd v0.0.0-20200423205355-cb0885a1018c/go.mod h1:l/bIBLeOl9eX+wxJAzxS4TveKRtAqlyDpHjhkfO0MEI=
github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM=
Expand Down Expand Up @@ -1309,8 +1305,6 @@ github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+h
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/invopop/jsonschema v0.12.0 h1:6ovsNSuvn9wEQVOyc72aycBMVQFKz7cPdMJn10CvzRI=
github.com/invopop/jsonschema v0.12.0/go.mod h1:ffZ5Km5SWWRAIN6wbDXItl95euhFz2uON45H2qjYt+0=
github.com/itchyny/gojq v0.12.13/go.mod h1:JzwzAqenfhrPUuwbmEz3nu3JQmFLlQTQMUcOdnu/Sf4=
github.com/itchyny/gojq v0.12.16 h1:yLfgLxhIr/6sJNVmYfQjTIv0jGctu6/DgDoivmxTr7g=
github.com/itchyny/gojq v0.12.16/go.mod h1:6abHbdC2uB9ogMS38XsErnfqJ94UlngIJGlRAIj4jTM=
Expand Down Expand Up @@ -1754,16 +1748,8 @@ github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6Kllzaw
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ=
github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/fJgbpc=
github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM=
github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw=
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f h1:J9EGpcZtP0E/raorCMxlFGSTBrsSlaDGf3jU/qvAE2c=
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU=
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0=
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ=
github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74=
github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y=
github.com/xhit/go-str2duration v1.2.0/go.mod h1:3cPSlfZlUHVlneIVfePFWcJZsuwf+P1v2SRTV4cUmp4=
github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU=
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo=
Expand Down
57 changes: 45 additions & 12 deletions scrapers/cron.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,9 @@ func ConsumeKubernetesWatchJobFunc(sc api.ScrapeContext, config v1.Kubernetes) *
var events []v1.KubernetesEvent
var objs []*unstructured.Unstructured
var count int

var firstObject, firstEvent *kubernetes.QueueItem

for {
val, more := queue.Dequeue()
if !more {
Expand All @@ -327,13 +330,21 @@ func ConsumeKubernetesWatchJobFunc(sc api.ScrapeContext, config v1.Kubernetes) *
break
}

switch v := val.(type) {
case v1.KubernetesEvent:
events = append(events, v)
case *unstructured.Unstructured:
objs = append(objs, v)
default:
return fmt.Errorf("unexpected data in the queue: %T", v)
queueItem, ok := val.(*kubernetes.QueueItem)
if !ok {
return fmt.Errorf("unexpected item in the priority queue: %T", val)
}

if queueItem.Event != nil {
events = append(events, *queueItem.Event)
if firstEvent == nil {
firstEvent = queueItem
}
} else if queueItem.Obj != nil {
objs = append(objs, queueItem.Obj)
if firstObject == nil {
firstObject = queueItem
}
}
}

Expand All @@ -346,13 +357,35 @@ func ConsumeKubernetesWatchJobFunc(sc api.ScrapeContext, config v1.Kubernetes) *
// In the process, we will lose diff changes though.
// If diff changes are necessary, then we can split up the results in such
// a way that no two objects in a batch have the same id.
objs = dedup(objs)
if err := consumeResources(ctx, scrapeConfig, config, objs); err != nil {
ctx.History.AddErrorf("failed to consume resources: %v", err)
return err

scrapeStart := time.Now()

if len(objs) > 0 {
objs = dedup(objs)
if err := consumeResources(ctx, scrapeConfig, config, objs); err != nil {
ctx.History.AddErrorf("failed to consume resources: %v", err)
return err
}

// For now, measure a single lag for the entire batch instead of per config item
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It needs to be per per config item, at the tail end is what is going to hurt us -

lag := time.Since(firstObject.Timestamp)
ctx.Histogram("informer_consume_lag", []float64{1000, 10_000, 20_000, 50_000, 100_000, 200_000, 500_000}, "scraper", sc.ScraperID()).
Record(time.Duration(lag.Milliseconds()))
}

return consumeWatchEvents(ctx, scrapeConfig, config, events)
if len(events) > 0 {
scrapeDuration := time.Since(scrapeStart)
if err := consumeWatchEvents(ctx, scrapeConfig, config, events); err != nil {
ctx.History.AddErrorf("failed to consume events: %v", err)
return err
}

eventConsumeLag := time.Since(firstEvent.Timestamp) - scrapeDuration
ctx.Histogram("events_consume_lag", []float64{1000, 10_000, 20_000, 50_000, 100_000, 200_000, 500_000}, "scraper", sc.ScraperID()).
Record(time.Duration(eventConsumeLag.Milliseconds()))
}

return nil
},
}
}
Expand Down
53 changes: 39 additions & 14 deletions scrapers/kubernetes/events_watch.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,45 @@ func getUnstructuredFromInformedObj(resource v1.KubernetesResourceToWatch, obj a
return &unstructured.Unstructured{Object: m}, nil
}

func pqComparator(a, b any) int {
var aTimestamp, bTimestamp time.Time
type QueueItem struct {
Timestamp time.Time
Event *v1.KubernetesEvent
Obj *unstructured.Unstructured
}

func NewEventQueueItem(e *v1.KubernetesEvent) *QueueItem {
return &QueueItem{
Timestamp: time.Now(),
Event: e,
}
}

switch v := a.(type) {
case v1.KubernetesEvent:
aTimestamp = v.Metadata.GetCreationTimestamp().Time
case *unstructured.Unstructured:
aTimestamp = v.GetCreationTimestamp().Time
func NewObjectQueueItem(e *unstructured.Unstructured) *QueueItem {
return &QueueItem{
Timestamp: time.Now(),
Obj: e,
}
}

switch v := b.(type) {
case v1.KubernetesEvent:
bTimestamp = v.Metadata.GetCreationTimestamp().Time
case *unstructured.Unstructured:
bTimestamp = v.GetCreationTimestamp().Time
// PayloadTimestamp returns the timestamp of the event or the object it contains.
func (t *QueueItem) PayloadTimestamp() time.Time {
if t.Event != nil {
return t.Event.Metadata.CreationTimestamp.Time
}

if aTimestamp.Before(bTimestamp) {
if t.Obj != nil {
return t.Obj.GetCreationTimestamp().Time
}

panic("queue item is empty")
}

func pqComparator(a, b any) int {
var aTimestamp, bTimestamp time.Time
qa := a.(*QueueItem)
qb := b.(*QueueItem)

if qa.PayloadTimestamp().Before(qb.PayloadTimestamp()) {
return -1
} else if aTimestamp.Equal(bTimestamp) {
return 0
Expand Down Expand Up @@ -153,6 +174,10 @@ func WatchEvents(ctx api.ScrapeContext, config v1.Kubernetes) error {
continue
}

// TODO: We receive old events (hours old) and that screws up the histogram
ctx.Histogram("event_receive_lag", []float64{1, 100, 1000, 10_000, 100_000}, "scraper", ctx.ScraperID()).
Record(time.Duration(time.Since(event.Metadata.CreationTimestamp.Time).Milliseconds()))

if event.InvolvedObject == nil {
ctx.Counter("kubernetes_scraper_unmatched", "source", "watch", "reason", "involved_object_nil", "scraper_id", ctx.ScraperID()).Add(1)
continue
Expand All @@ -168,7 +193,7 @@ func WatchEvents(ctx api.ScrapeContext, config v1.Kubernetes) error {
}
ctx.Counter("kubernetes_scraper_events", "source", "watch", "kind", event.InvolvedObject.Kind, "scraper_id", ctx.ScraperID()).Add(1)

priorityQueue.Enqueue(event)
priorityQueue.Enqueue(NewEventQueueItem(&event))
}

return nil
Expand Down
21 changes: 19 additions & 2 deletions scrapers/kubernetes/informers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"strings"
"sync"
"time"

pq "github.com/emirpasic/gods/queues/priorityqueue"
"github.com/flanksource/commons/logger"
Expand Down Expand Up @@ -70,7 +71,15 @@ func (t *SharedInformerManager) Register(ctx api.ScrapeContext, watchResource v1
if ctx.Properties().On(false, "scraper.log.items") {
ctx.Logger.V(4).Infof("added: %s %s %s", u.GetUID(), u.GetKind(), u.GetName())
}
queue.Enqueue(u)

// TODO: We receive very old objects (months old) and that screws up the histogram
moshloop marked this conversation as resolved.
Show resolved Hide resolved
ctx.Histogram("informer_receive_lag", []float64{1, 100, 1000, 10_000, 100_000},
"scraper", ctx.ScraperID(),
"kind", watchResource.Kind,
"operation", "add",
).Record(time.Duration(time.Since(u.GetCreationTimestamp().Time).Milliseconds()))

queue.Enqueue(NewObjectQueueItem(u))
},
UpdateFunc: func(oldObj any, newObj any) {
u, err := getUnstructuredFromInformedObj(watchResource, newObj)
Expand All @@ -89,7 +98,8 @@ func (t *SharedInformerManager) Register(ctx api.ScrapeContext, watchResource v1
if ctx.Properties().On(false, "scraper.log.items") {
ctx.Logger.V(3).Infof("updated: %s %s %s", u.GetUID(), u.GetKind(), u.GetName())
}
queue.Enqueue(u)

queue.Enqueue(NewObjectQueueItem(u))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need update lag - this is actually the most critical one - I have added an is-healthy LastUpdated func https://github.com/flanksource/is-healthy/pull/144/files#diff-b7f1feaa038882966a9d30e0b96e476dccccc2608dd8a660f80767dbbc635b6fR183

},
DeleteFunc: func(obj any) {
u, err := getUnstructuredFromInformedObj(watchResource, obj)
Expand All @@ -107,6 +117,13 @@ func (t *SharedInformerManager) Register(ctx api.ScrapeContext, watchResource v1
if ctx.Properties().On(false, "scraper.log.items") {
ctx.Logger.V(3).Infof("deleted: %s %s %s", u.GetUID(), u.GetKind(), u.GetName())
}

ctx.Histogram("informer_receive_lag", []float64{1, 100, 1000, 10_000, 100_000},
"scraper", ctx.ScraperID(),
"kind", watchResource.Kind,
"operation", "delete",
).Record(time.Duration(time.Since(u.GetDeletionTimestamp().Time).Milliseconds()))

deleteBuffer <- string(u.GetUID())
},
})
Expand Down
Loading