Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PDR-16694][bug] kafka metric监控采集频率异常 #1211

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions mgr/metric_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ const (
)

const (
// defaultCollectInterval is presumably the collection interval applied when
// none is configured, expressed in seconds — TODO confirm; the place where it
// is consumed is not visible in this excerpt.
defaultCollectInterval = 30
// NOTE(review): the line below repeats the declaration above. It looks like
// the before/after pair of a diff hunk — confirm that only one copy exists in
// the actual file.
defaultCollectInterval = 30
// defaultTimeoutCountToReset is the number of consecutive timed-out
// collections after which a collector that supports resetting gets reset.
defaultTimeoutCountToReset = 2
)

type MetricConfig struct {
Expand All @@ -54,6 +55,7 @@ type MetricRunner struct {

startedWG *sync.WaitGroup
collectors []metric.Collector
timeoutCount map[string]int
senders []sender.Sender
transformers map[string][]transforms.Transformer
commonTrans []transforms.Transformer
Expand Down Expand Up @@ -104,6 +106,7 @@ func NewMetricRunner(rc RunnerConfig, wg *sync.WaitGroup, sr *sender.Registry) (
rc.SendersConfig[i][KeyRunnerName] = rc.RunnerName
}
collectors := make([]metric.Collector, 0)
timeoutCount := make(map[string]int)
transformers := make(map[string][]transforms.Transformer)

for _, m := range rc.MetricConfig {
Expand Down Expand Up @@ -132,6 +135,7 @@ func NewMetricRunner(rc RunnerConfig, wg *sync.WaitGroup, sr *sender.Registry) (
}

collectors = append(collectors, c)
timeoutCount[c.Name()] = 0

// 配置文件中明确标明 false 的 attr 加入 discard transformer
config := c.Config()
Expand Down Expand Up @@ -238,6 +242,7 @@ func NewMetricRunner(rc RunnerConfig, wg *sync.WaitGroup, sr *sender.Registry) (
rsMutex: new(sync.RWMutex),
collectInterval: interval,
collectors: collectors,
timeoutCount: timeoutCount,
transformers: transformers,
commonTrans: commonTransformers,
senders: senders,
Expand Down Expand Up @@ -301,8 +306,9 @@ func (r *MetricRunner) Run() {
dataCnt := 0
datas := make([]Data, 0)
metricTime := time.Now()
tags[metric.Timestamp] = metricTime.Format(time.RFC3339Nano)
tags[metric.Timestamp] = metricTime.UnixNano() / 1e6
for _, c := range r.collectors {
before := time.Now()
metricName := c.Name()
tmpdatas, err := c.Collect()
if err != nil {
Expand Down Expand Up @@ -352,6 +358,23 @@ func (r *MetricRunner) Run() {
datas = append(datas, data)
dataCnt++
}

// 处理读取超时
if time.Now().Sub(before) > r.collectInterval {
log.Warnf("collecter <%v> exec timeout %d seconds", time.Now().Sub(before).Seconds())
r.timeoutCount[metricName]++
if reset, ok := c.(Resetable); ok && r.timeoutCount[metricName] >= defaultTimeoutCountToReset {
if err = reset.Reset(); err != nil {
log.Errorf("collecter <%v> reset fail: %v", metricTime, err)
continue
} else {
log.Infof("collecter <%v> reset success", metricTime)
r.timeoutCount[metricName] = 0
}
}
} else {
r.timeoutCount[metricName] = 0
}
}

if r.isBlock {
Expand Down