Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add stolon-proxy metrics #823

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions cmd/proxy/cmd/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright 2021 Sorint.lab
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"github.com/prometheus/client_golang/prometheus"
)

// Prometheus metrics exported by the stolon proxy. They are registered with
// the default registry in init.
var (
	// proxyHealthGauge is set to 1 once the pollon proxy has been started and
	// set back to 0 when the pollon proxy is stopped.
	proxyHealthGauge = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "stolon_proxy_health",
			Help: "Set to 1 if proxy healthy and accepting connections",
		},
	)

	// clusterdataLastValidUpdateSeconds records the time of the last check in
	// which the clusterdata read from the store passed format version and
	// spec validation.
	clusterdataLastValidUpdateSeconds = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "stolon_proxy_clusterdata_last_valid_update_seconds",
			Help: "Last time we received a valid clusterdata from our store as seconds since unix epoch",
		},
	)

	// proxyListenerStartedSeconds records the time at which the pollon proxy
	// was last started.
	proxyListenerStartedSeconds = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "stolon_proxy_listener_started_seconds",
			Help: "Last time we started the proxy listener as seconds since unix epoch",
		},
	)

	// getClusterInfoErrors counts checks in which the clusterdata could not be
	// fetched from the store, had an unsupported format version, or failed
	// spec validation.
	//
	// NOTE(review): Prometheus naming conventions suggest a "_total" suffix
	// for counters; the name is left unchanged here because renaming would
	// alter the exposed metric. Confirm before release.
	getClusterInfoErrors = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "stolon_proxy_get_cluster_info_errors",
			Help: "Count of failed getting and parsing cluster info operations",
		},
	)

	// updateProxyInfoErrors counts failed attempts to write this proxy's info
	// to the store.
	//
	// NOTE(review): same "_total" suffix consideration as getClusterInfoErrors.
	updateProxyInfoErrors = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "stolon_proxy_update_proxy_info_errors",
			Help: "Count of update proxyInfo failures",
		},
	)
)

// init registers every proxy metric with the default Prometheus registry.
// MustRegister panics if any collector cannot be registered.
func init() {
	prometheus.MustRegister(
		proxyHealthGauge,
		clusterdataLastValidUpdateSeconds,
		proxyListenerStartedSeconds,
		getClusterInfoErrors,
		updateProxyInfoErrors,
	)
}
14 changes: 14 additions & 0 deletions cmd/proxy/cmd/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ func (c *ClusterChecker) startPollonProxy() error {
c.endPollonProxyCh <- c.pp.Start()
}()

proxyHealthGauge.Set(1)
proxyListenerStartedSeconds.SetToCurrentTime()
return nil
}

Expand All @@ -160,6 +162,7 @@ func (c *ClusterChecker) stopPollonProxy() {
c.pp = nil
c.listener.Close()
c.listener = nil
proxyHealthGauge.Set(0)
}
}

Expand Down Expand Up @@ -190,6 +193,7 @@ func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, proxyTime
func (c *ClusterChecker) Check() error {
cd, _, err := c.e.GetClusterData(context.TODO())
if err != nil {
getClusterInfoErrors.Inc()
return fmt.Errorf("cannot get cluster data: %v", err)
}

Expand All @@ -206,13 +210,20 @@ func (c *ClusterChecker) Check() error {
}
if cd.FormatVersion != cluster.CurrentCDFormatVersion {
c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
getClusterInfoErrors.Inc()
return fmt.Errorf("unsupported clusterdata format version: %d", cd.FormatVersion)
}
if err = cd.Cluster.Spec.Validate(); err != nil {
c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
getClusterInfoErrors.Inc()
return fmt.Errorf("clusterdata validation failed: %v", err)
}

// Mark that the clusterdata we've received is valid. We'll use this metric to detect
// when our store is failing to serve a valid clusterdata, so it's important we only
// update the metric here.
clusterdataLastValidUpdateSeconds.SetToCurrentTime()

cdProxyCheckInterval := cd.Cluster.DefSpec().ProxyCheckInterval.Duration
cdProxyTimeout := cd.Cluster.DefSpec().ProxyTimeout.Duration

Expand All @@ -231,6 +242,7 @@ func (c *ClusterChecker) Check() error {
c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
// ignore errors on setting proxy info
if err = c.SetProxyInfo(c.e, cluster.NoGeneration, proxyTimeout); err != nil {
updateProxyInfoErrors.Inc()
log.Errorw("failed to update proxyInfo", zap.Error(err))
} else {
// update proxyCheckinterval and proxyTimeout only if we successfully updated our proxy info
Expand All @@ -248,6 +260,7 @@ func (c *ClusterChecker) Check() error {
c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
// ignore errors on setting proxy info
if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil {
updateProxyInfoErrors.Inc()
log.Errorw("failed to update proxyInfo", zap.Error(err))
} else {
// update proxyCheckinterval and proxyTimeout only if we successfully updated our proxy info
Expand All @@ -271,6 +284,7 @@ func (c *ClusterChecker) Check() error {
// We cannot ignore this error: the sentinel wouldn't know that we exist
// and are sending connections to a master, so when electing a new
// master it would not wait for us to close connections to the old one.
updateProxyInfoErrors.Inc()
return fmt.Errorf("failed to update proxyInfo: %v", err)
} else {
// update proxyCheckinterval and proxyTimeout only if we successfully updated our proxy info
Expand Down