Skip to content

Commit

Permalink
Merge pull request lightninglabs#110 from guggero/ignore-timeout
Browse files Browse the repository at this point in the history
collectors: don't shut down on timeout on `GetInfo` RPC call
  • Loading branch information
guggero authored Aug 7, 2024
2 parents 892dae7 + e1c2d96 commit 745d3d1
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 14 deletions.
16 changes: 14 additions & 2 deletions collectors/chain_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,20 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
resp, err := c.lnd.GetInfo(context.Background())
if err != nil {
c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
"%v", err)
errWithContext := fmt.Errorf("ChainCollector GetInfo "+
"failed with: %w", err)
Logger.Error(errWithContext)

// If this isn't just a timeout, we'll want to exit to give the
// runtime (Docker/k8s/systemd) a chance to restart us, in case
// something with the lnd connection and/or credentials has
// changed. We only make this exception for the GetInfo call, since
// it is known to occasionally take far longer than average
// (database interactions?).
if !IsDeadlineExceeded(err) {
c.errChan <- errWithContext
}

return
}

Expand Down
16 changes: 14 additions & 2 deletions collectors/channels_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,20 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
// have open.
getInfoResp, err := c.lnd.GetInfo(context.Background())
if err != nil {
c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
"with: %v", err)
errWithContext := fmt.Errorf("ChannelsCollector GetInfo "+
"failed with: %w", err)
Logger.Error(errWithContext)

// If this isn't just a timeout, we'll want to exit to give the
// runtime (Docker/k8s/systemd) a chance to restart us, in case
// something with the lnd connection and/or credentials has
// changed. We only make this exception for the GetInfo call, since
// it is known to occasionally take far longer than average
// (database interactions?).
if !IsDeadlineExceeded(err) {
c.errChan <- errWithContext
}

return
}

Expand Down
39 changes: 39 additions & 0 deletions collectors/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package collectors

import (
"context"
"strings"

"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

// errRPCDeadlineExceeded is a gRPC status error carrying the DeadlineExceeded
// code and the message of context.DeadlineExceeded. It represents the form a
// deadline error takes when it originates on the server side: by the time it
// has crossed the wire it is only a string, no longer a structured error, so
// status.FromContextError() cannot recognize it.
var errRPCDeadlineExceeded = status.Error(
	codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
)

// IsDeadlineExceeded reports whether the passed error represents an exceeded
// deadline. A nil error never matches. Otherwise the error matches if
// status.FromContextError maps it to the codes.DeadlineExceeded code, or if
// its message contains the string form of errRPCDeadlineExceeded.
func IsDeadlineExceeded(err error) bool {
	switch {
	// No error at all can't be a timeout.
	case err == nil:
		return false

	// The context error conversion yields the DeadlineExceeded code.
	case status.FromContextError(err).Code() == codes.DeadlineExceeded:
		return true

	// Fall back to comparing the message text, which is all that is left
	// of the error once it has been sent over the gRPC interface.
	default:
		rpcMsg := errRPCDeadlineExceeded.Error()

		return strings.Contains(err.Error(), rpcMsg)
	}
}
28 changes: 18 additions & 10 deletions collectors/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ type MonitoringConfig struct {
// DisableHtlc disables collection of HTLCs metrics
DisableHtlc bool

// ProgramStartTime stores a best-effort estimate of when lnd/lndmon was started.
// ProgramStartTime stores a best-effort estimate of when lnd/lndmon was
// started.
ProgramStartTime time.Time
}

Expand All @@ -88,13 +89,14 @@ func DefaultConfig() *PrometheusConfig {
// NewPrometheusExporter makes a new instance of the PrometheusExporter given
// the address to listen for Prometheus on and an lnd gRPC client.
func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
monitoringCfg *MonitoringConfig, quitChan chan struct{}) *PrometheusExporter {
monitoringCfg *MonitoringConfig,
quitChan chan struct{}) *PrometheusExporter {

// We have six collectors and a htlc monitor running, so we buffer our
// error channel by 7 so that we do not need to consume all errors from
// error channel by 8 so that we do not need to consume all errors from
// this channel (on the first one, we'll start shutting down, but a few
// could arrive quickly in the case where lnd is shutting down).
errChan := make(chan error, 7)
errChan := make(chan error, 8)

htlcMonitor := newHtlcMonitor(lnd.Router, errChan)

Expand All @@ -116,7 +118,9 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
}

if !monitoringCfg.DisableGraph {
collectors = append(collectors, NewGraphCollector(lnd.Client, errChan))
collectors = append(
collectors, NewGraphCollector(lnd.Client, errChan),
)
}

return &PrometheusExporter{
Expand Down Expand Up @@ -165,15 +169,19 @@ func (p *PrometheusExporter) Start() error {
// scrape our metrics.
go func() {
errorLogger := log.New(
os.Stdout, "promhttp", log.Ldate|log.Ltime|log.Lshortfile,
os.Stdout, "promhttp",
log.Ldate|log.Ltime|log.Lshortfile,
)

promHandler := promhttp.InstrumentMetricHandler(
prometheus.DefaultRegisterer,
promhttp.HandlerFor(prometheus.DefaultGatherer, promhttp.HandlerOpts{
ErrorLog: errorLogger,
ErrorHandling: promhttp.ContinueOnError,
}),
promhttp.HandlerFor(
prometheus.DefaultGatherer,
promhttp.HandlerOpts{
ErrorLog: errorLogger,
ErrorHandling: promhttp.ContinueOnError,
},
),
)
http.Handle("/metrics", promHandler)
Logger.Info(http.ListenAndServe(p.cfg.ListenAddr, nil))
Expand Down

0 comments on commit 745d3d1

Please sign in to comment.