-
Notifications
You must be signed in to change notification settings - Fork 157
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Export ActionSet Metrics for Kanister #1603
Changes from all commits
f25e3e7
7ecc7b2
0f0a899
25c4f23
4fcb1f4
69b2dea
796c462
424ff9d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
package metrics | ||
|
||
// This file contains wrapper functions that will map a Prometheus metric names to its | ||
// label field, help field and an associated Event. | ||
|
||
// MetricType will represent a Prometheus metric. | ||
// A variable of type MetricType will hold the name of the Prometheus metric as reported. | ||
type MetricType string | ||
|
||
const ( | ||
ActionSetCreatedTotalType MetricType = "kanister_actionset_created_total" | ||
ActionSetCompletedTotalType MetricType = "kanister_actionset_completed_total" | ||
ActionSetFailedTotalType MetricType = "kanister_actionset_failed_total" | ||
) | ||
|
||
// MetricTypeOpt is a struct for a Prometheus metric. | ||
// Help and LabelNames are passed directly to the Prometheus predefined functions. | ||
// EventFunc holds the constructor of the linked Event of a given MetricType. | ||
type MetricTypeOpt struct { | ||
EventFunc interface{} | ||
Help string | ||
LabelNames []string | ||
} | ||
|
||
// Mapping a Prometheus MetricType to the metric MetricTypeOpt struct. | ||
// Basically, a metric name is mapped to its associated Help and LabelName fields. | ||
// The linked event function (EventFunc) is also mapped to this metric name as a part of MetricTypeOpt. | ||
var MetricCounterOpts = map[MetricType]MetricTypeOpt{ | ||
ActionSetCreatedTotalType: { | ||
EventFunc: NewActionSetCreatedTotal, | ||
Help: "The count of total ActionSets created", | ||
LabelNames: []string{"actionType", "namespace"}, | ||
}, | ||
ActionSetCompletedTotalType: { | ||
EventFunc: NewActionSetCompletedTotal, | ||
Help: "The count of total ActionSets completed", | ||
LabelNames: []string{"actionName", "actionType", "blueprint", "namespace", "state"}, | ||
}, | ||
ActionSetFailedTotalType: { | ||
EventFunc: NewActionSetFailedTotal, | ||
Help: "The count of total ActionSets failed", | ||
LabelNames: []string{"actionName", "actionType", "blueprint", "namespace", "state"}, | ||
}, | ||
} | ||
|
||
// Event describes an individual event. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure we are gaining much with this new What do you think of merging type Metric struct {
Name string
Help string
LabelNames []string
}
type Sample struct {
Name string
Labels prometheus.Labels
} Here is a diff with suggestions on what the code can look like with the two types:diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go
index ff5a0a10..94e65aad 100644
--- a/pkg/controller/controller.go
+++ b/pkg/controller/controller.go
@@ -62,7 +62,7 @@ type Controller struct {
config *rest.Config
crClient versioned.Interface
clientset kubernetes.Interface
- counterVecs map[metrics.MetricType]*prometheus.CounterVec
+ counterVecs map[string]*prometheus.CounterVec
dynClient dynamic.Interface
osClient osversioned.Interface
recorder record.EventRecorder
@@ -374,7 +374,8 @@ func (c *Controller) handleActionSet(as *crv1alpha1.ActionSet) (err error) {
// These labels will be passed to a function in the metrics package
// Also, a list of events to update metrics should be created and incremented.
for _, a := range as.Spec.Actions {
- if err := metrics.IncrementCounterVec(metrics.NewActionSetTotalCreated(a.Name, as.GetNamespace())); err != nil {
+ s := metrics.NewSampleActionSetTotal(a.Name, as.GetNamespace())
+ if err := metrics.IncrementCounterVec(s); err != nil {
log.Error().WithError(err).Print("Metrics Incrementation failed")
}
}
diff --git a/pkg/metrics/events.go b/pkg/metrics/events.go
index 81b341d1..65f026c6 100644
--- a/pkg/metrics/events.go
+++ b/pkg/metrics/events.go
@@ -1,5 +1,7 @@
package metrics
+import "github.com/prometheus/client_golang/prometheus"
+
// This file contains wrapper functions that will map a Prometheus metric names to its
// label field, help field and an associated Event.
@@ -8,18 +10,37 @@ package metrics
type MetricType string
const (
- SampleCountType MetricType = "sample_count"
-
- ActionSetBackupCreatedType MetricType = "actionset_backup_created_count"
- ActionSetBackupCompletedType MetricType = "actionset_backup_completed_count"
+ ActionSetBackupCreatedType = "actionset_backup_created_count"
+ ActionSetBackupCompletedType = "actionset_backup_completed_count"
- ActionSetRestoreCreatedType MetricType = "actionset_restore_created_count"
- ActionSetRestoreCompletedType MetricType = "actionset_restore_completed_count"
+ ActionSetRestoreCreatedType = "actionset_restore_created_count"
+ ActionSetRestoreCompletedType = "actionset_restore_completed_count"
- ActionSetTotalCreatedType MetricType = "actionset_total_created_count"
- ActionSetTotalCompletedType MetricType = "actionset_total_completed_count"
+ ActionSetTotalCreatedType = "actionset_total_created_count"
+ ActionSetTotalCompletedType = "actionset_total_completed_count"
)
+type Metric struct {
+ Name string
+ Help string
+ LabelNames []string
+}
+
+type Sample struct {
+ Name string
+ Labels prometheus.Labels
+}
+
+func NewSampleActionSetTotal(action string, namespace string) *Sample {
+ return &Sample{
+ Name: ActionSetTotalCompletedType,
+ Labels: prometheus.Labels{
+ "actionType": action,
+ "namespace": namespace,
+ },
+ }
+}
+
// MetricTypeOpt is a struct for a Prometheus metric.
// Help and LabelNames are passed directly to the Prometheus predefined functions.
// EventFunc holds the constructor of the linked Event of a given MetricType.
@@ -32,40 +53,32 @@ type MetricTypeOpt struct {
// Mapping a Prometheus MetricType to the metric MetricTypeOpt struct.
// Basically, a metric name is mapped to its associated Help and LabelName fields.
// The linked event function (EventFunc) is also mapped to this metric name as a part of MetricTypeOpt.
-var MetricTypeOpts = map[MetricType]MetricTypeOpt{
- SampleCountType: {
- EventFunc: NewSampleCount,
- Help: "Sample counter to remove later",
- LabelNames: []string{"sample"},
- },
-
+var Counters = map[string]*Metric{
ActionSetBackupCreatedType: {
- EventFunc: NewActionSetBackupCreated,
- Help: "The count of backup ActionSets created",
+ Name: ActionSetBackupCreatedType,
+ Help: "The count of backup ActionSets created",
},
ActionSetBackupCompletedType: {
- EventFunc: NewActionSetBackupCompleted,
+ Name: ActionSetBackupCompletedType,
Help: "The count of backup ActionSets completed",
LabelNames: []string{"state"},
},
-
ActionSetRestoreCreatedType: {
- EventFunc: NewActionSetRestoreCreated,
- Help: "The count of restore ActionSets created",
+ Name: ActionSetRestoreCreatedType,
+ Help: "The count of restore ActionSets created",
},
ActionSetRestoreCompletedType: {
- EventFunc: NewActionSetRestoreCompleted,
+ Name: ActionSetRestoreCompletedType,
Help: "The count of restore ActionSets completed",
LabelNames: []string{"state"},
},
-
ActionSetTotalCreatedType: {
- EventFunc: NewActionSetTotalCreated,
+ Name: ActionSetTotalCreatedType,
Help: "The count of total ActionSets created",
LabelNames: []string{"actionType", "namespace"},
},
ActionSetTotalCompletedType: {
- EventFunc: NewActionSetTotalCompleted,
+ Name: ActionSetTotalCompletedType,
Help: "The count of total ActionSets completed",
LabelNames: []string{"actionType", "namespace", "state"},
},
@@ -98,15 +111,6 @@ func (e *Event) Labels() map[string]string {
return labels
}
-func NewSampleCount(sample string) Event {
- return Event{
- eventType: SampleCountType,
- labels: map[<details>
<summary>Click me</summary>string]string{
- "sample": sample,
- },
- }
-}
-
func NewActionSetBackupCreated() Event {
return Event{
eventType: ActionSetBackupCreatedType,
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 17a0bb55..f76c60f9 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -8,25 +8,19 @@ import (
"github.com/prometheus/client_golang/prometheus"
)
-var counterVecs = make(map[MetricType]*prometheus.CounterVec)
+var counterVecs = map[string]*prometheus.CounterVec{}
// Initialize a Prometheus CounterVec for one metric and register it
-func initCounterVec(r prometheus.Registerer, t MetricType) (*prometheus.CounterVec, error) {
- metricTypeOpts, ok := MetricTypeOpts[t]
-
- if !ok {
- panic(fmt.Sprintf("Event type %s is not defined", t))
- }
-
+func initCounterVec(r prometheus.Registerer, m *Metric) (*prometheus.CounterVec, error) {
opts := prometheus.CounterOpts{
- Name: string(t),
- Help: metricTypeOpts.Help,
+ Name: string(m.Name),
+ Help: m.Help,
}
- counterVec := prometheus.NewCounterVec(opts, metricTypeOpts.LabelNames)
+ counterVec := prometheus.NewCounterVec(opts, m.LabelNames)
err := r.Register(counterVec)
if err != nil {
- return nil, fmt.Errorf("%s not registered: %s ", t, err)
+ return nil, fmt.Errorf("%s not registered: %s ", m.Name, err)
}
var alreadyRegisteredErr prometheus.AlreadyRegisteredError
if errors.As(err, &alreadyRegisteredErr) {
@@ -39,26 +33,26 @@ func initCounterVec(r prometheus.Registerer, t MetricType) (*prometheus.CounterV
}
// Initialize all the Counter Vecs and save it in a map
-func InitAllCounterVecs(r prometheus.Registerer) map[MetricType]*prometheus.CounterVec {
- for metricType := range MetricTypeOpts {
- cv, err := initCounterVec(r, metricType)
+func InitAllCounterVecs(r prometheus.Registerer) map[string]*prometheus.CounterVec {
+ for _, counter := range Counters {
+ cv, err := initCounterVec(r, counter)
if err != nil {
log.WithError(err).Print("Failed to register metric %s")
return nil
}
- counterVecs[metricType] = cv
+ counterVecs[counter.Name] = cv
}
return counterVecs
}
// Increment a Counter Vec metric
-func IncrementCounterVec(e Event) error {
- if counterVecs[e.eventType] == nil {
- return fmt.Errorf("%s Event Type not found", e.eventType)
+func IncrementCounterVec(s *Sample) error {
+ if counterVecs[s.Name] == nil {
+ return fmt.Errorf("%s Event Type not found", s)
}
- cv := counterVecs[e.eventType].With(e.labels)
+ cv := counterVecs[s.Name].With(s.Labels)
if cv == nil {
- return fmt.Errorf("%s Labels for %s Event Type not found", e.labels, e.eventType)
+ return fmt.Errorf("%s Labels for %s Event Type not found", s.Labels, s.Name)
}
cv.Inc()
return nil LMKWYT. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like much of this was modeled after the counter metrics I added in K10, so while I can't speak much to this code, I can offer some insights into the K10 design. There were two primary things considered in K10's design:
The Prometheus library has an annoying propensity to panic if a This meant that the developer shouldn't be able to freely associate a metric name with labels (because that was the case where we kept running into trouble and causing panics). So the metric type had a private metric name and set of labels that were instantiated via a constructor. This meant that we only needed to ensure that the constructors worked properly, which we did through a reflection based unit test. As for conceptual models for the metrics, I elected to use "event" over "sample" because inherently the transitionary event between a previous state and a new state (in other words, it's representing the point at which an increase happens). This is distinct from what I think of as a "sample": the measure of the overall state at an instantaneous point in time (not the transition between two states, just the measure of the state). To put a different way, I thought of a "sample" more as reporting the current value (i.e. a gauge), whereas "events" were the accumulation of individual changes (i.e. a counter). So if we're going to eschew the type protections, we should recognize what the consequences might be and we should decide if the terminology aligns with conceptual models (personally, "sample" doesn't seem to make as much sense to me in this context). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Before we toss away the type system prevention of Prometheus panics, please note that the Event structure facilitates writing a unit test that validates that the labels from each constructor match the labels that are used to initialize the CounterVec. Or so I've been told :). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think I am advocating that we tossed away the type system. In my diff, there is still a constructor to constrain the labels that can be passed in. E.g., func NewSampleActionSetTotal(action string, namespace string) *Sample {
// ....
} It isn't obvious to me how my diff is hindering the type system protection you are referring to. My intention is still this:
Re: conceptual model, I picked "sample" because it represents a data point instance in the timeseries. Yeah, I agree it doesn't align as well as I'd want, because in a TSDB, a sample of a metric time series is a k/v pair of timestamp to value (with all the label values defined). In this case, we only have the label values, not its value at that given time. Hence, I said, "closer to what you are trying to describe" 🙂. What I have in mind is like I am not convinced "event" is clearer; feels like this is where it gets subjective. I think we agree that type system protection is good, and we want some way to conceptually represent the metric and distinguish it from the metric instance at a given time, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps the concern with your diff is that the -- I agree that I'm not convinced "event" is more clear and that it's a very subjective topic 😄 However, one additional consideration might be: is using "sample" here going to cut off the term we would want to use for gauges in the future (when we add gauge metrics)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unexporting the public fields sound actionable to me. Personally, I am not that worried about the panics caused by initialization with unknown labels 😄, because:
I expect developers to learn and understand the packages they are using (through docs, code review, mentorship etc.), instead of relying on others to fail-proof things for them. I am not opposed to unexporting the public fields though. As for the usage of "sample", yeah, it's not a good choice. I prefer not to overload the TSDB definition of "sample", which is a k/v pair of timestamp to values, regardless of the metric type. I don't have a better name for it, so |
||
// eventType is the MetricType with which the individial Event is associated. | ||
// Labels are the metric labels that will be passed to Prometheus. | ||
// | ||
// Note: The type and labels are private in order to force the use of the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would like to see the portion of comment block here at top of this file to explain the relationship between MetricType and constructors. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
// event constructors below. This helps to prevent an event from being | ||
// accidentally misconstructed (e.g. with mismatching labels), which would | ||
// cause the Prometheus library to panic. | ||
type Event struct { | ||
eventType MetricType | ||
labels map[string]string | ||
} | ||
|
||
// MetricType returns the event's type. | ||
func (e *Event) Type() MetricType { | ||
return e.eventType | ||
} | ||
|
||
// Labels returns a copy of the event's labels. | ||
func (e *Event) Labels() map[string]string { | ||
labels := make(map[string]string) | ||
for k, v := range e.labels { | ||
labels[k] = v | ||
} | ||
return labels | ||
} | ||
|
||
func NewActionSetCreatedTotal(actionType string, namespace string) Event { | ||
return Event{ | ||
eventType: ActionSetCreatedTotalType, | ||
labels: map[string]string{ | ||
"actionType": actionType, | ||
"namespace": namespace, | ||
}, | ||
} | ||
} | ||
|
||
func NewActionSetCompletedTotal(actionName string, actionType string, blueprint string, namespace string, state string) Event { | ||
return Event{ | ||
eventType: ActionSetCompletedTotalType, | ||
labels: map[string]string{ | ||
"actionName": actionName, | ||
"actionType": actionType, | ||
"blueprint": blueprint, | ||
"namespace": namespace, | ||
"state": state, | ||
}, | ||
} | ||
} | ||
|
||
func NewActionSetFailedTotal(actionName string, actionType string, blueprint string, namespace string, state string) Event { | ||
return Event{ | ||
eventType: ActionSetFailedTotalType, | ||
labels: map[string]string{ | ||
"actionName": actionName, | ||
"actionType": actionType, | ||
"blueprint": blueprint, | ||
"namespace": namespace, | ||
"state": state, | ||
}, | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package metrics | ||
|
||
import ( | ||
"errors" | ||
"fmt" | ||
|
||
"github.com/kanisterio/kanister/pkg/log" | ||
"github.com/prometheus/client_golang/prometheus" | ||
) | ||
|
||
var counterVecs = make(map[MetricType]*prometheus.CounterVec) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than a global, it would be more idiomatic Go to have a That pointer would be stored in Controller. And then InitAllCounterVecs and IncrementCounterVec would be methods on *CounterVecMap. Otherwise we have a singleton global that can be harder to test and assumes that there is only ever 1 such map in an image. |
||
|
||
// Initialize a Prometheus CounterVec for one metric and register it | ||
func initCounterVec(r prometheus.Registerer, t MetricType) (*prometheus.CounterVec, error) { | ||
metricTypeOpts, ok := MetricCounterOpts[t] | ||
|
||
if !ok { | ||
return nil, fmt.Errorf("Event type %s is not defined", t) | ||
} | ||
|
||
opts := prometheus.CounterOpts{ | ||
Name: string(t), | ||
Help: metricTypeOpts.Help, | ||
} | ||
counterVec := prometheus.NewCounterVec(opts, metricTypeOpts.LabelNames) | ||
|
||
err := r.Register(counterVec) | ||
if err != nil { | ||
return nil, fmt.Errorf("%s not registered: %s ", t, err) | ||
} | ||
var alreadyRegisteredErr prometheus.AlreadyRegisteredError | ||
if errors.As(err, &alreadyRegisteredErr) { | ||
counterVec = alreadyRegisteredErr.ExistingCollector.(*prometheus.CounterVec) | ||
} else if err != nil { | ||
return nil, fmt.Errorf("Error registering Counter Vecs : %s ", err) | ||
} | ||
|
||
return counterVec, nil | ||
} | ||
|
||
// Initialize all the Counter Vecs and save it in a map | ||
func InitAllCounterVecs(r prometheus.Registerer) map[MetricType]*prometheus.CounterVec { | ||
for metricType := range MetricCounterOpts { | ||
cv, err := initCounterVec(r, metricType) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are all There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
if err != nil { | ||
log.WithError(err).Print("Failed to register metric %s") | ||
return nil | ||
} | ||
counterVecs[metricType] = cv | ||
} | ||
return counterVecs | ||
} | ||
|
||
// Increment a Counter Vec metric | ||
func IncrementCounterVec(e Event) error { | ||
if counterVecs[e.eventType] == nil { | ||
return fmt.Errorf("%s Event Type not found", e.eventType) | ||
} | ||
cv := counterVecs[e.eventType].With(e.labels) | ||
if cv == nil { | ||
return fmt.Errorf("%s Labels for %s Event Type not found", e.labels, e.eventType) | ||
} | ||
cv.Inc() | ||
return nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does the controller need to hold on to this? It doesn't look like the controller is doing anything with it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tested this out without keeping the counters in the controller and metric reporting did not work.
More testing required on this. Or a designated Reporter() for all the metrics.