Skip to content

Commit

Permalink
Monitoring: Add "NotEnough*" alerts for performance throttling
Browse files Browse the repository at this point in the history
The following alerts fire when a specific resource has been throttling
performance over the last hour:

* NotEnoughExecutors: some jobs were assigned a node but had to wait for
  a free executor to begin
* NotEnoughMergers: some merge jobs could not start because there was
  no available merger
* NotEnoughTestNodes: Nodepool couldn't provide test nodes right away

Change-Id: I81b0bdbdae32ee1111034d35b24d68248d997c3a
  • Loading branch information
mhuin authored and morucci committed Oct 11, 2023
1 parent ba3b258 commit c4e91ce
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 5 deletions.
61 changes: 57 additions & 4 deletions controllers/zuul.go
Original file line number Diff line number Diff line change
Expand Up @@ -399,11 +399,11 @@ func (r *SFController) EnsureZuulPodMonitor() bool {

// create default alerts
func (r *SFController) ensureZuulPromRule() bool {
/* Alert when a config-update job fails on the config repository */
configUpdateFailureInPostAnnotations := map[string]string{
"description": "A config-update job failed in the post pipeline. Latest changes might not have been applied. Please check services configurations",
"summary": "config-update failure post merge",
}

configUpdateFailureInPost := monitoring.MkPrometheusAlertRule(
"ConfigUpdateFailureInPostPipeline",
intstr.FromString(
Expand All @@ -413,15 +413,68 @@ func (r *SFController) ensureZuulPromRule() bool {
monitoring.CriticalSeverityLabel,
configUpdateFailureInPostAnnotations,
)

configRepoRuleGroup := monitoring.MkPrometheusRuleGroup(
"config-repository_default.rules",
[]monitoringv1.Rule{configUpdateFailureInPost})

/* Alert when executors are saturated */
notEnoughExecutorsAnnotations := map[string]string{
"description": "Some jobs have been waiting for an executor to run on in the last hour",
"summary": "Not enough executors",
}
notEnoughExecutors := monitoring.MkPrometheusAlertRule(
"NotEnoughExecutors",
intstr.FromString(
"rate(zuul_executors_jobs_queued[1h]) > 0"),
"1h",
monitoring.WarningSeverityLabel,
notEnoughExecutorsAnnotations,
)

/* Alert when mergers are saturated */
notEnoughMergersAnnotations := map[string]string{
"description": "Some merge jobs have been waiting for a merger to run on in the last hour",
"summary": "Not enough mergers",
}
notEnoughMergers := monitoring.MkPrometheusAlertRule(
"NotEnoughMergers",
intstr.FromString(
"rate(zuul_mergers_jobs_queued[1h]) > 0"),
"1h",
monitoring.WarningSeverityLabel,
notEnoughMergersAnnotations,
)

/* Alert when node requests are saturated */
notEnoughNodesAnnotations := map[string]string{
"description": "Nodepool had outstanding node requests in the last hour",
"summary": "Not enough testing nodes",
}
notEnoughNodes := monitoring.MkPrometheusAlertRule(
"NotEnoughTestNodes",
intstr.FromString(
"rate(zuul_nodepool_current_requests[1h]) > 0"),
"1h",
monitoring.WarningSeverityLabel,
notEnoughNodesAnnotations,
)

zuulRuleGroup := monitoring.MkPrometheusRuleGroup(
"zuul_default.rules",
[]monitoringv1.Rule{
notEnoughExecutors,
notEnoughMergers,
notEnoughNodes,
})

desiredZuulPromRule := monitoring.MkPrometheusRuleCR("zuul-default.rules", r.ns)
desiredZuulPromRule.Spec.Groups = append(desiredZuulPromRule.Spec.Groups, configRepoRuleGroup)
desiredZuulPromRule.Spec.Groups = append(
desiredZuulPromRule.Spec.Groups,
configRepoRuleGroup,
zuulRuleGroup)

annotations := map[string]string{
"version": "1",
"version": "2",
}
desiredZuulPromRule.ObjectMeta.Annotations = annotations
currentPromRule := monitoringv1.PrometheusRule{}
Expand Down
11 changes: 10 additions & 1 deletion doc/deployment/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,16 @@ The following alerting rules are created automatically at deployment time:
| `OutOfDiskNow` | critical | Log server | disk_default.rules | The Log server has less than 10% free storage space left |
| `OutOfDiskInThreeDays` | warning | Log server | disk_default.rules | Assuming a linear trend, the Log server's storage space will fill up in less than three days |
| `ConfigUpdateFailureInPostPipeline` | critical | Zuul | config-repository_default.rules | A `config-update` job failed in the `post` pipeline, meaning a configuration change was not applied properly to the Software Factory deployment's services |
| `NotEnoughExecutors` | warning | Zuul | zuul_default.rules | A lack of resources has been throttling performance over the last hour; in this case, some jobs had to wait for an available executor to run on |
| `NotEnoughMergers` | warning | Zuul | zuul_default.rules | A lack of resources has been throttling performance over the last hour; in this case, some merge jobs had to wait for an available merger to run on |
| `NotEnoughTestNodes` | warning | Zuul | zuul_default.rules | A lack of resources has been throttling performance over the last hour; in this case, Nodepool could not fulfill node requests right away |
| `DIBImageBuildFailure` | warning | nodepool-builder | builder_default.rules | The disk-image-builder service (DIB) failed to build an image |
| `HighOpenStackAPIError5xxRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of API calls on an OpenStack provider return a status code of 5xx (server-side error) over a period of 15 minutes |
| `HighFailedStateRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of nodes on a provider are in failed state over a period of one hour |
| `HighNodeLaunchErrorRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of node launch events end in an error state over a period of one hour |
| `HighNodeLaunchErrorRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of node launch events end in an error state over a period of one hour |

Note that these alerts are generic and might not be relevant to the specifics of your deployment.
For instance, it may be normal to hit the `NotEnoughTestNodes` alert if resource quotas are in place
on your Nodepool providers.

You are encouraged to [create your own alerts](https://prometheus-operator.dev/docs/user-guides/alerting/#deploying-prometheus-rules), using these as a base.

0 comments on commit c4e91ce

Please sign in to comment.