Monitoring: Add "NotEnough*" alerts for performance throttling

The following alerts fire when a specific resource has been throttling performance over the last hour:

* NotEnoughExecutors: some jobs were assigned a node but had to wait for a free executor to begin
* NotEnoughMergers: some merge jobs could not start because there was no available merger
* NotEnoughTestNodes: Nodepool couldn't provide test nodes right away

Change-Id: I81b0bdbdae32ee1111034d35b24d68248d997c3a
softwarefactory-project · Oct 11, 2023 · c4e91ce · c4e91ce
1 parent ba3b258
commit c4e91ce
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 5 deletions.
diff --git a/controllers/zuul.go b/controllers/zuul.go
@@ -399,11 +399,11 @@ func (r *SFController) EnsureZuulPodMonitor() bool {
 
 // create default alerts
 func (r *SFController) ensureZuulPromRule() bool {
+	/* Alert when a config-update job fails on the config repository */
 	configUpdateFailureInPostAnnotations := map[string]string{
 		"description": "A config-update job failed in the post pipeline. Latest changes might not have been applied. Please check services configurations",
 		"summary":     "config-update failure post merge",
 	}
-
 	configUpdateFailureInPost := monitoring.MkPrometheusAlertRule(
 		"ConfigUpdateFailureInPostPipeline",
 		intstr.FromString(
@@ -413,15 +413,68 @@ func (r *SFController) ensureZuulPromRule() bool {
 		monitoring.CriticalSeverityLabel,
 		configUpdateFailureInPostAnnotations,
 	)
-
 	configRepoRuleGroup := monitoring.MkPrometheusRuleGroup(
 		"config-repository_default.rules",
 		[]monitoringv1.Rule{configUpdateFailureInPost})
+
+	/* Alert when executors are saturated */
+	notEnoughExecutorsAnnotations := map[string]string{
+		"description": "Some jobs have been waiting for an executor to run on in the last hour",
+		"summary":     "Not enough executors",
+	}
+	notEnoughExecutors := monitoring.MkPrometheusAlertRule(
+		"NotEnoughExecutors",
+		intstr.FromString(
+			"rate(zuul_executors_jobs_queued[1h]) > 0"),
+		"1h",
+		monitoring.WarningSeverityLabel,
+		notEnoughExecutorsAnnotations,
+	)
+
+	/* Alert when mergers are saturated */
+	notEnoughMergersAnnotations := map[string]string{
+		"description": "Some merge jobs have been waiting for a merger to run on in the last hour",
+		"summary":     "Not enough mergers",
+	}
+	notEnoughMergers := monitoring.MkPrometheusAlertRule(
+		"NotEnoughMergers",
+		intstr.FromString(
+			"rate(zuul_mergers_jobs_queued[1h]) > 0"),
+		"1h",
+		monitoring.WarningSeverityLabel,
+		notEnoughMergersAnnotations,
+	)
+
+	/* Alert when node requests are saturated */
+	notEnoughNodesAnnotations := map[string]string{
+		"description": "Nodepool had outstanding node requests in the last hour",
+		"summary":     "Not enough testing nodes",
+	}
+	notEnoughNodes := monitoring.MkPrometheusAlertRule(
+		"NotEnoughTestNodes",
+		intstr.FromString(
+			"rate(zuul_nodepool_current_requests[1h]) > 0"),
+		"1h",
+		monitoring.WarningSeverityLabel,
+		notEnoughNodesAnnotations,
+	)
+
+	zuulRuleGroup := monitoring.MkPrometheusRuleGroup(
+		"zuul_default.rules",
+		[]monitoringv1.Rule{
+			notEnoughExecutors,
+			notEnoughMergers,
+			notEnoughNodes,
+		})
+
 	desiredZuulPromRule := monitoring.MkPrometheusRuleCR("zuul-default.rules", r.ns)
-	desiredZuulPromRule.Spec.Groups = append(desiredZuulPromRule.Spec.Groups, configRepoRuleGroup)
+	desiredZuulPromRule.Spec.Groups = append(
+		desiredZuulPromRule.Spec.Groups,
+		configRepoRuleGroup,
+		zuulRuleGroup)
 
 	annotations := map[string]string{
-		"version": "1",
+		"version": "2",
 	}
 	desiredZuulPromRule.ObjectMeta.Annotations = annotations
 	currentPromRule := monitoringv1.PrometheusRule{}

diff --git a/doc/deployment/monitoring.md b/doc/deployment/monitoring.md
@@ -87,7 +87,16 @@ The following alerting rules are created automatically at deployment time:
 | `OutOfDiskNow` | critical | Log server | disk_default.rules | The Log server has less than 10% free storage space left |
 | `OutOfDiskInThreeDays` | warning | Log server | disk_default.rules | Assuming a linear trend, the Log server's storage space will fill up in less than three days |
 | `ConfigUpdateFailureInPostPipeline` | critical | Zuul | config-repository_default.rules | A `config-update` job failed in the `post` pipeline, meaning a configuration change was not applied properly to the Software Factory deployment's services |
+| `NotEnoughExecutors` | warning | Zuul | zuul_default.rules | A lack of resources has been throttling performance over the last hour; in this case, some jobs have been waiting for an available executor to run on |
+| `NotEnoughMergers` | warning | Zuul | zuul_default.rules | A lack of resources has been throttling performance over the last hour; in this case, some merge jobs have been waiting for an available merger to run on |
+| `NotEnoughTestNodes` | warning | Zuul | zuul_default.rules | A lack of resources has been throttling performance over the last hour; in this case, Nodepool could not fulfill node requests right away |
 | `DIBImageBuildFailure` | warning | nodepool-builder | builder_default.rules | the disk-image-builder service (DIB) failed to build an image |
 | `HighOpenStackAPIError5xxRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of API calls on an OpenStack provider return a status code of 5xx (server-side error) over a period of 15 minutes |
 | `HighFailedStateRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of nodes on a provider are in failed state over a period of one hour |
-| `HighNodeLaunchErrorRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of node launch events end in an error state over a period of one hour |
+| `HighNodeLaunchErrorRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of node launch events end in an error state over a period of one hour |
+
+Note that these alerts are generic and might not fit the specifics of your deployment.
+For instance, it may be normal to hit the `NotEnoughTestNodes` alert if resource quotas are in place
+on your Nodepool providers.
+
+You are encouraged to [create your own alerts](https://prometheus-operator.dev/docs/user-guides/alerting/#deploying-prometheus-rules), using these as a starting point.