Monitoring: Add statsd-exporter sidecar to nodepool and zuul-scheduler
Create a nodepool PodMonitor and extend Zuul's PodMonitor to scrape
statsd metrics as well.

Zuul's statsd mapping configuration is minimal and will likely need
to be completed in the future (low-hanging fruit, but somewhat tedious
work).

Change-Id: Ife0c02796841b60376b60f803c3e7a2bf2ad5478
mhuin committed Oct 10, 2023
1 parent 0af5f0a commit d9a7059
Showing 14 changed files with 510 additions and 116 deletions.
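
To make the intent of the new helpers concrete, here is a minimal sketch of how the pieces added in controllers/libs/monitoring/monitoring.go (shown below) could be wired together in a service controller. This is illustrative only, not code from this commit: the addStatsdMonitoring helper, the "zuul-statsd-config" volume name, and the service/label names are assumptions; the sfmonitoring functions and constants are the ones introduced by this change.

package example

import (
	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
	sfmonitoring "github.com/softwarefactory-project/sf-operator/controllers/libs/monitoring"
	appsv1 "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// addStatsdMonitoring (hypothetical) attaches the statsd-exporter sidecar to a
// deployment and returns a PodMonitor scraping both exporter ports.
func addStatsdMonitoring(dep *appsv1.Deployment, ns string, statsdTarget string) monitoringv1.PodMonitor {
	// The optional spec field statsdTarget ("host:port") becomes the relay address.
	var relay *string
	if statsdTarget != "" {
		relay = &statsdTarget
	}

	// Statsd-exporter sidecar; "zuul-statsd-config" is an assumed ConfigMap volume
	// carrying statsd_mapping.yaml.
	sidecar := sfmonitoring.MkStatsdExporterSideCarContainer("zuul-scheduler", "zuul-statsd-config", relay)
	dep.Spec.Template.Spec.Containers = append(dep.Spec.Template.Spec.Containers, sidecar)

	// Scrape both the node-exporter port and the statsd-exporter metrics port.
	ports := []string{
		sfmonitoring.GetTruncatedPortName("zuul-scheduler", sfmonitoring.NodeExporterPortNameSuffix),
		sfmonitoring.GetStatsdExporterPort("zuul-scheduler"),
	}
	selector := metav1.LabelSelector{MatchLabels: map[string]string{"run": "zuul-scheduler"}}
	return sfmonitoring.MkPodMonitor("zuul-scheduler-monitor", ns, ports, selector)
}

The zuul-scheduler and nodepool controllers touched by this change do the equivalent wiring for their own service names, mapping ConfigMaps, and relay targets taken from the new statsdTarget spec fields.
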
5 changes: 5 additions & 0 deletions api/v1/softwarefactory_types.go
@@ -127,6 +127,8 @@ type ZuulExecutorSpec struct {
type ZuulSchedulerSpec struct {
// Storage-related settings
Storage StorageSpec `json:"storage,omitempty"`
// The address to forward statsd metrics to (optional), in the form "host:port"
StatsdTarget string `json:"statsdTarget,omitempty"`
}

// TODO: make sure to update the GetConnectionsName when adding new connection type.
@@ -193,9 +195,12 @@ type NodepoolBuilderSpec struct {
}

type NodepoolSpec struct {
// Nodepool-launcher related settings
Launcher NodepoolLauncherSpec `json:"launcher,omitempty"`
// Nodepool-builder related settings
Builder NodepoolBuilderSpec `json:"builder,omitempty"`
// The address to forward statsd metrics to (optional), in the form "host:port"
StatsdTarget string `json:"statsdTarget,omitempty"`
}

type ZookeeperSpec struct {
9 changes: 5 additions & 4 deletions cli/sfconfig/cmd/zuul/zuul.go
@@ -72,10 +72,11 @@ administrative actions on a specified tenant.`,
buffer := &bytes.Buffer{}
errorBuffer := &bytes.Buffer{}
request := kubeClientSet.CoreV1().RESTClient().Post().Resource("pods").Namespace(namespace).Name(zuulSchedulerContainer.Name).SubResource("exec").VersionedParams(&v1.PodExecOptions{
Command: zuulAdminArgs,
Stdin: false,
Stdout: true,
Stderr: true,
Container: "zuul-scheduler",
Command: zuulAdminArgs,
Stdin: false,
Stdout: true,
Stderr: true,
}, scheme.ParameterCodec)

exec, _ := remotecommand.NewSPDYExecutor(kubeConfig, "POST", request.URL())
9 changes: 5 additions & 4 deletions cli/sfconfig/cmd/zuul_client/zuul_client.go
@@ -96,10 +96,11 @@ Examples:
buf := &bytes.Buffer{}
errBuf := &bytes.Buffer{}
request := kubeClientSet.CoreV1().RESTClient().Post().Resource("pods").Namespace(namespace).Name(zuulwebcontainer.Name).SubResource("exec").VersionedParams(&v1.PodExecOptions{
Command: zuulClientArgs,
Stdin: false,
Stdout: true,
Stderr: true,
Container: "zuul-web",
Command: zuulClientArgs,
Stdin: false,
Stdout: true,
Stderr: true,
}, scheme.ParameterCodec)

exec, _ := remotecommand.NewSPDYExecutor(kubeConfig, "POST", request.URL())
@@ -251,6 +251,7 @@ spec:
type: object
type: object
launcher:
description: Nodepool-launcher related settings
properties:
logLevel:
description: 'Specify the Log Level of the nodepool launcher
@@ -262,6 +263,10 @@
- DEBUG
type: string
type: object
statsdTarget:
description: The address to forward statsd metrics to (optional),
in the form "host:port"
type: string
type: object
storageClassName:
description: Default storage class to use by Persistent Volume Claims
@@ -456,6 +461,10 @@
scheduler:
description: Configuration of the scheduler microservice
properties:
statsdTarget:
description: The address to forward statsd metrics to (optional),
in the form "host:port"
type: string
storage:
description: Storage-related settings
properties:
139 changes: 129 additions & 10 deletions controllers/libs/monitoring/monitoring.go
@@ -1,15 +1,133 @@
// Copyright (C) 2023 Red Hat
// SPDX-License-Identifier: Apache-2.0

// Package monitoring provides various utility functions regarding monitoring for the sf-operator
/*
Package monitoring provides various utility functions regarding monitoring for the sf-operator:
* create prometheus monitors and alert rules
* create nodeexporter sidecar
* create statsdexporter sidecar
*/
package monitoring

import (
"math"
"strconv"

monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/softwarefactory-project/sf-operator/controllers/libs/base"
apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

func GetTruncatedPortName(serviceName string, suffix string) string {
// Port name is limited to 15 chars
var length = float64(len(serviceName))
var maxChars = 15 - float64(len(suffix))
var upper = int(math.Min(maxChars, length))
var exporterPortName = serviceName[:upper] + suffix
return exporterPortName
}
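
// For example (illustrative): GetTruncatedPortName("zuul-scheduler", "-ne")
// leaves room for 15-3 = 12 characters of the service name, so
// "zuul-scheduler" (14 chars) is truncated to "zuul-schedul" and the
// resulting port name is "zuul-schedul-ne", exactly 15 characters.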

// Node exporter utilities

const NodeExporterNameSuffix = "-nodeexporter"
const NodeExporterPortNameSuffix = "-ne"
const nodeExporterPort = 9100

const NodeExporterImage = "quay.io/prometheus/node-exporter:latest"

// Fun fact: arrays cannot be consts, so we define our args in this function.
func getNodeExporterArgs(volumeMounts []apiv1.VolumeMount) []string {
var excludePaths = "^(/etc/hosts|/etc/hostname|/etc/passwd|/etc/resolv.conf|/run/.containerenv|/run/secrets|/dev|/proc|/sys)($|/)"
return []string{
"--collector.disable-defaults",
"--collector.filesystem",
"--collector.filesystem.mount-points-exclude=" + excludePaths,
}
}

func MkNodeExporterSideCarContainer(serviceName string, volumeMounts []apiv1.VolumeMount) apiv1.Container {
container := base.MkContainer(serviceName+NodeExporterNameSuffix, NodeExporterImage)
container.Args = getNodeExporterArgs(volumeMounts)
container.Ports = []apiv1.ContainerPort{
base.MkContainerPort(nodeExporterPort, GetTruncatedPortName(serviceName, NodeExporterPortNameSuffix)),
}
container.VolumeMounts = volumeMounts
return container
}

func MkNodeExporterSideCarService(serviceName string, namespace string) apiv1.Service {
var portName = GetTruncatedPortName(serviceName, NodeExporterPortNameSuffix)
servicePorts := []int32{nodeExporterPort}
neService := base.MkService(serviceName+NodeExporterPortNameSuffix, namespace, serviceName, servicePorts, portName)
return neService

}

// Statsd exporter utilities

const statsdExporterNameSuffix = "-statsd"
const statsdExporterPortNameSuffix = "-se"
const StatsdExporterPortListen = int32(9125)
const statsdExporterPortExpose = int32(9102)
const StatsdExporterConfigFile = "statsd_mapping.yaml"
const statsdExporterImage = "quay.io/prometheus/statsd-exporter:v0.24.0"

func getStatsdExporterArgs(configPath string, relayAddress *string) []string {
args := []string{
"--statsd.mapping-config=" + configPath,
"--statsd.listen-udp=:" + strconv.Itoa(int(StatsdExporterPortListen)),
"--web.listen-address=:" + strconv.Itoa(int(statsdExporterPortExpose)),
}
if relayAddress != nil {
args = append(args, "--statsd.relay.address="+*relayAddress)
}
return args
}
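
// For example (illustrative values): getStatsdExporterArgs("/tmp/statsd_mapping.yaml", nil)
// produces:
//   --statsd.mapping-config=/tmp/statsd_mapping.yaml
//   --statsd.listen-udp=:9125
//   --web.listen-address=:9102
// and passing a relay address such as "statsd.example.com:8125" appends
//   --statsd.relay.address=statsd.example.com:8125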

func GetStatsdExporterPort(serviceName string) string {
return GetTruncatedPortName(serviceName, statsdExporterPortNameSuffix+"e")
}

func MkStatsdExporterSideCarContainer(serviceName string, configVolumeName string, relayAddress *string) apiv1.Container {
var seListenPortName = GetTruncatedPortName(serviceName, statsdExporterPortNameSuffix+"l")
var seExposePortName = GetStatsdExporterPort(serviceName)
var configFile = StatsdExporterConfigFile
var configPath = "/tmp/" + configFile
// var configVolumeName = serviceName + "-statsd-conf"

volumeMounts := []apiv1.VolumeMount{
{
Name: configVolumeName,
MountPath: configPath,
SubPath: configFile,
},
}
args := getStatsdExporterArgs(configPath, relayAddress)
ports := []apiv1.ContainerPort{
{
Name: seListenPortName,
Protocol: apiv1.ProtocolUDP,
ContainerPort: StatsdExporterPortListen,
},
{
Name: seExposePortName,
Protocol: apiv1.ProtocolTCP,
ContainerPort: statsdExporterPortExpose,
},
}
sidecar := base.MkContainer(serviceName+statsdExporterNameSuffix, statsdExporterImage)
sidecar.Args = args
sidecar.VolumeMounts = volumeMounts
sidecar.Ports = ports

return sidecar
}
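
// For example (illustrative names): MkStatsdExporterSideCarContainer("zuul-scheduler",
// "zuul-statsd-config", nil) returns a "zuul-scheduler-statsd" container that mounts
// statsd_mapping.yaml from the "zuul-statsd-config" volume at /tmp/statsd_mapping.yaml,
// receives statsd datagrams on UDP 9125 and exposes Prometheus metrics on TCP 9102.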

// Prometheus utilities

// ServiceMonitorLabelSelector - TODO this could be a spec parameter.
const ServiceMonitorLabelSelector = "sf-monitoring"

@@ -34,7 +152,7 @@ func MkPrometheusAlertRule(name string, expr intstr.IntOrString, forDuration str
}

//lint:ignore U1000 this function will be used in a followup change
func mkServiceMonitor(name string, ns string, port string, selector metav1.LabelSelector) monitoringv1.ServiceMonitor {
func mkServiceMonitor(name string, ns string, portName string, selector metav1.LabelSelector) monitoringv1.ServiceMonitor {
return monitoringv1.ServiceMonitor{
ObjectMeta: metav1.ObjectMeta{
Name: name,
@@ -47,7 +165,7 @@ func mkServiceMonitor(name string, ns string, port string, selector metav1.Label
Endpoints: []monitoringv1.Endpoint{
{
Interval: monitoringv1.Duration("30s"),
Port: port,
Port: portName,
Scheme: "http",
},
},
@@ -56,7 +174,12 @@ func mkServiceMonitor(name string, ns string, port string, selector metav1.Label
}
}

func MkPodMonitor(name string, ns string, port string, selector metav1.LabelSelector) monitoringv1.PodMonitor {
func MkPodMonitor(name string, ns string, ports []string, selector metav1.LabelSelector) monitoringv1.PodMonitor {
endpoints := []monitoringv1.PodMetricsEndpoint{}
for _, port := range ports {
endpoints = append(endpoints, monitoringv1.PodMetricsEndpoint{Port: port})
}

return monitoringv1.PodMonitor{
ObjectMeta: metav1.ObjectMeta{
Name: name,
@@ -66,12 +189,8 @@ func MkPodMonitor(name string, ns string, port string, selector metav1.LabelSele
},
},
Spec: monitoringv1.PodMonitorSpec{
Selector: selector,
PodMetricsEndpoints: []monitoringv1.PodMetricsEndpoint{
{
Port: port,
},
},
Selector: selector,
PodMetricsEndpoints: endpoints,
},
}
}
19 changes: 10 additions & 9 deletions controllers/logserver_controller.go
@@ -28,7 +28,7 @@ import (

"github.com/softwarefactory-project/sf-operator/controllers/libs/base"
"github.com/softwarefactory-project/sf-operator/controllers/libs/conds"
"github.com/softwarefactory-project/sf-operator/controllers/libs/monitoring"
sfmonitoring "github.com/softwarefactory-project/sf-operator/controllers/libs/monitoring"
"github.com/softwarefactory-project/sf-operator/controllers/libs/utils"
)

@@ -100,8 +100,8 @@ func (r *LogServerController) ensureLogserverPodMonitor() bool {
"run": logserverIdent,
},
}
nePort := GetNodeexporterPortName(logserverIdent)
desiredLsPodmonitor := monitoring.MkPodMonitor(logserverIdent+"-monitor", r.ns, nePort, selector)
nePort := sfmonitoring.GetTruncatedPortName(logserverIdent, sfmonitoring.NodeExporterPortNameSuffix)
desiredLsPodmonitor := sfmonitoring.MkPodMonitor(logserverIdent+"-monitor", r.ns, []string{nePort}, selector)
// add annotations so we can handle lifecycle
annotations := map[string]string{
"version": "1",
@@ -137,7 +137,7 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
"description": "Log server only has at most three days' worth ({{ $value | humanize1024 }}) of free disk available.",
"summary": "Log server running out of disk",
}
diskFull := monitoring.MkPrometheusAlertRule(
diskFull := sfmonitoring.MkPrometheusAlertRule(
"OutOfDiskNow",
intstr.FromString(
"(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} * 100 /"+
@@ -147,7 +147,7 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
diskFullLabels,
diskFullAnnotations,
)
diskFullIn3days := monitoring.MkPrometheusAlertRule(
diskFullIn3days := sfmonitoring.MkPrometheusAlertRule(
"OutOfDiskInThreeDays",
intstr.FromString(
"(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} * 100 /"+
@@ -158,10 +158,10 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
map[string]string{},
diskFull3daysAnnotations,
)
lsDiskRuleGroup := monitoring.MkPrometheusRuleGroup(
lsDiskRuleGroup := sfmonitoring.MkPrometheusRuleGroup(
"disk.rules",
[]monitoringv1.Rule{diskFull, diskFullIn3days})
desiredLsPromRule := monitoring.MkPrometheusRuleCR(logserverIdent+".rules", r.ns)
desiredLsPromRule := sfmonitoring.MkPrometheusRuleCR(logserverIdent+".rules", r.ns)
desiredLsPromRule.Spec.Groups = append(desiredLsPromRule.Spec.Groups, lsDiskRuleGroup)

// add annotations so we can handle lifecycle
@@ -345,7 +345,7 @@ func (r *LogServerController) DeployLogserver() sfv1.LogServerStatus {
},
}

statsExporter := createNodeExporterSideCarContainer(logserverIdent, volumeMountsStatsExporter)
statsExporter := sfmonitoring.MkNodeExporterSideCarContainer(logserverIdent, volumeMountsStatsExporter)
dep.Spec.Template.Spec.Containers = append(dep.Spec.Template.Spec.Containers, statsExporter)

// Increase serial each time you need to enforce a deployment change/pod restart between operator versions
@@ -378,7 +378,8 @@ func (r *LogServerController) DeployLogserver() sfv1.LogServerStatus {
sshdService := base.MkService(sshdPortName, r.ns, logserverIdent, sshdServicePorts, sshdPortName)
r.GetOrCreate(&sshdService)

r.getOrCreateNodeExporterSideCarService(logserverIdent)
nodeExporterSidecarService := sfmonitoring.MkNodeExporterSideCarService(logserverIdent, r.ns)
r.GetOrCreate(&nodeExporterSidecarService)

pvcReadiness := r.reconcileExpandPVC(logserverIdent, r.cr.Spec.Settings.Storage)
