From f9f6337d43d46bcaa69a22fa1a0205870c27bc25 Mon Sep 17 00:00:00 2001 From: Adam Fabian Date: Fri, 2 Aug 2024 14:24:13 -0500 Subject: [PATCH] feat: add Prometheus pushgateway and metric collection for OVN backup (#381) * feat: add Prometheus pushgateway and metric collection for OVN backup JIRA:OSPC-551 * Add disk percent usage gauge to OVN backup metrics. * Add alerting rules based on collected OVN backup metrics. * typo correction for ovn backup alert * Put prometheus-pushgateway.md in mkdocs.yml --- .../ovn/ovn-backup/ovn-backup.config | 5 + base-kustomize/ovn/ovn-backup/ovn-backup.sh | 167 +++++++- .../prometheus-pushgateway/kustomization.yaml | 7 + .../prometheus-pushgateway/values.yaml | 367 ++++++++++++++++++ base-kustomize/prometheus/alerting_rules.yaml | 39 ++ docs/monitoring-getting-started.md | 5 +- docs/prometheus-monitoring-overview.md | 1 + docs/prometheus-pushgateway.md | 18 + mkdocs.yml | 1 + 9 files changed, 597 insertions(+), 13 deletions(-) create mode 100644 base-kustomize/prometheus-pushgateway/kustomization.yaml create mode 100644 base-kustomize/prometheus-pushgateway/values.yaml create mode 100644 docs/prometheus-pushgateway.md diff --git a/base-kustomize/ovn/ovn-backup/ovn-backup.config b/base-kustomize/ovn/ovn-backup/ovn-backup.config index d377c253..57af1a9f 100644 --- a/base-kustomize/ovn/ovn-backup/ovn-backup.config +++ b/base-kustomize/ovn/ovn-backup/ovn-backup.config @@ -6,6 +6,11 @@ BACKUP_DIR=/backup LOG_FILE=/backup/upload.log LOG_LEVEL=INFO +# Upload metrics to Prometheus +PROMETHEUS_PUSHGATEWAY_URL=http://prometheus-pushgateway.prometheus.svc.cluster.local:9091 +PROMETHEUS_JOB_NAME=ovn-backup +PROMETHEUS_UPLOAD=false + # From here forward, variables for uploading to Swift with tempauth SWIFT_TEMPAUTH_UPLOAD=false # If you change this to "true", set the variables in swift-tempauth.env diff --git a/base-kustomize/ovn/ovn-backup/ovn-backup.sh b/base-kustomize/ovn/ovn-backup/ovn-backup.sh index a3c25083..65b87ffb 100644 --- 
a/base-kustomize/ovn/ovn-backup/ovn-backup.sh +++ b/base-kustomize/ovn/ovn-backup/ovn-backup.sh @@ -28,7 +28,6 @@ log_level() { ;; esac } -export -f log_level log_line() { local LEVEL @@ -40,7 +39,116 @@ log_line() { echo "$line" | tee -a "$LOG_FILE" fi } -export -f log_line # exported for upload_file + +# Stats files init. These mostly get used to send to Prometheus, but you could +# just read them if you want to. + +STATS_DIR="${BACKUP_DIR}/stats" + +[[ -d "$STATS_DIR" ]] || mkdir "$STATS_DIR" + +declare -A metric_types=( + ["run_count"]="counter" + ["run_timestamp"]="counter" + ["save_pairs_to_disk_success_count"]="counter" + ["save_pairs_to_disk_success_timestamp"]="counter" + ["save_pairs_to_disk_failure_count"]="counter" + ["save_pairs_to_disk_failure_timestamp"]="counter" + ["upload_attempt_count"]="counter" + ["upload_attempt_timestamp"]="counter" + ["upload_pairs_success_count"]="counter" + ["upload_pairs_success_timestamp"]="counter" + ["upload_pairs_failure_count"]="counter" + ["upload_pairs_failure_timestamp"]="counter" + ["disk_files_gauge"]="gauge" + ["disk_used_percent_gauge"]="gauge" + ["swift_objects_gauge"]="gauge" +) + +# Initialize metrics/stats files with 0 if they don't exist +for metric_filename in "${!metric_types[@]}" +do + metric_file_fullname="${STATS_DIR}/$metric_filename" + [[ -e "$metric_file_fullname" ]] || echo "0" > "$metric_file_fullname" +done + +# get_metric takes the metric name, reads the metric file, and echos the value +get_metric() { + local STAT_NAME + local STAT_FULL_FILENAME + STAT_NAME="$1" + STAT_FULL_FILENAME="${STATS_DIR}/$STAT_NAME" + VALUE="$(cat "$STAT_FULL_FILENAME")" + echo "$VALUE" +} + +# update count $1: stat name, $2 new value +# Used for updating disk file count and Cloud Files object counts. 
+update_metric() { + local STAT_NAME + local VALUE + STAT_NAME="$1" + VALUE="$2" + STAT_FULL_FILENAME="${STATS_DIR}/$STAT_NAME" + echo "$VALUE" > "$STAT_FULL_FILENAME" +} + +# increment increments a stats counter $1 by 1 +increment() { + local VALUE + local METRIC_NAME + METRIC_NAME="$1" + VALUE="$(get_metric "$METRIC_NAME")" + ((VALUE++)) + update_metric "$METRIC_NAME" "$VALUE" +} + +# Save epoch time to metric $1 +timestamp_metric() { + local METRIC_NAME + METRIC_NAME="$1" + update_metric "$METRIC_NAME" "$(date +%s)" +} + +increment run_count +timestamp_metric run_timestamp + +finalize_and_upload_metrics() { + local FILE_COUNT + FILE_COUNT=$(find "$BACKUP_DIR" -name \*.backup | wc -l) + update_metric disk_files_gauge "$FILE_COUNT" + local DISK_PERCENT_USED + DISK_PERCENT_USED=$(df "$BACKUP_DIR" | perl -lane 'next unless $. == 2; print int($F[4])') + update_metric disk_used_percent_gauge "$DISK_PERCENT_USED" + local OBJECT_COUNT + if [[ "$SWIFT_TEMPAUTH_UPLOAD" == "true" ]] + then + OBJECT_COUNT=$($SWIFT stat "$CONTAINER" | awk '/Objects:/ { print $2 }') + update_metric swift_objects_gauge "$OBJECT_COUNT" + fi + + if [[ "$PROMETHEUS_UPLOAD" != "true" ]] + then + exit 0 + fi + + for metric in "${!metric_types[@]}" + do + echo "# TYPE $metric ${metric_types[$metric]} +$metric{label=\"ovn-backup\"} $(get_metric "$metric")" | \ + curl -sS \ + "$PROMETHEUS_PUSHGATEWAY_URL/metrics/job/$PROMETHEUS_JOB_NAME" \ + --data-binary @- + done + + # Put metrics in the log if running at DEBUG level. + perl -ne 'print "$ARGV $_"' /backup/stats/* | cut -d / -f 4 | \ + while read -r read_metric + do + log_line DEBUG "run end metric $read_metric" + done +} +trap finalize_and_upload_metrics EXIT INT TERM HUP # Delete old backup files on volume. cd "$BACKUP_DIR" || exit 2 @@ -51,8 +159,29 @@ find "$BACKUP_DIR" -ctime +"$RETENTION_DAYS" -delete; YMD="$(date +"%Y/%m/%d")" # kubectl-ko creates backups in $PWD, so we cd first. 
mkdir -p "$YMD" && cd "$YMD" || exit 2 -/kube-ovn/kubectl-ko nb backup || log_line ERROR "nb backup failed" -/kube-ovn/kubectl-ko sb backup || log_line ERROR "sb backup failed" + +# This treats the saved failed and success count as a single metric for both +# backups; if either one fails, we increment the failure count, otherwise, +# the success count. +FAILED=false +if ! /kube-ovn/kubectl-ko nb backup +then + log_line ERROR "nb backup failed" + FAILED=true +fi +if ! /kube-ovn/kubectl-ko sb backup +then + log_line ERROR "sb backup failed" + FAILED=true +fi +if [[ "$FAILED" == "true" ]] +then + increment save_pairs_to_disk_failure_count + timestamp_metric save_pairs_to_disk_failure_timestamp +else + increment save_pairs_to_disk_success_count + timestamp_metric save_pairs_to_disk_success_timestamp +fi if [[ "$SWIFT_TEMPAUTH_UPLOAD" != "true" ]] then @@ -63,11 +192,13 @@ fi cd "$BACKUP_DIR" || exit 2 +increment upload_attempt_count +timestamp_metric upload_attempt_timestamp + # Make a working "swift" command SWIFT="kubectl -n openstack exec -i openstack-admin-client -- env -i ST_AUTH=$ST_AUTH ST_USER=$ST_USER ST_KEY=$ST_KEY /var/lib/openstack/bin/swift" -export SWIFT # Create the container if it doesn't exist if ! $SWIFT stat "$CONTAINER" > /dev/null @@ -84,16 +215,30 @@ upload_file() { OBJECT_NAME="$FILE" if $SWIFT upload "$CONTAINER" --object-name "$OBJECT_NAME" - < "$FILE" then - log_line INFO "SUCCESSFUL UPLOAD $FILE as object $OBJECT_NAME" + log_line INFO "SUCCESSFUL UPLOAD $FILE as object $OBJECT_NAME to container $CONTAINER" else - log_line ERROR "FAILURE API swift exited $? uploading $FILE as $OBJECT_NAME" + log_line ERROR "FAILURE API swift exited $? 
uploading $FILE as $OBJECT_NAME to container $CONTAINER" + FAILED_UPLOAD=true fi } -export -f upload_file # find created backups and upload them cd "$BACKUP_DIR" || exit 2 -# unusual find syntax to use an exported function from the shell -find "$YMD" -type f -newer "$BACKUP_DIR/last_upload" \ --exec bash -c 'upload_file "$0"' {} \; + +FAILED_UPLOAD=false +find "$YMD" -type f -newer "$BACKUP_DIR/last_upload" | \ +while read -r file +do + upload_file "$file" +done + +if [[ "$FAILED_UPLOAD" == "true" ]] +then + increment upload_pairs_failure_count + timestamp_metric upload_pairs_failure_timestamp +else + increment upload_pairs_success_count + timestamp_metric upload_pairs_success_timestamp +fi + touch "$BACKUP_DIR/last_upload" diff --git a/base-kustomize/prometheus-pushgateway/kustomization.yaml b/base-kustomize/prometheus-pushgateway/kustomization.yaml new file mode 100644 index 00000000..1f589cd6 --- /dev/null +++ b/base-kustomize/prometheus-pushgateway/kustomization.yaml @@ -0,0 +1,7 @@ +helmCharts: + - name: prometheus-pushgateway + repo: https://prometheus-community.github.io/helm-charts + releaseName: prometheus-pushgateway + namespace: prometheus + includeCRDs: true + valuesFile: values.yaml diff --git a/base-kustomize/prometheus-pushgateway/values.yaml b/base-kustomize/prometheus-pushgateway/values.yaml new file mode 100644 index 00000000..f58d1017 --- /dev/null +++ b/base-kustomize/prometheus-pushgateway/values.yaml @@ -0,0 +1,367 @@ +# Default values for prometheus-pushgateway. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+ +# Provide a name in place of prometheus-pushgateway for `app:` labels +nameOverride: "" + +# Provide a name to substitute for the full names of resources +fullnameOverride: "" + +# Provide a namespace to substitude for the namespace on resources +namespaceOverride: "" + +image: + repository: quay.io/prometheus/pushgateway + # if not set appVersion field from Chart.yaml is used + tag: "" + pullPolicy: IfNotPresent + +# Optional pod imagePullSecrets +imagePullSecrets: [] + +service: + type: ClusterIP + port: 9091 + targetPort: 9091 + # nodePort: 32100 + portName: http + + # Optional - Can be used for headless if value is "None" + clusterIP: "" + + ipDualStack: + enabled: false + ipFamilies: ["IPv6", "IPv4"] + ipFamilyPolicy: "PreferDualStack" + + loadBalancerIP: "" + loadBalancerSourceRanges: [] + +# Whether to automatically mount a service account token into the pod +automountServiceAccountToken: true + +# Optional pod annotations +podAnnotations: {} + +# Optional pod labels +podLabels: {} + +# Optional service annotations +serviceAnnotations: {} + +# Optional service labels +serviceLabels: {} + +# Optional serviceAccount labels +serviceAccountLabels: {} + +# Optional persistentVolume labels +persistentVolumeLabels: {} + +# Optional additional environment variables +extraVars: [] + +## Additional pushgateway container arguments +## +## example: +## extraArgs: +## - --persistence.file=/data/pushgateway.data +## - --persistence.interval=5m +extraArgs: [] + +## Additional InitContainers to initialize the pod +## +extraInitContainers: [] + +# Optional additional containers (sidecar) +extraContainers: [] + # - name: oAuth2-proxy + # args: + # - -https-address=:9092 + # - -upstream=http://localhost:9091 + # - -skip-auth-regex=^/metrics + # - -openshift-delegate-urls={"/":{"group":"monitoring.coreos.com","resource":"prometheuses","verb":"get"}} + # image: openshift/oauth-proxy:v1.1.0 + # ports: + # - containerPort: 9092 + # name: proxy + # resources: + # limits: + # 
memory: 16Mi + # requests: + # memory: 4Mi + # cpu: 20m + # volumeMounts: + # - mountPath: /etc/prometheus/secrets/pushgateway-tls + # name: secret-pushgateway-tls + +resources: + limits: + cpu: 200m + memory: 50Mi + requests: + cpu: 100m + memory: 30Mi + +# -- Sets web configuration +# To enable basic authentication, provide basicAuthUsers as a map +webConfiguration: {} + # basicAuthUsers: + # username: password + +liveness: + enabled: true + probe: + httpGet: + path: /-/healthy + port: 9091 + initialDelaySeconds: 10 + timeoutSeconds: 10 + +readiness: + enabled: true + probe: + httpGet: + path: /-/ready + port: 9091 + initialDelaySeconds: 10 + timeoutSeconds: 10 + +serviceAccount: + # Specifies whether a ServiceAccount should be created + create: true + # The name of the ServiceAccount to use. + # If not set and create is true, a name is generated using the fullname template + name: + +## Configure ingress resource that allow you to access the +## pushgateway installation. Set up the URL +## ref: http://kubernetes.io/docs/user-guide/ingress/ +## +ingress: + ## Enable Ingress. + ## + enabled: false + # AWS ALB requires path of /* + className: "" + path: / + pathType: ImplementationSpecific + + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + + ## Annotations. + ## + # annotations: + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: 'true' + + ## Hostnames. + ## Must be provided if Ingress is enabled. + ## + # hosts: + # - pushgateway.domain.com + + ## TLS configuration. + ## Secrets must be manually created in the namespace. 
+ ## + # tls: + # - secretName: pushgateway-tls + # hosts: + # - pushgateway.domain.com + +tolerations: [] + # - effect: NoSchedule + # operator: Exists + +## Node labels for pushgateway pod assignment +## Ref: https://kubernetes.io/docs/user-guide/node-selection/ +## +nodeSelector: + openstack-control-plane: enabled +replicaCount: 1 + +hostAliases: [] + # - ip: "127.0.0.1" + # hostnames: + # - "foo.local" + # - "bar.local" + # - ip: "10.1.2.3" + # hostnames: + # - "foo.remote" + # - "bar.remote" + +## When running more than one replica alongside with persistence, different volumes are needed +## per replica, since sharing a `persistence.file` across replicas does not keep metrics synced. +## For this purpose, you can enable the `runAsStatefulSet` to deploy the pushgateway as a +## StatefulSet instead of as a Deployment. +runAsStatefulSet: false + +## Security context to be added to push-gateway pods +## +securityContext: + fsGroup: 65534 + runAsUser: 65534 + runAsNonRoot: true + +## Security context to be added to push-gateway containers +## Having a separate variable as securityContext differs for pods and containers. +containerSecurityContext: {} +# allowPrivilegeEscalation: false +# readOnlyRootFilesystem: true +# runAsUser: 65534 +# runAsNonRoot: true + +## Affinity for pod assignment +## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity +affinity: {} + +## Pod anti-affinity can prevent the scheduler from placing pushgateway replicas on the same node. +## The value "soft" means that the scheduler should *prefer* to not schedule two replica pods onto the same node but no guarantee is provided. +## The value "hard" means that the scheduler is *required* to not schedule two replica pods onto the same node. +## The default value "" will disable pod anti-affinity so that no anti-affinity rules will be configured (unless set in `affinity`). 
+## +podAntiAffinity: "" + +## If anti-affinity is enabled sets the topologyKey to use for anti-affinity. +## This can be changed to, for example, failure-domain.beta.kubernetes.io/zone +## +podAntiAffinityTopologyKey: kubernetes.io/hostname + +## Topology spread constraints for pods +## Ref: https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/ +topologySpreadConstraints: [] + +# Enable this if you're using https://github.com/coreos/prometheus-operator +serviceMonitor: + enabled: true + namespace: prometheus + + # telemetryPath: HTTP resource path from which to fetch metrics. + # Telemetry path, default /metrics, has to be prefixed accordingly if pushgateway sets a route prefix at start-up. + # + telemetryPath: "/metrics" + + # Fallback to the prometheus default unless specified + # interval: 10s + + ## scheme: HTTP scheme to use for scraping. Can be used with `tlsConfig` for example if using istio mTLS. + # scheme: "" + + ## tlsConfig: TLS configuration to use when scraping the endpoint. For example if using istio mTLS. + ## Of type: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#tlsconfig + # tlsConfig: {} + + # bearerTokenFile: + # Fallback to the prometheus default unless specified + # scrapeTimeout: 30s + + ## Used to pass Labels that are used by the Prometheus installed in your cluster to select Service Monitors to work with + ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec + additionalLabels: {} + + # Retain the job and instance labels of the metrics pushed to the Pushgateway + # [Scraping Pushgateway](https://github.com/prometheus/pushgateway#configure-the-pushgateway-as-a-target-to-scrape) + honorLabels: true + + ## Metric relabel configs to apply to samples before ingestion. 
+ ## [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) + metricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## Relabel configs to apply to samples before ingestion. + ## [Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config) + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + +# The values to set in the PodDisruptionBudget spec (minAvailable/maxUnavailable) +# If not set then a PodDisruptionBudget will not be created +podDisruptionBudget: {} + +priorityClassName: + +# Deployment Strategy type +strategy: + type: Recreate + +persistentVolume: + ## If true, pushgateway will create/use a Persistent Volume Claim + ## If false, use emptyDir + ## + enabled: false + + ## pushgateway data Persistent Volume access modes + ## Must match those of existing PV or dynamic provisioner + ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ + ## + accessModes: + - ReadWriteOnce + + ## pushgateway data Persistent Volume Claim annotations + ## + annotations: {} + + ## pushgateway data Persistent Volume existing claim name + ## Requires pushgateway.persistentVolume.enabled: true + ## If defined, PVC must be created manually before volume will be bound + existingClaim: "" + + ## pushgateway data Persistent Volume mount root path + ## + mountPath: /data + + ## pushgateway data Persistent Volume size + ## + size: 2Gi + + ## pushgateway data Persistent Volume Storage Class + ## If defined, storageClassName: + ## If set to "-", storageClassName: "", which disables dynamic provisioning + ## If undefined (the default) or set to null, no storageClassName spec is + ## set, choosing the default provisioner. 
(gp2 on AWS, standard on + ## GKE, AWS & OpenStack) + ## + # storageClass: "-" + + ## Subdirectory of pushgateway data Persistent Volume to mount + ## Useful if the volume's root directory is not empty + ## + subPath: "" + +extraVolumes: [] + # - name: extra + # emptyDir: {} +extraVolumeMounts: [] + # - name: extra + # mountPath: /usr/share/extras + # readOnly: true + +# Configuration for clusters with restrictive network policies in place: +# - allowAll allows access to the PushGateway from any namespace +# - customSelector is a list of pod/namespaceSelectors to allow access from +# These options are mutually exclusive and the latter will take precedence. +networkPolicy: {} + # allowAll: true + # customSelectors: + # - namespaceSelector: + # matchLabels: + # type: admin + # - podSelector: + # matchLabels: + # app: myapp + +# Array of extra K8s objects to deploy (evaluated as a template) +# The value can hold an array of strings as well as objects +extraManifests: [] diff --git a/base-kustomize/prometheus/alerting_rules.yaml b/base-kustomize/prometheus/alerting_rules.yaml index 5894c4f1..289a1aad 100644 --- a/base-kustomize/prometheus/alerting_rules.yaml +++ b/base-kustomize/prometheus/alerting_rules.yaml @@ -58,3 +58,42 @@ additionalPrometheusRulesMap: annotations: summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }}) description: "Volume is almost full (< 20% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + backup-alerts: + groups: + - name: OVN backup alerts + rules: + - alert: ovnBackupUploadWarning + expr: time() - upload_pairs_success_timestamp{job="ovn-backup"} > 21600 # 21600s = 6 hours + # This bases the `expr` on the run normal interval, and allows `for` + # to account for the amount of time to complete and upload before + # alerting. 
+                for: 1h
+              labels:
+                severity: warning
+              annotations:
+                summary: Last OVN backup not uploaded within 1 hour of scheduled run
+                description: "Last OVN backup not uploaded within 1 hour of scheduled run"
+            - alert: ovnBackupUploadCritical
+              expr: time() - upload_pairs_success_timestamp{job="ovn-backup"} > 43200
+              for: 1h
+              labels:
+                severity: critical
+              annotations:
+                summary: Second successive OVN backup not uploaded within 1 hour of scheduled run
+                description: "Second successive OVN backup not uploaded within 1 hour of scheduled run"
+            - alert: ovnBackupDiskUsageWarning
+              expr: disk_used_percent_gauge{job="ovn-backup"} > 80
+              for: 0m
+              labels:
+                severity: warning
+              annotations:
+                summary: OVN backup volume >= 80% disk usage
+                description: "OVN backup volume >= 80% disk usage"
+            - alert: ovnBackupDiskUsageCritical
+              expr: disk_used_percent_gauge{job="ovn-backup"} > 90
+              for: 0m
+              labels:
+                severity: critical
+              annotations:
+                summary: OVN backup volume >= 90% disk usage
+                description: "OVN backup volume >= 90% disk usage"
diff --git a/docs/monitoring-getting-started.md b/docs/monitoring-getting-started.md
index 52c8841e..6a971e2a 100644
--- a/docs/monitoring-getting-started.md
+++ b/docs/monitoring-getting-started.md
@@ -22,15 +22,16 @@ We can then deploy our visualization dashboard Grafana
 Grafana is used to visualize various metrics provided by the monitoring system
 as well as alerts and logs, take a look at the [Grafana](https://grafana.com/)
 documentation for more information
-## Install the metric exporters
+## Install the metric exporters and pushgateway
-Now let's deploy our exporters!
+Now let's deploy our exporters and pushgateway!
* [Mysql Exporter](prometheus-mysql-exporter.md) * [RabbitMQ Exporter](prometheus-rabbitmq-exporter.md) * [Postgres Exporter](prometheus-postgres-exporter.md) * [Memcached Exporter](prometheus-memcached-exporter.md) * [Openstack Exporter](prometheus-openstack-metrics-exporter.md) +* [Pushgateway](prometheus-pushgateway.md) ## Next steps diff --git a/docs/prometheus-monitoring-overview.md b/docs/prometheus-monitoring-overview.md index 4d0ad6ed..119e0756 100644 --- a/docs/prometheus-monitoring-overview.md +++ b/docs/prometheus-monitoring-overview.md @@ -18,6 +18,7 @@ Prometheus makes use of various metric exporters used to collect monitoring data * Postgres Exporter(Postgresql metrics) * Memcached Exporter(Memcached metrics) * Openstack Exporter(Metrics from various Openstack products) +* Pushgateway (metrics from short-lived jobs)
![Prometheus Monitoring Diagram](assets/images/prometheus-monitoring.png){ style="filter:drop-shadow(#3c3c3c 0.5rem 0.5rem 10px);" }
diff --git a/docs/prometheus-pushgateway.md b/docs/prometheus-pushgateway.md
new file mode 100644
index 00000000..b3814ad1
--- /dev/null
+++ b/docs/prometheus-pushgateway.md
@@ -0,0 +1,18 @@
+# Prometheus Pushgateway
+
+_Prometheus_ can use a _pushgateway_ to gather metrics from short-lived jobs, like
+Kubernetes _CronJobs_. The pushgateway stays up to allow _Prometheus_ to gather
+the metrics. The short-lived job can push metrics to the gateway and terminate.
+
+In particular, _Genestack_ can use the _pushgateway_ to collect metrics from
+the OVN backup _CronJob_.
+
+#### Install the Prometheus Pushgateway Helm Chart
+
+
+``` shell
+kubectl kustomize --enable-helm /opt/genestack/base-kustomize/prometheus-pushgateway | kubectl apply -f -
+```
+
+!!! success
+    If the installation is successful, you should see the prometheus-pushgateway pod running in the prometheus namespace.
diff --git a/mkdocs.yml b/mkdocs.yml
index f7110fdf..5690991a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -200,6 +200,7 @@ nav:
           - Postgres Exporter: prometheus-postgres-exporter.md
           - Openstack Exporter: prometheus-openstack-metrics-exporter.md
           - Blackbox Exporter: prometheus-blackbox-exporter.md
+          - Pushgateway: prometheus-pushgateway.md
           - Custom Node Metrics: prometheus-custom-node-metrics.md
       - Alert Manager Examples:
           - alertmanager-encore.md