From f9f6337d43d46bcaa69a22fa1a0205870c27bc25 Mon Sep 17 00:00:00 2001 From: Adam Fabian Date: Fri, 2 Aug 2024 14:24:13 -0500 Subject: [PATCH] feat: add Prometheus pushgateway and metric collection for OVN backup (#381) * feat: add Prometheus pushgateway and metric collection for OVN backup JIRA:OSPC-551 * Add disk percent usage gauge to OVN backup metrics. * Add alerting rules based on collected OVN backup metrics. * typo correction for ovn backup alert * Put prometheus-pushgateway.md in mkdocs.yml --- .../ovn/ovn-backup/ovn-backup.config | 5 + base-kustomize/ovn/ovn-backup/ovn-backup.sh | 167 +++++++- .../prometheus-pushgateway/kustomization.yaml | 7 + .../prometheus-pushgateway/values.yaml | 367 ++++++++++++++++++ base-kustomize/prometheus/alerting_rules.yaml | 39 ++ docs/monitoring-getting-started.md | 5 +- docs/prometheus-monitoring-overview.md | 1 + docs/prometheus-pushgateway.md | 18 + mkdocs.yml | 1 + 9 files changed, 597 insertions(+), 13 deletions(-) create mode 100644 base-kustomize/prometheus-pushgateway/kustomization.yaml create mode 100644 base-kustomize/prometheus-pushgateway/values.yaml create mode 100644 docs/prometheus-pushgateway.md diff --git a/base-kustomize/ovn/ovn-backup/ovn-backup.config b/base-kustomize/ovn/ovn-backup/ovn-backup.config index d377c253..57af1a9f 100644 --- a/base-kustomize/ovn/ovn-backup/ovn-backup.config +++ b/base-kustomize/ovn/ovn-backup/ovn-backup.config @@ -6,6 +6,11 @@ BACKUP_DIR=/backup LOG_FILE=/backup/upload.log LOG_LEVEL=INFO +# Upload metrics to Prometheus +PROMETHEUS_PUSHGATEWAY_URL=http://prometheus-pushgateway.prometheus.svc.cluster.local:9091 +PROMETHEUS_JOB_NAME=ovn-backup +PROMETHEUS_UPLOAD=false + # From here forward, variables for uploading to Swift with tempauth SWIFT_TEMPAUTH_UPLOAD=false # If you change this to "true", set the variables in swift-tempauth.env diff --git a/base-kustomize/ovn/ovn-backup/ovn-backup.sh b/base-kustomize/ovn/ovn-backup/ovn-backup.sh index a3c25083..65b87ffb 100644 --- 
a/base-kustomize/ovn/ovn-backup/ovn-backup.sh +++ b/base-kustomize/ovn/ovn-backup/ovn-backup.sh @@ -28,7 +28,6 @@ log_level() { ;; esac } -export -f log_level log_line() { local LEVEL @@ -40,7 +39,116 @@ log_line() { echo "$line" | tee -a "$LOG_FILE" fi } -export -f log_line # exported for upload_file + +# Stats files init. These mostly get used to send to Prometheus, but you could +# just read them if you want to. + +STATS_DIR="${BACKUP_DIR}/stats" + +[[ -d "$STATS_DIR" ]] || mkdir "$STATS_DIR" + +declare -A metric_types=( + ["run_count"]="counter" + ["run_timestamp"]="counter" + ["save_pairs_to_disk_success_count"]="counter" + ["save_pairs_to_disk_success_timestamp"]="counter" + ["save_pairs_to_disk_failure_count"]="counter" + ["save_pairs_to_disk_failure_timestamp"]="counter" + ["upload_attempt_count"]="counter" + ["upload_attempt_timestamp"]="counter" + ["upload_pairs_success_count"]="counter" + ["upload_pairs_success_timestamp"]="counter" + ["upload_pairs_failure_count"]="counter" + ["upload_pairs_failure_timestamp"]="counter" + ["disk_files_gauge"]="gauge" + ["disk_used_percent_gauge"]="gauge" + ["swift_objects_gauge"]="gauge" +) + +# Initialize metrics/stats files with 0 if they don't exist +for metric_filename in "${!metric_types[@]}" +do + metric_file_fullname="${STATS_DIR}/$metric_filename" + [[ -e "$metric_file_fullname" ]] || echo "0" > "$metric_file_fullname" +done + +# get_metric takes the metric name, reads the metric file, and echos the value +get_metric() { + local STAT_NAME + local STAT_FULL_FILENAME + STAT_NAME="$1" + STAT_FULL_FILENAME="${STATS_DIR}/$STAT_NAME" + VALUE="$(cat "$STAT_FULL_FILENAME")" + echo "$VALUE" +} + +# update count $1: stat name, $2 new value +# Used for updating disk file count and Cloud Files object counts. 
+update_metric() { + local STAT_NAME + local VALUE + STAT_NAME="$1" + VALUE="$2" + STAT_FULL_FILENAME="${STATS_DIR}/$STAT_NAME" + echo "$VALUE" > "$STAT_FULL_FILENAME" +} + +# increment increments a stats counter $1 by 1 +increment() { + local VALUE + local METRIC_NAME + METRIC_NAME="$1" + VALUE="$(get_metric "$METRIC_NAME")" + ((VALUE++)) + update_metric "$METRIC_NAME" "$VALUE" +} + +# Save epoch time to metric $1 +timestamp_metric() { + local METRIC_NAME + METRIC_NAME="$1" + update_metric "$METRIC_NAME" "$(date +%s)" +} + +increment run_count +timestamp_metric run_timestamp + +finalize_and_upload_metrics() { + local FILE_COUNT + FILE_COUNT=$(find "$BACKUP_DIR" -name \*.backup | wc -l) + update_metric disk_files_gauge "$FILE_COUNT" + local DISK_PERCENT_USED + DISK_PERCENT_USED=$(df "$BACKUP_DIR" | perl -lane 'next unless $. == 2; print int($F[4])') + update_metric disk_used_percent_gauge "$DISK_PERCENT_USED" + local OBJECT_COUNT + if [[ "$SWIFT_TEMPAUTH_UPLOAD" == "true" ]] + then + OBJECT_COUNT=$($SWIFT stat "$CONTAINER" | awk '/Objects:/ { print $2 }') + update_metric swift_objects_gauge "$OBJECT_COUNT" + fi + + if [[ "$PROMETHEUS_UPLOAD" != "true" ]] + then + exit 0 + fi + + for metric in "${!metric_types[@]}" + do + echo "# TYPE $metric ${metric_types[$metric]} +$metric{label=\"ovn-backup\"} $(get_metric "$metric")" | \ + curl -sS \ + "$PROMETHEUS_PUSHGATEWAY_URL/metrics/job/$PROMETHEUS_JOB_NAME" \ + --data-binary @- + done + + # Put metrics in the log if running at DEBUG level. + perl -ne 'print "$ARGV $_"' /backup/stats/* | cut -d / -f 4 | \ + while read -r read_metric + do + log_line DEBUG "run end metric $read_metric" + done +} +trap finalize_and_upload_metrics EXIT INT TERM HUP # Delete old backup files on volume. cd "$BACKUP_DIR" || exit 2 @@ -51,8 +159,29 @@ find "$BACKUP_DIR" -ctime +"$RETENTION_DAYS" -delete; YMD="$(date +"%Y/%m/%d")" # kubectl-ko creates backups in $PWD, so we cd first. 
mkdir -p "$YMD" && cd "$YMD" || exit 2 -/kube-ovn/kubectl-ko nb backup || log_line ERROR "nb backup failed" -/kube-ovn/kubectl-ko sb backup || log_line ERROR "sb backup failed" + +# This treats the saved failed and success count as a single metric for both +# backups; if either one fails, we increment the failure count, otherwise, +# the success count. +FAILED=false +if ! /kube-ovn/kubectl-ko nb backup +then + log_line ERROR "nb backup failed" + FAILED=true +fi +if ! /kube-ovn/kubectl-ko sb backup +then + log_line ERROR "sb backup failed" + FAILED=true +fi +if [[ "$FAILED" == "true" ]] +then + increment save_pairs_to_disk_failure_count + timestamp_metric save_pairs_to_disk_failure_timestamp +else + increment save_pairs_to_disk_success_count + timestamp_metric save_pairs_to_disk_success_timestamp +fi if [[ "$SWIFT_TEMPAUTH_UPLOAD" != "true" ]] then @@ -63,11 +192,13 @@ fi cd "$BACKUP_DIR" || exit 2 +increment upload_attempt_count +timestamp_metric upload_attempt_timestamp + # Make a working "swift" command SWIFT="kubectl -n openstack exec -i openstack-admin-client -- env -i ST_AUTH=$ST_AUTH ST_USER=$ST_USER ST_KEY=$ST_KEY /var/lib/openstack/bin/swift" -export SWIFT # Create the container if it doesn't exist if ! $SWIFT stat "$CONTAINER" > /dev/null @@ -84,16 +215,30 @@ upload_file() { OBJECT_NAME="$FILE" if $SWIFT upload "$CONTAINER" --object-name "$OBJECT_NAME" - < "$FILE" then - log_line INFO "SUCCESSFUL UPLOAD $FILE as object $OBJECT_NAME" + log_line INFO "SUCCESSFUL UPLOAD $FILE as object $OBJECT_NAME to container $CONTAINER" else - log_line ERROR "FAILURE API swift exited $? uploading $FILE as $OBJECT_NAME" + log_line ERROR "FAILURE API swift exited $? 
uploading $FILE as $OBJECT_NAME to container $CONTAINER" + FAILED_UPLOAD=true fi } -export -f upload_file # find created backups and upload them cd "$BACKUP_DIR" || exit 2 -# unusual find syntax to use an exported function from the shell -find "$YMD" -type f -newer "$BACKUP_DIR/last_upload" \ --exec bash -c 'upload_file "$0"' {} \; + +FAILED_UPLOAD=false +find "$YMD" -type f -newer "$BACKUP_DIR/last_upload" | \ +while read -r file +do + upload_file "$file" +done + +if [[ "$FAILED_UPLOAD" == "true" ]] +then + increment upload_pairs_failure_count + timestamp_metric upload_pairs_failure_timestamp +else + increment upload_pairs_success_count + timestamp_metric upload_pairs_success_timestamp +fi + touch "$BACKUP_DIR/last_upload" diff --git a/base-kustomize/prometheus-pushgateway/kustomization.yaml b/base-kustomize/prometheus-pushgateway/kustomization.yaml new file mode 100644 index 00000000..1f589cd6 --- /dev/null +++ b/base-kustomize/prometheus-pushgateway/kustomization.yaml @@ -0,0 +1,7 @@ +helmCharts: + - name: prometheus-pushgateway + repo: https://prometheus-community.github.io/helm-charts + releaseName: prometheus-pushgateway + namespace: prometheus + includeCRDs: true + valuesFile: values.yaml diff --git a/base-kustomize/prometheus-pushgateway/values.yaml b/base-kustomize/prometheus-pushgateway/values.yaml new file mode 100644 index 00000000..f58d1017 --- /dev/null +++ b/base-kustomize/prometheus-pushgateway/values.yaml @@ -0,0 +1,367 @@ +# Default values for prometheus-pushgateway. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+ +# Provide a name in place of prometheus-pushgateway for `app:` labels +nameOverride: "" + +# Provide a name to substitute for the full names of resources +fullnameOverride: "" + +# Provide a namespace to substitude for the namespace on resources +namespaceOverride: "" + +image: + repository: quay.io/prometheus/pushgateway + # if not set appVersion field from Chart.yaml is used + tag: "" + pullPolicy: IfNotPresent + +# Optional pod imagePullSecrets +imagePullSecrets: [] + +service: + type: ClusterIP + port: 9091 + targetPort: 9091 + # nodePort: 32100 + portName: http + + # Optional - Can be used for headless if value is "None" + clusterIP: "" + + ipDualStack: + enabled: false + ipFamilies: ["IPv6", "IPv4"] + ipFamilyPolicy: "PreferDualStack" + + loadBalancerIP: "" + loadBalancerSourceRanges: [] + +# Whether to automatically mount a service account token into the pod +automountServiceAccountToken: true + +# Optional pod annotations +podAnnotations: {} + +# Optional pod labels +podLabels: {} + +# Optional service annotations +serviceAnnotations: {} + +# Optional service labels +serviceLabels: {} + +# Optional serviceAccount labels +serviceAccountLabels: {} + +# Optional persistentVolume labels +persistentVolumeLabels: {} + +# Optional additional environment variables +extraVars: [] + +## Additional pushgateway container arguments +## +## example: +## extraArgs: +## - --persistence.file=/data/pushgateway.data +## - --persistence.interval=5m +extraArgs: [] + +## Additional InitContainers to initialize the pod +## +extraInitContainers: [] + +# Optional additional containers (sidecar) +extraContainers: [] + # - name: oAuth2-proxy + # args: + # - -https-address=:9092 + # - -upstream=http://localhost:9091 + # - -skip-auth-regex=^/metrics + # - -openshift-delegate-urls={"/":{"group":"monitoring.coreos.com","resource":"prometheuses","verb":"get"}} + # image: openshift/oauth-proxy:v1.1.0 + # ports: + # - containerPort: 9092 + # name: proxy + # resources: + # limits: + # 
memory: 16Mi + # requests: + # memory: 4Mi + # cpu: 20m + # volumeMounts: + # - mountPath: /etc/prometheus/secrets/pushgateway-tls + # name: secret-pushgateway-tls + +resources: + limits: + cpu: 200m + memory: 50Mi + requests: + cpu: 100m + memory: 30Mi + +# -- Sets web configuration +# To enable basic authentication, provide basicAuthUsers as a map +webConfiguration: {} + # basicAuthUsers: + # username: password + +liveness: + enabled: true + probe: + httpGet: + path: /-/healthy + port: 9091 + initialDelaySeconds: 10 + timeoutSeconds: 10 + +readiness: + enabled: true + probe: + httpGet: + path: /-/ready + port: 9091 + initialDelaySeconds: 10 + timeoutSeconds: 10 + +serviceAccount: + # Specifies whether a ServiceAccount should be created + create: true + # The name of the ServiceAccount to use. + # If not set and create is true, a name is generated using the fullname template + name: + +## Configure ingress resource that allow you to access the +## pushgateway installation. Set up the URL +## ref: http://kubernetes.io/docs/user-guide/ingress/ +## +ingress: + ## Enable Ingress. + ## + enabled: false + # AWS ALB requires path of /* + className: "" + path: / + pathType: ImplementationSpecific + + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + + ## Annotations. + ## + # annotations: + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: 'true' + + ## Hostnames. + ## Must be provided if Ingress is enabled. + ## + # hosts: + # - pushgateway.domain.com + + ## TLS configuration. + ## Secrets must be manually created in the namespace. 
+ ## + # tls: + # - secretName: pushgateway-tls + # hosts: + # - pushgateway.domain.com + +tolerations: [] + # - effect: NoSchedule + # operator: Exists + +## Node labels for pushgateway pod assignment +## Ref: https://kubernetes.io/docs/user-guide/node-selection/ +## +nodeSelector: + openstack-control-plane: enabled +replicaCount: 1 + +hostAliases: [] + # - ip: "127.0.0.1" + # hostnames: + # - "foo.local" + # - "bar.local" + # - ip: "10.1.2.3" + # hostnames: + # - "foo.remote" + # - "bar.remote" + +## When running more than one replica alongside with persistence, different volumes are needed +## per replica, since sharing a `persistence.file` across replicas does not keep metrics synced. +## For this purpose, you can enable the `runAsStatefulSet` to deploy the pushgateway as a +## StatefulSet instead of as a Deployment. +runAsStatefulSet: false + +## Security context to be added to push-gateway pods +## +securityContext: + fsGroup: 65534 + runAsUser: 65534 + runAsNonRoot: true + +## Security context to be added to push-gateway containers +## Having a separate variable as securityContext differs for pods and containers. +containerSecurityContext: {} +# allowPrivilegeEscalation: false +# readOnlyRootFilesystem: true +# runAsUser: 65534 +# runAsNonRoot: true + +## Affinity for pod assignment +## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity +affinity: {} + +## Pod anti-affinity can prevent the scheduler from placing pushgateway replicas on the same node. +## The value "soft" means that the scheduler should *prefer* to not schedule two replica pods onto the same node but no guarantee is provided. +## The value "hard" means that the scheduler is *required* to not schedule two replica pods onto the same node. +## The default value "" will disable pod anti-affinity so that no anti-affinity rules will be configured (unless set in `affinity`). 
+## +podAntiAffinity: "" + +## If anti-affinity is enabled sets the topologyKey to use for anti-affinity. +## This can be changed to, for example, failure-domain.beta.kubernetes.io/zone +## +podAntiAffinityTopologyKey: kubernetes.io/hostname + +## Topology spread constraints for pods +## Ref: https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/ +topologySpreadConstraints: [] + +# Enable this if you're using https://github.com/coreos/prometheus-operator +serviceMonitor: + enabled: true + namespace: prometheus + + # telemetryPath: HTTP resource path from which to fetch metrics. + # Telemetry path, default /metrics, has to be prefixed accordingly if pushgateway sets a route prefix at start-up. + # + telemetryPath: "/metrics" + + # Fallback to the prometheus default unless specified + # interval: 10s + + ## scheme: HTTP scheme to use for scraping. Can be used with `tlsConfig` for example if using istio mTLS. + # scheme: "" + + ## tlsConfig: TLS configuration to use when scraping the endpoint. For example if using istio mTLS. + ## Of type: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#tlsconfig + # tlsConfig: {} + + # bearerTokenFile: + # Fallback to the prometheus default unless specified + # scrapeTimeout: 30s + + ## Used to pass Labels that are used by the Prometheus installed in your cluster to select Service Monitors to work with + ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec + additionalLabels: {} + + # Retain the job and instance labels of the metrics pushed to the Pushgateway + # [Scraping Pushgateway](https://github.com/prometheus/pushgateway#configure-the-pushgateway-as-a-target-to-scrape) + honorLabels: true + + ## Metric relabel configs to apply to samples before ingestion. 
+ ## [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) + metricRelabelings: [] + # - action: keep + # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' + # sourceLabels: [__name__] + + ## Relabel configs to apply to samples before ingestion. + ## [Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config) + relabelings: [] + # - sourceLabels: [__meta_kubernetes_pod_node_name] + # separator: ; + # regex: ^(.*)$ + # targetLabel: nodename + # replacement: $1 + # action: replace + +# The values to set in the PodDisruptionBudget spec (minAvailable/maxUnavailable) +# If not set then a PodDisruptionBudget will not be created +podDisruptionBudget: {} + +priorityClassName: + +# Deployment Strategy type +strategy: + type: Recreate + +persistentVolume: + ## If true, pushgateway will create/use a Persistent Volume Claim + ## If false, use emptyDir + ## + enabled: false + + ## pushgateway data Persistent Volume access modes + ## Must match those of existing PV or dynamic provisioner + ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ + ## + accessModes: + - ReadWriteOnce + + ## pushgateway data Persistent Volume Claim annotations + ## + annotations: {} + + ## pushgateway data Persistent Volume existing claim name + ## Requires pushgateway.persistentVolume.enabled: true + ## If defined, PVC must be created manually before volume will be bound + existingClaim: "" + + ## pushgateway data Persistent Volume mount root path + ## + mountPath: /data + + ## pushgateway data Persistent Volume size + ## + size: 2Gi + + ## pushgateway data Persistent Volume Storage Class + ## If defined, storageClassName: + ## If set to "-", storageClassName: "", which disables dynamic provisioning + ## If undefined (the default) or set to null, no storageClassName spec is + ## set, choosing the default provisioner. 
(gp2 on AWS, standard on + ## GKE, AWS & OpenStack) + ## + # storageClass: "-" + + ## Subdirectory of pushgateway data Persistent Volume to mount + ## Useful if the volume's root directory is not empty + ## + subPath: "" + +extraVolumes: [] + # - name: extra + # emptyDir: {} +extraVolumeMounts: [] + # - name: extra + # mountPath: /usr/share/extras + # readOnly: true + +# Configuration for clusters with restrictive network policies in place: +# - allowAll allows access to the PushGateway from any namespace +# - customSelector is a list of pod/namespaceSelectors to allow access from +# These options are mutually exclusive and the latter will take precedence. +networkPolicy: {} + # allowAll: true + # customSelectors: + # - namespaceSelector: + # matchLabels: + # type: admin + # - podSelector: + # matchLabels: + # app: myapp + +# Array of extra K8s objects to deploy (evaluated as a template) +# The value can hold an array of strings as well as objects +extraManifests: [] diff --git a/base-kustomize/prometheus/alerting_rules.yaml b/base-kustomize/prometheus/alerting_rules.yaml index 5894c4f1..289a1aad 100644 --- a/base-kustomize/prometheus/alerting_rules.yaml +++ b/base-kustomize/prometheus/alerting_rules.yaml @@ -58,3 +58,42 @@ additionalPrometheusRulesMap: annotations: summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }}) description: "Volume is almost full (< 20% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + backup-alerts: + groups: + - name: OVN backup alerts + rules: + - alert: ovnBackupUploadWarning + expr: time() - upload_pairs_success_timestamp{job="ovn-backup"} > 21600 # 21600s = 6 hours + # This bases the `expr` on the run normal interval, and allows `for` + # to account for the amount of time to complete and upload before + # alerting. 
+                for: 1h
+              labels:
+                severity: warning
+              annotations:
+                summary: Last OVN backup not uploaded within 1 hour of scheduled run
+                description: "Last OVN backup not uploaded within 1 hour of scheduled run"
+            - alert: ovnBackupUploadCritical
+              expr: time() - upload_pairs_success_timestamp{job="ovn-backup"} > 43200
+              for: 1h
+              labels:
+                severity: critical
+              annotations:
+                summary: Second successive OVN backup not uploaded within 1 hour of scheduled run
+                description: "Second successive OVN backup not uploaded within 1 hour of scheduled run"
+            - alert: ovnBackupDiskUsageWarning
+              expr: disk_used_percent_gauge{job="ovn-backup"} > 80
+              for: 0m
+              labels:
+                severity: warning
+              annotations:
+                summary: OVN backup volume >= 80% disk usage
+                description: "OVN backup volume >= 80% disk usage"
+            - alert: ovnBackupDiskUsageCritical
+              expr: disk_used_percent_gauge{job="ovn-backup"} > 90
+              for: 0m
+              labels:
+                severity: critical
+              annotations:
+                summary: OVN backup volume >= 90% disk usage
+                description: "OVN backup volume >= 90% disk usage"
diff --git a/docs/monitoring-getting-started.md b/docs/monitoring-getting-started.md
index 52c8841e..6a971e2a 100644
--- a/docs/monitoring-getting-started.md
+++ b/docs/monitoring-getting-started.md
@@ -22,15 +22,16 @@ We can then deploy our visualization dashboard Grafana
 Grafana is used to visualize various metrics provided by the monitoring system
 as well as alerts and logs, take a look at the [Grafana](https://grafana.com/)
 documentation for more information
-## Install the metric exporters
+## Install the metric exporters and pushgateway
-Now let's deploy our exporters!
+Now let's deploy our exporters and pushgateway!
* [Mysql Exporter](prometheus-mysql-exporter.md) * [RabbitMQ Exporter](prometheus-rabbitmq-exporter.md) * [Postgres Exporter](prometheus-postgres-exporter.md) * [Memcached Exporter](prometheus-memcached-exporter.md) * [Openstack Exporter](prometheus-openstack-metrics-exporter.md) +* [Pushgateway](prometheus-pushgateway.md) ## Next steps diff --git a/docs/prometheus-monitoring-overview.md b/docs/prometheus-monitoring-overview.md index 4d0ad6ed..119e0756 100644 --- a/docs/prometheus-monitoring-overview.md +++ b/docs/prometheus-monitoring-overview.md @@ -18,6 +18,7 @@ Prometheus makes use of various metric exporters used to collect monitoring data * Postgres Exporter(Postgresql metrics) * Memcached Exporter(Memcached metrics) * Openstack Exporter(Metrics from various Openstack products) +* Pushgateway (metrics from short-lived jobs)
![Prometheus Monitoring Diagram](assets/images/prometheus-monitoring.png){ style="filter:drop-shadow(#3c3c3c 0.5rem 0.5rem 10px);" }
diff --git a/docs/prometheus-pushgateway.md b/docs/prometheus-pushgateway.md
new file mode 100644
index 00000000..b3814ad1
--- /dev/null
+++ b/docs/prometheus-pushgateway.md
@@ -0,0 +1,18 @@
+# Prometheus Pushgateway
+
+_Prometheus_ can use a _pushgateway_ to gather metrics from short-lived jobs, like
+Kubernetes _CronJobs_. The pushgateway stays up to allow _Prometheus_ to gather
+the metrics. The short-lived job can push metrics to the gateway and terminate.
+
+In particular, _Genestack_ can use the _pushgateway_ to collect metrics from
+the OVN backup _CronJob_.
+
+#### Install the Prometheus Pushgateway Helm Chart
+
+
+``` shell
+kubectl kustomize --enable-helm /opt/genestack/base-kustomize/prometheus-pushgateway | kubectl apply -f -
+```
+
+!!! success
+    If the installation is successful, you should see the prometheus-pushgateway pod running in the prometheus namespace.
diff --git a/mkdocs.yml b/mkdocs.yml
index f7110fdf..5690991a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -200,6 +200,7 @@ nav:
           - Postgres Exporter: prometheus-postgres-exporter.md
           - Openstack Exporter: prometheus-openstack-metrics-exporter.md
           - Blackbox Exporter: prometheus-blackbox-exporter.md
+          - Pushgateway: prometheus-pushgateway.md
           - Custom Node Metrics: prometheus-custom-node-metrics.md
       - Alert Manager Examples:
           - alertmanager-encore.md