Skip to content

Commit

Permalink
feat: add Prometheus pushgateway and metric collection for OVN backup (
Browse files Browse the repository at this point in the history
…rackerlabs#381)

* feat: add Prometheus pushgateway and metric collection for OVN backup

JIRA:OSPC-551

* Add disk percent usage gauge to OVN backup metrics.

* Add alerting rules based on collected OVN backup metrics.

* typo correction for ovn backup alert

* Put prometheus-pushgateway.md in mkdocs.yml
  • Loading branch information
awfabian-rs authored Aug 2, 2024
1 parent 1313e1a commit f9f6337
Show file tree
Hide file tree
Showing 9 changed files with 597 additions and 13 deletions.
5 changes: 5 additions & 0 deletions base-kustomize/ovn/ovn-backup/ovn-backup.config
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ BACKUP_DIR=/backup
LOG_FILE=/backup/upload.log
LOG_LEVEL=INFO

# Upload metrics to Prometheus
PROMETHEUS_PUSHGATEWAY_URL=http://prometheus-pushgateway.prometheus.svc.cluster.local:9091
PROMETHEUS_JOB_NAME=ovn-backup
PROMETHEUS_UPLOAD=false

# From here forward, variables for uploading to Swift with tempauth
SWIFT_TEMPAUTH_UPLOAD=false
# If you change this to "true", set the variables in swift-tempauth.env
Expand Down
167 changes: 156 additions & 11 deletions base-kustomize/ovn/ovn-backup/ovn-backup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ log_level() {
;;
esac
}
export -f log_level

log_line() {
local LEVEL
Expand All @@ -40,7 +39,116 @@ log_line() {
echo "$line" | tee -a "$LOG_FILE"
fi
}
export -f log_line # exported for upload_file

# Stats files init. These mostly get used to send to Prometheus, but you could
# just read them if you want to.

# Metrics/stats live under the backup volume so they persist across runs.
STATS_DIR="${BACKUP_DIR}/stats"

# -p: creates parents and succeeds if the directory already exists, so this
# is idempotent and avoids the non-atomic '[[ -d ]] || mkdir' check-then-act.
mkdir -p "$STATS_DIR"

# Metric name -> Prometheus metric type. Used both to initialize the stats
# files below and for the "# TYPE" lines pushed to the pushgateway later.
declare -A metric_types=(
    ["run_count"]="counter"
    ["run_timestamp"]="counter"
    ["save_pairs_to_disk_success_count"]="counter"
    ["save_pairs_to_disk_success_timestamp"]="counter"
    ["save_pairs_to_disk_failure_count"]="counter"
    ["save_pairs_to_disk_failure_timestamp"]="counter"
    ["upload_attempt_count"]="counter"
    ["upload_attempt_timestamp"]="counter"
    ["upload_pairs_success_count"]="counter"
    ["upload_pairs_success_timestamp"]="counter"
    ["upload_pairs_failure_count"]="counter"
    ["upload_pairs_failure_timestamp"]="counter"
    ["disk_files_gauge"]="gauge"
    ["disk_used_percent_gauge"]="gauge"
    ["swift_objects_gauge"]="gauge"
)

# Initialize metrics/stats files with 0 if they don't exist yet, so the
# read-modify-write helpers below always have a value to work with.
for metric_filename in "${!metric_types[@]}"
do
    metric_file_fullname="${STATS_DIR}/$metric_filename"
    [[ -e "$metric_file_fullname" ]] || echo "0" > "$metric_file_fullname"
done

# get_metric: read the stats file for metric $1 and echo its stored value.
# Globals: STATS_DIR (read).
get_metric() {
    local STAT_NAME
    local STAT_FULL_FILENAME
    # 'local' added: VALUE previously leaked into the global scope,
    # inconsistent with the other locals in this function.
    local VALUE
    STAT_NAME="$1"
    STAT_FULL_FILENAME="${STATS_DIR}/$STAT_NAME"
    VALUE="$(cat "$STAT_FULL_FILENAME")"
    echo "$VALUE"
}

# update_metric: write new value $2 to the stats file for metric $1.
# Used for updating the disk file count and Cloud Files object counts.
# Globals: STATS_DIR (read).
update_metric() {
    local STAT_NAME
    local VALUE
    # 'local' added: STAT_FULL_FILENAME previously leaked into the global
    # scope, inconsistent with get_metric which declares it local.
    local STAT_FULL_FILENAME
    STAT_NAME="$1"
    VALUE="$2"
    STAT_FULL_FILENAME="${STATS_DIR}/$STAT_NAME"
    echo "$VALUE" > "$STAT_FULL_FILENAME"
}

# increment: add 1 to counter metric $1 and persist it via update_metric.
increment() {
    local VALUE
    local METRIC_NAME
    METRIC_NAME="$1"
    VALUE="$(get_metric "$METRIC_NAME")"
    # $((...)) instead of ((VALUE++)): the (( )) command returns status 1
    # when the expression evaluates to 0 — e.g. the first post-increment
    # from "0" — which would abort the script under 'set -e' and leaves a
    # misleading non-zero $? for callers checking this function's status.
    VALUE=$((VALUE + 1))
    update_metric "$METRIC_NAME" "$VALUE"
}

# Store the current Unix epoch time in metric $1.
timestamp_metric() {
    local name="$1"
    update_metric "$name" "$(date +%s)"
}

# Record that a backup run started: bump the run counter and stamp the time.
increment run_count
timestamp_metric run_timestamp

# finalize_and_upload_metrics: refresh the gauge metrics, optionally push
# all metrics to the Prometheus pushgateway, and log them at DEBUG level.
# Registered as a trap below so it runs on every script exit path.
finalize_and_upload_metrics() {
    local FILE_COUNT
    FILE_COUNT=$(find "$BACKUP_DIR" -name \*.backup | wc -l)
    update_metric disk_files_gauge "$FILE_COUNT"
    local DISK_PERCENT_USED
    # Second line of df output, 5th field: "42%" -> 42.
    DISK_PERCENT_USED=$(df "$BACKUP_DIR" | perl -lane 'next unless $. == 2; print int($F[4])')
    update_metric disk_used_percent_gauge "$DISK_PERCENT_USED"
    local OBJECT_COUNT
    if [[ "$SWIFT_TEMPAUTH_UPLOAD" == "true" ]]
    then
        # $SWIFT is deliberately unquoted: it holds a multi-word command.
        OBJECT_COUNT=$($SWIFT stat "$CONTAINER" | awk '/Objects:/ { print $2 }')
        update_metric swift_objects_gauge "$OBJECT_COUNT"
    fi

    if [[ "$PROMETHEUS_UPLOAD" != "true" ]]
    then
        # 'return', not 'exit 0': this runs from the EXIT trap, and calling
        # 'exit 0' here overwrote the script's real exit status (e.g. the
        # 'exit 2' cd failures) with success. Returning also matches the
        # upload path below, which falls off the end of the function.
        return 0
    fi

    local metric
    for metric in "${!metric_types[@]}"
    do
        echo "# TYPE $metric ${metric_types[$metric]}
$metric{label=\"ovn-backup\"} $(get_metric "$metric")" | \
        curl -sS \
            "$PROMETHEUS_PUSHGATEWAY_URL/metrics/job/$PROMETHEUS_JOB_NAME" \
            --data-binary @-
    done

    # Put metrics in the log if running at DEBUG level. Iterate over
    # $STATS_DIR directly instead of the old hard-coded '/backup/stats'
    # plus 'cut -d / -f 4', which silently broke if BACKUP_DIR changed.
    local stat_file
    for stat_file in "$STATS_DIR"/*
    do
        log_line DEBUG "run end metric ${stat_file##*/} $(cat "$stat_file")"
    done
}
trap finalize_and_upload_metrics EXIT INT TERM HUP

# Delete old backup files on volume.
cd "$BACKUP_DIR" || exit 2
Expand All @@ -51,8 +159,29 @@ find "$BACKUP_DIR" -ctime +"$RETENTION_DAYS" -delete;
YMD="$(date +"%Y/%m/%d")"
# kubectl-ko creates backups in $PWD, so we cd first.
mkdir -p "$YMD" && cd "$YMD" || exit 2
/kube-ovn/kubectl-ko nb backup || log_line ERROR "nb backup failed"
/kube-ovn/kubectl-ko sb backup || log_line ERROR "sb backup failed"

# The saved failure and success counts are a single metric covering both
# backups: if either the nb or sb backup fails we count the pair as a
# failure, otherwise as a success.
FAILED=false
/kube-ovn/kubectl-ko nb backup || { log_line ERROR "nb backup failed"; FAILED=true; }
/kube-ovn/kubectl-ko sb backup || { log_line ERROR "sb backup failed"; FAILED=true; }

if [[ "$FAILED" == "true" ]]; then
    increment save_pairs_to_disk_failure_count
    timestamp_metric save_pairs_to_disk_failure_timestamp
else
    increment save_pairs_to_disk_success_count
    timestamp_metric save_pairs_to_disk_success_timestamp
fi

if [[ "$SWIFT_TEMPAUTH_UPLOAD" != "true" ]]
then
Expand All @@ -63,11 +192,13 @@ fi

cd "$BACKUP_DIR" || exit 2

# Record the upload attempt before uploading, so failed runs still count it.
increment upload_attempt_count
timestamp_metric upload_attempt_timestamp

# Make a working "swift" command that runs in the openstack-admin-client pod
# with a minimal environment carrying only the tempauth credentials.
# NOTE: $SWIFT must stay UNQUOTED at call sites so it word-splits into the
# full command line (the embedded newlines split like spaces).
SWIFT="kubectl -n openstack exec -i openstack-admin-client --
env -i ST_AUTH=$ST_AUTH ST_USER=$ST_USER ST_KEY=$ST_KEY
/var/lib/openstack/bin/swift"
export SWIFT

# Create the container if it doesn't exist
if ! $SWIFT stat "$CONTAINER" > /dev/null
Expand All @@ -84,16 +215,30 @@ upload_file() {
OBJECT_NAME="$FILE"
if $SWIFT upload "$CONTAINER" --object-name "$OBJECT_NAME" - < "$FILE"
then
log_line INFO "SUCCESSFUL UPLOAD $FILE as object $OBJECT_NAME"
log_line INFO "SUCCESSFUL UPLOAD $FILE as object $OBJECT_NAME to container $CONTAINER"
else
log_line ERROR "FAILURE API swift exited $? uploading $FILE as $OBJECT_NAME"
log_line ERROR "FAILURE API swift exited $? uploading $FILE as $OBJECT_NAME to container $CONTAINER"
FAILED_UPLOAD=true
fi
}
export -f upload_file

# find created backups and upload them
cd "$BACKUP_DIR" || exit 2
# unusual find syntax to use an exported function from the shell
find "$YMD" -type f -newer "$BACKUP_DIR/last_upload" \
-exec bash -c 'upload_file "$0"' {} \;

FAILED_UPLOAD=false
# Feed the loop with process substitution instead of 'find | while': every
# stage of a pipeline runs in a subshell, so the FAILED_UPLOAD=true set by
# upload_file was lost and the failure branch below could never trigger
# (ShellCheck SC2031). IFS= and -r preserve filenames with leading spaces
# or backslashes.
while IFS= read -r file
do
    upload_file "$file"
done < <(find "$YMD" -type f -newer "$BACKUP_DIR/last_upload")

if [[ "$FAILED_UPLOAD" == "true" ]]
then
    increment upload_pairs_failure_count
    timestamp_metric upload_pairs_failure_timestamp
else
    increment upload_pairs_success_count
    timestamp_metric upload_pairs_success_timestamp
fi

touch "$BACKUP_DIR/last_upload"
7 changes: 7 additions & 0 deletions base-kustomize/prometheus-pushgateway/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
helmCharts:
- name: prometheus-pushgateway
repo: https://prometheus-community.github.io/helm-charts
releaseName: prometheus-pushgateway
namespace: prometheus
includeCRDs: true
valuesFile: values.yaml
Loading

0 comments on commit f9f6337

Please sign in to comment.