From 40bdb624eef22137793fa8cad13f974ac9a1e89e Mon Sep 17 00:00:00 2001 From: Jimmy <5608027+orcutt989@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:38:25 -0400 Subject: [PATCH] snapshotEngine: retire snapshot warmer (#602) * sleep first * move snapshot creation to maker script * add nodes config to cm * rm warmer stuff * set snapshot info * update node class * retrofit maker to take snapshot * make sure to delete all snapshots before and after * only delete by history mode * sweet spot for vs cleanup --- .../snapshotEngine/scripts/snapshot-warmer.sh | 142 ------------------ .../snapshotEngine/templates/configmap.yaml | 3 +- .../templates/snapshot-warmer.yaml | 47 ------ snapshotEngine/snapshot-maker.sh | 112 +++++++++++--- 4 files changed, 89 insertions(+), 215 deletions(-) delete mode 100755 charts/snapshotEngine/scripts/snapshot-warmer.sh delete mode 100644 charts/snapshotEngine/templates/snapshot-warmer.yaml diff --git a/charts/snapshotEngine/scripts/snapshot-warmer.sh b/charts/snapshotEngine/scripts/snapshot-warmer.sh deleted file mode 100755 index 817e61c1e..000000000 --- a/charts/snapshotEngine/scripts/snapshot-warmer.sh +++ /dev/null @@ -1,142 +0,0 @@ -#!/bin/sh - -cd / - -timestamp() { - date "+%Y-%m-%d %H:%M:%S" -} - -getSnapshotNames() { - local readyToUse="${1##readyToUse=}" - shift - if [ -z "$readyToUse" ]; then - echo "Error: No jsonpath for volumesnapshots' ready status was provided." - exit 1 - fi - kubectl get volumesnapshots -o jsonpath="{.items[?(.status.readyToUse==$readyToUse)].metadata.name}" --namespace "$NAMESPACE" "$@" -} - -getNumberOfSnapshots() { - local readyToUse="$1" - shift - getSnapshotNames "$readyToUse" -o go-template='{{ len .items }}' "$@" -} - -delete_old_volumesnapshots() { - local selector="${1##selector=}" - local max_snapshots="${2##max_snapshots=}" - - while [ "$(getNumberOfSnapshots readyToUse=true --selector="$selector")" -gt "$max_snapshots" ]; do - sleep 5 - NUMBER_OF_SNAPSHOTS=$(getNumberOfSnapshots readyToUse=true --selector="$selector") - printf "%s Number of snapshots with selector '$selector' is too high at $NUMBER_OF_SNAPSHOTS. Deleting 1.\n" "$(timestamp)" - SNAPSHOTS=$(getSnapshotNames readyToUse=true --selector="$selector") - if ! kubectl delete volumesnapshots "${SNAPSHOTS%% *}" --namespace "${NAMESPACE}"; then - printf "%s ERROR deleting snapshot. ${SNAPSHOTS%% *}\n" "$(timestamp)" - fi - sleep 10 - done -} - -# delete_stuck_volumesnapshots() { -# snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}") -# arr=(`echo ${snapshot_list}`); -# for snapshot_name in "${arr[@]}"; do -# snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}') -# snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1} -# snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s) -# current_date_unix=$(date -u +%s) -# snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 )) -# # Snapshots should never be older than 6 minutes -# # If they are then there's a problem on AWS' end and the snapshot needs to be deleted. -# if [ $snapshot_age_minutes -ge 6 ]; then -# printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes" -# err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null) -# if [ $? -ne 0 ]; then -# printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name" -# printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err" -# sleep 10 -# exit 1 -# else -# printf "%s Successfully deleted stuck snapshot %s! \n" "$(timestamp)" "$snapshot_name" -# fi -# fi -# done -# } - -HISTORY_MODE="$(echo "$NODE_CONFIG" | jq -r ".history_mode")" -TARGET_VOLUME="$(echo "$NODE_CONFIG" | jq ".target_volume")" -PERSISTENT_VOLUME_CLAIM="$( - kubectl get po -n "$NAMESPACE" -l node_class="$NODE_CLASS" \ - -o jsonpath="{.items[0].spec.volumes[?(@.name==$TARGET_VOLUME)].persistentVolumeClaim.claimName}" -)" - -# For yq to work, the values resulting from the above cmds need to be exported. -# We don't export them inline because of -# https://github.com/koalaman/shellcheck/wiki/SC2155 -export HISTORY_MODE -export PERSISTENT_VOLUME_CLAIM - -yq e -i '.metadata.namespace=strenv(NAMESPACE)' createVolumeSnapshot.yaml -yq e -i '.metadata.labels.history_mode=strenv(HISTORY_MODE)' createVolumeSnapshot.yaml -yq e -i '.spec.source.persistentVolumeClaimName=strenv(PERSISTENT_VOLUME_CLAIM)' createVolumeSnapshot.yaml -yq e -i '.spec.volumeSnapshotClassName=strenv(VOLUME_SNAPSHOT_CLASS)' createVolumeSnapshot.yaml - -while true; do - - # Pause if nodes are not ready - until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do - printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" - until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do - sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for node - if [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; then - break - fi - done - done - - # Remove unlabeled snapshots - delete_old_volumesnapshots selector='!history_mode' max_snapshots=0 - # Maintain 4 snapshots of a certain history mode - delete_old_volumesnapshots selector="history_mode=$HISTORY_MODE" max_snapshots=4 - # Check for and delete old stuck snapshots - # delete_stuck_volumesnapshots - - if ! [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; then - # EBS Snapshot name based on current time and date - current_date=$(date "+%Y-%m-%d-%H-%M-%S" "$@") - export SNAPSHOT_NAME="$current_date-$HISTORY_MODE-node-snapshot" - # Update volume snapshot name - yq e -i '.metadata.name=strenv(SNAPSHOT_NAME)' createVolumeSnapshot.yaml - - printf "%s Creating snapshot ${SNAPSHOT_NAME} in ${NAMESPACE}.\n" "$(timestamp)" - - start_time=$(date +%s) - - # Create snapshot - if ! kubectl apply -f createVolumeSnapshot.yaml; then - printf "%s ERROR creating volumeSnapshot ${SNAPSHOT_NAME} in ${NAMESPACE} .\n" "$(timestamp)" - exit 1 - fi - - sleep 5 - - # While no snapshots ready - while [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; do - printf "%s Snapshot is still creating...\n" "$(timestamp)" - sleep 10 - # delete_stuck_volumesnapshots - done - end_time=$(date +%s) - elapsed=$((end_time - start_time)) - printf "%s Snapshot ${SNAPSHOT_NAME} in ${NAMESPACE} finished." "$(timestamp)" - eval "echo Elapsed time: $(date -ud "@$elapsed" +'$((%s/3600/24)) days %H hr %M min %S sec')\n" - else - printf "%s Snapshot already in progress...\n" "$(timestamp)" - sleep 10 - # delete_stuck_volumesnapshots - fi - - printf "%s Sleeping for 10m due to Digital Ocean rate limit.\n" "$(timestamp)" - sleep 10m -done \ No newline at end of file diff --git a/charts/snapshotEngine/templates/configmap.yaml b/charts/snapshotEngine/templates/configmap.yaml index b8b2f9ec5..7e8f18111 100644 --- a/charts/snapshotEngine/templates/configmap.yaml +++ b/charts/snapshotEngine/templates/configmap.yaml @@ -15,7 +15,8 @@ data: SCHEMA_URL: {{ $.Values.schemaUrl }} S3_BUCKET: {{ $.Values.s3BucketOverride }} CLOUD_PROVIDER: {{ $.Values.cloudProvider }} - STORAGE_CLASS: {{$.Values.volumeSnapClass }} + STORAGE_CLASS: {{ $.Values.volumeSnapClass }} + NODES: {{ $.Values.nodes }} kind: ConfigMap metadata: name: snapshot-configmap diff --git a/charts/snapshotEngine/templates/snapshot-warmer.yaml b/charts/snapshotEngine/templates/snapshot-warmer.yaml deleted file mode 100644 index 2bf3271c0..000000000 --- a/charts/snapshotEngine/templates/snapshot-warmer.yaml +++ /dev/null @@ -1,47 +0,0 @@ -{{ define "snapshot-warmer.name" }} -{{- $history_mode := $.history_mode }} - {{- printf "%s-%s-%s" "snapshot-warmer-for" $history_mode "node" }} -{{- end }} - -{{- range $node, $config := .Values.nodes }} -{{- if $config }} - {{- $_ := set $ "history_mode" $config.history_mode }} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "snapshot-warmer.name" $ }} - namespace: {{ $.Release.Namespace }} - labels: - app: {{ include "snapshot-warmer.name" $ }} -spec: - replicas: 1 - selector: - matchLabels: - app: {{ include "snapshot-warmer.name" $ }} - template: - metadata: - labels: - app: {{ include "snapshot-warmer.name" $ }} - spec: - serviceAccountName: {{ $.Values.service_account }} - containers: - - name: {{ include "snapshot-warmer.name" $ }} - image: {{ $.Values.tezos_k8s_images.snapshotEngine }} - imagePullPolicy: Always - command: ["/bin/bash"] - args: - - "-c" - - | -{{ $.Files.Get "scripts/snapshot-warmer.sh" | indent 14 }} - env: - - name: VOLUME_SNAPSHOT_CLASS - value: {{ $.Values.volumeSnapClass }} - - name: NAMESPACE - value: {{ $.Release.Namespace }} - - name: NODE_CLASS - value: {{ $node }} - - name: NODE_CONFIG - value: {{ $config | toJson | quote }} ---- -{{- end }} -{{- end }} diff --git a/snapshotEngine/snapshot-maker.sh b/snapshotEngine/snapshot-maker.sh index f5dd717b3..f92823c5f 100755 --- a/snapshotEngine/snapshot-maker.sh +++ b/snapshotEngine/snapshot-maker.sh @@ -1,10 +1,58 @@ #!/bin/bash +# Delete all volumesnapshots so they arent setting around accruing charges +kubectl delete vs -l history_mode=$HISTORY_MODE + +PERSISTENT_VOLUME_CLAIM="var-volume-snapshot-${HISTORY_MODE}-node-0" + +# For yq to work, the values resulting from the above cmds need to be exported. +# We don't export them inline because of +# https://github.com/koalaman/shellcheck/wiki/SC2155 +export HISTORY_MODE +export PERSISTENT_VOLUME_CLAIM + +yq e -i '.metadata.namespace=strenv(NAMESPACE)' createVolumeSnapshot.yaml +yq e -i '.metadata.labels.history_mode=strenv(HISTORY_MODE)' createVolumeSnapshot.yaml +yq e -i '.spec.source.persistentVolumeClaimName=strenv(PERSISTENT_VOLUME_CLAIM)' createVolumeSnapshot.yaml +yq e -i '.spec.volumeSnapshotClassName=strenv(STORAGE_CLASS)' createVolumeSnapshot.yaml + +# Returns list of snapshots with a given status +# readyToUse true/false +getSnapshotNames() { + local readyToUse="${1##readyToUse=}" + shift + if [ -z "$readyToUse" ]; then + echo "Error: No jsonpath for volumesnapshots' ready status was provided." + exit 1 + fi + kubectl get volumesnapshots -o jsonpath="{.items[?(.status.readyToUse==$readyToUse)].metadata.name}" --namespace "$NAMESPACE" "$@" +} + +# SLEEP_TIME=0m + +# if [ "${HISTORY_MODE}" = "archive" ]; then +# SLEEP_TIME="${ARCHIVE_SLEEP_DELAY}" +# if [ "${ARCHIVE_SLEEP_DELAY}" != "0m" ]; then +# printf "%s artifactDelay.archive is set to %s sleeping...\n" "$(date "+%Y-%m-%d %H:%M:%S")" "${ARCHIVE_SLEEP_DELAY}" +# fi +# elif [ "${HISTORY_MODE}" = "rolling" ]; then +# SLEEP_TIME="${ROLLING_SLEEP_DELAY}" +# if [ "${ROLLING_SLEEP_DELAY}" != "0m" ]; then +# printf "%s artifactDelay.rolling is set to %s sleeping...\n" "$(date "+%Y-%m-%d %H:%M:%S")" "${ROLLING_SLEEP_DELAY}" +# fi +# fi + +# if [ "${SLEEP_TIME}" = "0m" ]; then +# printf "%s artifactDelay.HISTORY_MODE was not set! No delay...\n" "$(date "+%Y-%m-%d %H:%M:%S")" +# fi + +# sleep "${SLEEP_TIME}" + cd / ZIP_AND_UPLOAD_JOB_NAME=zip-and-upload-"${HISTORY_MODE}" -# Delete zip-and-upload job +# Delete zip-and-upload job if still around if kubectl get job "${ZIP_AND_UPLOAD_JOB_NAME}"; then printf "%s Old zip-and-upload job exits. Attempting to delete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" if ! kubectl delete jobs "${ZIP_AND_UPLOAD_JOB_NAME}"; then @@ -16,7 +64,7 @@ else printf "%s No old zip-and-upload job detected for cleanup.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" fi -# Delete old PVCs +# Delete old PVCs if still around if [ "${HISTORY_MODE}" = rolling ]; then if [ "$(kubectl get pvc rolling-tarball-restore)" ]; then printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -40,12 +88,34 @@ if [ "$(kubectl get pvc "${HISTORY_MODE}"-snap-volume)" ]; then sleep 5 fi -# while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do -# printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" -# sleep 10 -# done +# Take volume snapshot +current_date=$(date "+%Y-%m-%d-%H-%M-%S" "$@") +export SNAPSHOT_NAME="$current_date-$HISTORY_MODE-node-snapshot" +# Update volume snapshot name +yq e -i '.metadata.name=strenv(SNAPSHOT_NAME)' createVolumeSnapshot.yaml -printf "%s EBS Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" +printf "%s Creating snapshot ${SNAPSHOT_NAME} in ${NAMESPACE}.\n" "$(timestamp)" + +# Create snapshot +if ! kubectl apply -f createVolumeSnapshot.yaml; then + printf "%s ERROR creating volumeSnapshot ${SNAPSHOT_NAME} in ${NAMESPACE} .\n" "$(timestamp)" + exit 1 +fi + +sleep 5 + +# Wait for snapshot to finish +until [ "$(getSnapshotNames readyToUse=true -l history_mode="${HISTORY_MODE}")" ]; do + printf "%s Snapshot in progress. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" + until [ "$(getSnapshotNames readyToUse=true -l history_mode="${HISTORY_MODE}")" ]; do + sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for node + if [ "$(getSnapshotNames readyToUse=true -l history_mode="${HISTORY_MODE}")" ]; then + break + fi + done +done + +printf "%s Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" SNAPSHOTS=$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==true)].metadata.name}' -l history_mode="${HISTORY_MODE}") NEWEST_SNAPSHOT=${SNAPSHOTS##* } @@ -126,6 +196,11 @@ then exit 1 fi +sleep 5 + +# Delete all volumesnapshots so they arent setting around accruing charges +kubectl delete vs -l history_mode=$HISTORY_MODE + # TODO Check for PVC printf "%s PersistentVolumeClaim ${HISTORY_MODE}-snap-volume created successfully in namespace ${NAMESPACE}.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -183,23 +258,6 @@ if [ "${HISTORY_MODE}" = archive ]; then yq eval -i "del(.spec.template.spec.containers[0].volumeMounts[2])" mainJob.yaml fi -# # Switch alternate cloud provider secret name based on actual cloud provider -# if [[ -n "${CLOUD_PROVIDER}" ]]; then -# # Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted. -# SECRET_NAME="${NAMESPACE}-secret" -# # Index of zip-and-upload container changes depending on if rolling job or archive job -# NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml) -# # Index of mounts also changes depending on history mode -# NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml ) -# # Secret volume mount is last item in list of volumeMounts for the zip and upload container -# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml -# # Index of job volumes change depending on history mode -# NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml ) -# # Setting job secret volume to value set by workflow -# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml -# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml -# fi - # Service account to be used by entire zip-and-upload job. SERVICE_ACCOUNT="${SERVICE_ACCOUNT}" yq e -i '.spec.template.spec.serviceAccountName=strenv(SERVICE_ACCOUNT)' mainJob.yaml @@ -218,7 +276,7 @@ sleep 20 while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do printf "%s Waiting for zip-and-upload job to complete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do - sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job + sleep 2m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job if [ "$(kubectl get pod -l job-name=zip-and-upload-"${HISTORY_MODE}" --namespace="${NAMESPACE}"| grep -i -e error -e evicted -e pending)" ] || \ [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace="${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].type}')" ] ; then printf "%s Zip-and-upload job failed. This job will end and a new snapshot will be taken.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" @@ -269,6 +327,9 @@ if ! [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMES sleep 5 fi + # Delete all volumesnapshots so they arent setting around accruing charges + kubectl delete vs -l history_mode=$HISTORY_MODE + SLEEP_TIME=0m if [ "${HISTORY_MODE}" = "archive" ]; then @@ -296,3 +357,4 @@ sleep 5 kubectl delete -f volumeFromSnap.yaml | while IFS= read -r line; do printf '%s %s\n' "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "$line"; done sleep 5 kubectl delete job snapshot-maker --namespace "${NAMESPACE}" +