Skip to content

Commit

Permalink
fix: backup in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
mrekucci committed Sep 4, 2024
1 parent 39d5aaa commit 4e26f11
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 38 deletions.
52 changes: 23 additions & 29 deletions infrastructure/nomad/playbooks/destroy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@

- name: Stop Jobs
ansible.builtin.shell: |
NOMAD_JOBS=$(nomad job status -json | jq -r '.[].Summary.JobID')
NOMAD_JOBS=$(nomad job status -json | jq -r 'sort_by(.Allocations[0].CreateTime) | reverse | .[].Summary.JobID')
for job in $(echo "${NOMAD_JOBS}" | grep -v artifacts); do
if [ "${job}" != "null" ]; then
nomad stop "${job}"
TIMEOUT=600
while true; do
STATUS=$(nomad job status -json "${job}" | jq -r '.[0].Allocations[0].ClientStatus')
case "${STATUS}" in
Expand All @@ -29,11 +28,6 @@
;;
*)
sleep 1
TIMEOUT=$((TIMEOUT - 1))
if [ "${TIMEOUT}" -eq 0 ]; then
echo "Timeout waiting for ${job} to stop has been exceeded."
return 1
fi
;;
esac
done
Expand All @@ -47,31 +41,31 @@
executable: bash
when: backup is defined and backup

- name: Purge Stopped Jobs
  ansible.builtin.shell: |
    # Poll until no job reports running or starting allocations, then purge
    # the Nomad variables and trigger a garbage collection.
    # TIMEOUT is the number of polls allowed; one second is slept between polls.
    TIMEOUT=30
    while [ "$(nomad job status)" != "No running jobs" ]; do
      # JSON array of the IDs of jobs that still have at least one task group
      # with a Running or Starting count above zero.
      RUNNING_JOBS=$(nomad job status -json | jq '[.[] | select(.Summary.Summary | to_entries[] | select(.value.Running > 0 or .value.Starting > 0)) | .Summary.JobID]')
      if [ "${RUNNING_JOBS}" = "[]" ]; then
        break
      fi
      sleep 1
      TIMEOUT=$((TIMEOUT - 1))
      if [ "${TIMEOUT}" -eq 0 ]; then
        echo "Timeout waiting for jobs to stop has been exceeded."
        # Use exit, not return: this script is not a function or a sourced
        # file, so return would only print an error and the loop would keep
        # running, making the timeout ineffective and letting the purge
        # below run regardless.
        exit 1
      fi
    done
    nomad var purge {{ nomad_vars_path }}
    nomad system gc
  args:
    executable: bash
  when: backup is defined and backup
# NOTE(review): this task is disabled (every line below is commented out).
# Before re-enabling it, be aware of two issues in the script as written:
#   * `return 1` is only valid inside a function or a sourced script; in a
#     plain bash script it fails without terminating, so the timeout branch
#     would not abort the loop or the task.
#   * the jq filter ends in `.[].Summary.JobID`, which emits individual job
#     IDs (or nothing) rather than an array, so the comparison of
#     RUNNING_JOBS against "[]" cannot match; only the outer `while`
#     condition can end the loop.
# - name: Purge Stopped Jobs
# ansible.builtin.shell: |
# TIMEOUT=30
# while [ "$(nomad job status)" != "No running jobs" ]; do
# RUNNING_JOBS=$(nomad job status -json | jq '[.[] | select(.Summary.Summary | to_entries[] | select(.value.Running > 0 or .value.Starting > 0))] | sort_by(.Allocations[0].CreateTime) | reverse | .[].Summary.JobID')
# if [ "${RUNNING_JOBS}" = "[]" ]; then
# break
# fi
# sleep 1
# TIMEOUT=$((TIMEOUT - 1))
# if [ "${TIMEOUT}" -eq 0 ]; then
# echo "Timeout waiting for jobs to stop has been exceeded."
# return 1
# fi
# done
#
# nomad var purge {{ nomad_vars_path }}
# nomad system gc
# args:
# executable: bash
# when: backup is defined and backup

- name: Force Stop and Purge Jobs
ansible.builtin.shell: |
for job in $(nomad job status -json | jq -r '.[].Summary.JobID'); do
for job in $(nomad job status -json | jq -r 'sort_by(.Allocations[0].CreateTime) | reverse | .[].Summary.JobID'); do
if [ "${job}" != "null" ]; then
nomad stop -purge "${job}"
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ job "{{ job.name }}" {

{% if env == 'testenv' %}
resources {
cores = 2
memory = 16384
memory = 2048
}
{% endif %}

Expand Down Expand Up @@ -150,16 +149,17 @@ job "{{ job.name }}" {

echo "Restoring from backup file ${BACKUP_FILE} to ${GETH_DATA_DIR}"
echo "Backup file size: $(du -h ${BACKUP_FILE} | cut -f1)"
START_TIME=$(date +%s)
local/mev-commit-geth \
--cache=4096 \
--verbosity=5 \
--log.format="${GETH_LOG_FORMAT}" \
--log.tags="${GETH_LOG_TAGS}" \
--datadir="${GETH_DATA_DIR}" \
import ${BACKUP_FILE}

if [[ $? -eq 0 ]]; then
echo "Restore successful"
ELAPSED_TIME=$(($(date +%s) - START_TIME))
echo "Restore finished in: $(date -u -d@${ELAPSED_TIME} +%H:%M:%S)"
else
echo "Restore failed"
exit 1
Expand Down Expand Up @@ -325,8 +325,7 @@ job "{{ job.name }}" {

{% if env == 'testenv' %}
resources {
cores = 2
memory = 16384
memory = 2048
}
{% endif %}

Expand Down Expand Up @@ -386,7 +385,7 @@ job "{{ job.name }}" {
fi

BACKUP_FILE="local/backups/{{ version }}_{{ job.name }}-{% raw %}{{ env "NOMAD_ALLOC_INDEX" }}{% endraw %}_$(date +%Y%m%d%H%M%S)"
STATUS=$(nomad alloc status -address="http://{{ ansible_facts['default_ipv4']['address'] }}:4646" -json "${NOMAD_ALLOC_ID}")
STATUS=$(nomad alloc status -address="http://127.0.0.1:4646" -json "${NOMAD_ALLOC_ID}")
NON_ZERO_EXIT_EVENTS=$(echo "$STATUS" | jq -r '.TaskStates.node.Events[] | select(.ExitCode != 0)')
if [[ -n "${NON_ZERO_EXIT_EVENTS}" ]]; then
echo "The main task did not start or finish gracefully"
Expand All @@ -395,17 +394,18 @@ job "{{ job.name }}" {
BACKUP_FILE+=".rlp"

echo "Exporting chain data from ${GETH_DATA_DIR} to backup file: ${BACKUP_FILE}"
START_TIME=$(date +%s)
chmod +x local/mev-commit-geth
local/mev-commit-geth \
--cache 4096 \
--verbosity=5 \
--log.format="$GETH_LOG_FORMAT" \
--log.tags="$GETH_LOG_TAGS" \
--datadir="${GETH_DATA_DIR}" \
export ${BACKUP_FILE}

if [[ "$?" -eq 0 ]] && [[ -f "${BACKUP_FILE}" ]]; then
echo "Backup successful"
ELAPSED_TIME=$(($(date +%s) - START_TIME))
echo "Backup finished in: $(date -u -d@${ELAPSED_TIME} +%H:%M:%S)"
echo "Backup file size: $(du -h ${BACKUP_FILE} | cut -f1)"
else
echo "Backup failed"
Expand Down

0 comments on commit 4e26f11

Please sign in to comment.