Skip to content

Commit

Permalink
feat: enable backup/restore of geth chaindata
Browse files Browse the repository at this point in the history
  • Loading branch information
mrekucci committed Aug 27, 2024
1 parent 8080525 commit 6441dae
Show file tree
Hide file tree
Showing 6 changed files with 189 additions and 18 deletions.
17 changes: 12 additions & 5 deletions infrastructure/nomad/cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ no_logs_collection_flag=false
force_build_templates_flag=false
skip_certificates_setup_flag=false
release_flag=false
backup_flag=false
deploy_version="HEAD"
environment_name="devenv"
profile_name="devnet"
Expand All @@ -21,7 +22,7 @@ help() {
echo "Usage:"
echo "$0 [init [--environment <name=devenv>] [--skip-certificates-setup] [--debug]]"
echo "$0 [deploy [version=HEAD] [--environment <name=devenv>] [--profile <name=devnet>] [--force-build-templates] [--no-logs-collection] [--datadog-key <key>] [--l1-rpc-url <url>] [--otel-collector-endpoint-url <url>] [--release] [--debug]]"
echo "$0 [destroy [--debug]] [--help]"
echo "$0 [destroy [--backup] [--debug]]"
echo "$0 --help"
echo
echo "Parameters:"
Expand All @@ -42,7 +43,8 @@ help() {
echo " --debug Enable debug mode for detailed output."
echo
echo " destroy Destroy the whole cluster."
echo " --debug Enable debug mode for detailed output."
echo " --backup Create a backup before destroying the environment."
echo " --debug Enable debug mode for detailed output."
echo
echo " --help Display this help message."
echo
Expand Down Expand Up @@ -71,16 +73,16 @@ help() {
echo " Deploy with a specific version, environment, profile in debug mode with disabled logs collection, Datadog API key, L1 RPC URL, and OpenTememetry Collector Endpoint URL:"
echo " $0 deploy v0.1.0 --environment devenv --profile testnet --no-logs-collection --datadog-key your_datadog_key --l1-rpc-url your_rpc_url --otel-collector-endpoint-url your_otel_url --debug"
echo
echo " Destroy with specific environment and debug mode:"
echo " $0 destroy --environment devenv --debug"
echo " Destroy all jobs but backup before do so:"
echo " $0 destroy --backup --debug"
exit 1
}

#######################################
# Print the condensed command synopsis and terminate the script.
# Only the current `destroy` syntax is listed; the superseded
# "[destroy [--debug]] [--help]" form is no longer printed, because it
# contradicted the line that replaced it (it omitted --backup and showed
# --help as if it were a destroy option).
# Globals:   none
# Arguments: none
# Outputs:   synopsis lines on stdout
# Returns:   never returns; exits with status 1
#######################################
usage() {
  echo "Usage:"
  echo "$0 [init [--environment <name=devenv>] [--skip-certificates-setup] [--debug]]"
  echo "$0 [deploy [version=HEAD] [--environment <name=devenv>] [--profile <name=devnet>] [--force-build-templates] [--no-logs-collection] [--datadog-key <key>] [--l1-rpc-url <url>] [--otel-collector-endpoint-url <url>] [--release] [--debug]]"
  echo "$0 [destroy [--backup] [--debug]]"
  echo "$0 --help"
  exit 1
}
Expand Down Expand Up @@ -243,6 +245,10 @@ parse_args() {
destroy)
destroy_flag=true
shift
if [[ $# -gt 0 && $1 == "--backup" ]]; then
backup_flag=true
shift
fi
if [[ $# -gt 0 && $1 == "--debug" ]]; then
debug_flag=true
shift
Expand Down Expand Up @@ -288,6 +294,7 @@ main() {
;;
"${destroy_flag}")
playbook+="destroy.yml"
[[ "${backup_flag}" == true ]] && flags+=("--extra-vars" "backup=true")
;;
*)
usage
Expand Down
21 changes: 16 additions & 5 deletions infrastructure/nomad/playbooks/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
when: version is not defined or version == '' or release

- name: Set Artifacts Build Version
set_fact:
ansible.builtin.set_fact:
build_artifacts: true
version: "{{ artifacts_build_version.stdout }}"
when: version is not defined or version == '' or release
Expand Down Expand Up @@ -124,11 +124,11 @@
success_msg: "The profile name is set to: {{ profile }}."

- name: Set Jobs Definition
set_fact:
ansible.builtin.set_fact:
jobs: "{{ profiles[profile].jobs }}"

- name: Disable Logs Collection
set_fact:
ansible.builtin.set_fact:
jobs: >-
{{
jobs
Expand All @@ -138,7 +138,7 @@
when: no_logs_collection | default(false) | bool

- name: Disable OpenTelemetry Trace Collection
set_fact:
ansible.builtin.set_fact:
jobs: >-
{{
jobs
Expand Down Expand Up @@ -166,7 +166,7 @@
register: existing_environment

- name: Set Existing Scripts Artifact Version as Stale
set_fact:
ansible.builtin.set_fact:
build_templates: >-
{{
(existing_environment.stdout | from_json) != environments[env]
Expand All @@ -187,6 +187,16 @@
Build Templates: {{ 'yes' if build_templates | default(false) else 'no' }}
tasks:
# Guard task: the shell exits non-zero (failing the task) unless
# `nomad job status -json` prints exactly "No running jobs".
- name: Determine Cluster Status
  ansible.builtin.shell: |
    cluster_state="$(nomad job status -json)"
    # Anything other than the exact "no jobs" message counts as a busy cluster.
    [[ "${cluster_state}" == "No running jobs" ]] && exit 0
    echo "Cluster has running jobs."
    exit 1
  args:
    executable: bash

- name: Build keystore-generator
ansible.builtin.shell: |
BINARY_PATH="{{ dist_dir }}/keystore-generator-{{ environments[env].version }}"
Expand Down Expand Up @@ -654,6 +664,7 @@
ansible.builtin.shell: |
nomad var purge "nomad/jobs"
nomad system gc
nomad var put -namespace=default "nomad/jobs" MEV_COMMIT_GETH_CHAIN_BACKUP="false"
args:
executable: bash

Expand Down
57 changes: 52 additions & 5 deletions infrastructure/nomad/playbooks/destroy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,67 @@
hosts: nomad_clients
gather_facts: no

vars:
nomad_vars_path: "nomad/jobs"

tasks:
- name: Stop and Purge Jobs
# Sets MEV_COMMIT_GETH_CHAIN_BACKUP="true" in the Nomad variable at
# nomad_vars_path, overwriting any existing value (-force).
# The condition goes through `default(false) | bool` so an extra-var passed
# as the string "false" (or left undefined) skips the task; a bare
# `when: backup` would treat any non-empty string as true.
- name: Set Backup Var
  ansible.builtin.shell: |
    nomad var put -force -namespace=default "{{ nomad_vars_path }}" MEV_COMMIT_GETH_CHAIN_BACKUP="true"
  args:
    executable: bash
  when: backup | default(false) | bool

# Stops every job except "artifacts" one at a time (without -purge) and
# waits up to 60 polls, one second apart, for the job's first listed
# allocation to report "failed" or "complete". The "artifacts" job, if
# present, is stopped last.
- name: Stop Jobs
  ansible.builtin.shell: |
    NOMAD_JOBS=$(nomad job status -json | jq -r '.[].Summary.JobID')
    for job in $(echo "${NOMAD_JOBS}" | grep -v artifacts); do
      if [ "${job}" != "null" ]; then
        nomad stop "${job}"
        TIMEOUT=60
        while true; do
          # Only the first allocation of the job is inspected.
          STATUS=$(nomad job status -json "${job}" | jq -r '.[0].Allocations[0].ClientStatus')
          case "${STATUS}" in
            "failed" | "complete")
              break
              ;;
            *)
              sleep 1
              TIMEOUT=$((TIMEOUT - 1))
              if [ "${TIMEOUT}" -le 0 ]; then
                echo "Timeout waiting for ${job} to stop has been exceeded."
                # `exit`, not `return`: this script is not a function, so
                # `return` would only print an error and the loop would
                # keep spinning forever.
                exit 1
              fi
              ;;
          esac
        done
      fi
    done
    if echo "${NOMAD_JOBS}" | grep -q artifacts; then
      nomad stop -yes "artifacts"
    fi
  args:
    executable: bash

- name: Purge Cluster
# Waits up to 30 polls, one second apart, until no job reports Running or
# Starting task groups, then purges the Nomad variable at nomad_vars_path
# and runs `nomad system gc`.
# The variable is purged only after the wait loop, so it is still present
# while jobs are stopping.
- name: Purge Stopped Jobs
  ansible.builtin.shell: |
    TIMEOUT=30
    while [ "$(nomad job status)" != "No running jobs" ]; do
      RUNNING_JOBS=$(nomad job status -json | jq '[.[] | select(.Summary.Summary | to_entries[] | select(.value.Running > 0 or .value.Starting > 0)) | .Summary.JobID]')
      if [ "${RUNNING_JOBS}" = "[]" ]; then
        break
      fi
      sleep 1
      TIMEOUT=$((TIMEOUT - 1))
      if [ "${TIMEOUT}" -le 0 ]; then
        echo "Timeout waiting for jobs to stop has been exceeded."
        # `exit`, not `return`: outside a function `return` does not abort
        # the script, which would leave this loop running indefinitely.
        exit 1
      fi
    done
    nomad var purge "{{ nomad_vars_path }}"
    nomad system gc
  args:
    executable: bash
9 changes: 9 additions & 0 deletions infrastructure/nomad/playbooks/init.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,15 @@
become: true
become_user: "{{ ansible_user }}"

# Creates the per-environment directory /tmp/<env> as the Ansible user.
# The same path is exposed to Nomad as host_volume "tmp-volume" in the
# client configuration template.
# NOTE(review): mode 0744 gives group/other no search (x) bit on the
# directory, and `recurse` applies the mode to existing contents as well —
# confirm this is intended.
- name: Ensure "/tmp/{{ env }}" Directory Exists
ansible.builtin.file:
path: "/tmp/{{ env }}"
state: directory
mode: "0744"
recurse: yes
become: true
become_user: "{{ ansible_user }}"

tasks:
- name: Add DataDog Repository Key
ansible.builtin.apt_key:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,15 @@ job "{{ job.name }}" {
{% endfor %}
}

# Requests the host volume "tmp-volume" read-write for this group; the
# "backup" task mounts it at /local/backups.
volume "tmp-volume" {
type = "host"
source = "tmp-volume"
read_only = false
}

task "node" {
driver = "exec"
kill_timeout = "25s"

{% if profile == 'testnet' %}
resources {
Expand Down Expand Up @@ -79,7 +86,7 @@ job "{{ job.name }}" {
template {
data = <<-EOH
{%- raw %}
GETH_DATA_DIR="local/data-{{ env "NOMAD_ALLOC_INDEX" }}"
GETH_DATA_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}"
{% endraw %}
GENESIS_L1_PATH="local/genesis_{{ version }}.json"
GETH_BIN_PATH="local/mev-commit-geth"
Expand All @@ -102,7 +109,7 @@ job "{{ job.name }}" {
{% endraw %}
{% elif job.env['type'] == 'signer' %}
{%- raw %}
GETH_KEYSTORE_DIR="/local/data-{{ env "NOMAD_ALLOC_INDEX" }}/keystore"
GETH_KEYSTORE_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}/keystore"
GETH_KEYSTORE_FILENAME="{{ with secret "secret/data/mev-commit" }}{{ .Data.data.{% endraw %}{{ job.artifacts | selectattr('keystore', 'defined') | map(attribute='keystore.name') | first }}{% raw %}_filename }}{{ end }}"
GETH_KEYSTORE_PASSWORD="{{ with secret "secret/data/mev-commit" }}{{ .Data.data.{% endraw %}{{ job.artifacts | selectattr('keystore', 'defined') | map(attribute='keystore.name') | first }}{% raw %}_password }}{{ end }}"
{% endraw %}
Expand Down Expand Up @@ -150,7 +157,94 @@ job "{{ job.name }}" {
{% endif %}

chmod +x local/mev-commit-geth local/entrypoint.sh
local/entrypoint.sh
exec local/entrypoint.sh
EOH
destination = "local/run.sh"
perms = "0755"
}

config {
command = "bash"
args = ["-c", "exec local/run.sh"]
}
}

task "backup" {
driver = "exec"

lifecycle {
hook = "poststop"
}

{% if env == 'testenv' %}
resources {
cores = 6
memory = 32768
}
{% endif %}

volume_mount {
volume = "tmp-volume"
destination = "/local/backups"
read_only = false
}

{% if env != 'devenv' %}
artifact {
source = "https://primev-infrastructure-artifacts.s3.us-west-2.amazonaws.com/mev-commit-geth_{{ version }}_Linux_{{ target_system_architecture }}.tar.gz"
}
{% else %}
artifact {
source = "http://{{ ansible_facts['default_ipv4']['address'] }}:1111/mev-commit-geth_{{ version }}_Linux_{{ target_system_architecture }}.tar.gz"
}
{% endif %}

template {
data = <<-EOH
{%- raw %}
GETH_DATA_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}"
{% endraw %}
EOH
destination = "secrets/.env"
env = true
}

template {
data = <<-EOH
#!/usr/bin/env bash
# Backup script for the poststop "backup" task.
# Exports the geth chain from GETH_DATA_DIR into local/backups when the
# Nomad variable MEV_COMMIT_GETH_CHAIN_BACKUP equals "true"; otherwise it
# exits 0 without doing anything.
# The output name gets a "-dirty" suffix when the "node" task recorded any
# event with a non-zero exit code.
# The exit status on a failed export is left unchanged (0); only the
# reported message differs.

{% raw %}
{{ with nomadVar "nomad/jobs" }}
BACKUP="{{ .MEV_COMMIT_GETH_CHAIN_BACKUP }}"
{{ end }}
{% endraw %}

if [[ "${BACKUP}" != "true" ]]; then
  echo "Backup not requested, skipping..."
  exit 0
fi

BACKUP_FILE="local/backups/{{ version }}_{{ job.name }}-{% raw %}{{ env "NOMAD_ALLOC_INDEX" }}{% endraw %}_$(date +%Y%m%d%H%M%S)"
STATUS=$(nomad alloc status -address="http://{{ ansible_facts['default_ipv4']['address'] }}:4646" -json "${NOMAD_ALLOC_ID}")
NON_ZERO_EXIT_EVENTS=$(echo "$STATUS" | jq -r '.TaskStates.node.Events[] | select(.ExitCode != 0)')
if [[ -n "${NON_ZERO_EXIT_EVENTS}" ]]; then
  echo "The main task did not start or finish gracefully."
  BACKUP_FILE+="-dirty"
fi
# NOTE(review): presumably geth export writes a gzip stream rather than a
# tar archive, so the .tar.gz suffix may be misleading - confirm.
BACKUP_FILE+=".tar.gz"

chmod +x local/mev-commit-geth
# Paths are quoted so a data dir or file name containing whitespace or glob
# characters is passed as a single argument. Success is reported only when
# the export itself succeeded AND the file exists; a leftover partial file
# from a failed export is no longer reported as a successful backup.
if local/mev-commit-geth export \
  --cache 4096 \
  --syncmode "full" \
  --datadir "${GETH_DATA_DIR}" \
  "${BACKUP_FILE}" \
  && [[ -f "${BACKUP_FILE}" ]]; then
  echo "Backup successful."
else
  echo "Backup failed."
fi
EOH
destination = "local/run.sh"
perms = "0755"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ client {
artifact {
decompression_file_count_limit = 8192
}
# Exposes the per-environment directory /tmp/<env> to jobs as the host
# volume "tmp-volume".
host_volume "tmp-volume" {
path = "/tmp/{{ env }}"
}
{% if env == "devenv" %}
host_volume "artifacts-volume" {
path = "{{ ansible_user_home }}/{{ env }}/artifacts"
Expand Down

0 comments on commit 6441dae

Please sign in to comment.