From 6441dae46becd5655ee46377a9be708279e4cf2e Mon Sep 17 00:00:00 2001 From: mrekucci Date: Fri, 23 Aug 2024 17:56:31 +0200 Subject: [PATCH] feat: enable backup/restore of geth chaindata --- infrastructure/nomad/cluster.sh | 17 ++- infrastructure/nomad/playbooks/deploy.yml | 21 +++- infrastructure/nomad/playbooks/destroy.yml | 57 +++++++++- infrastructure/nomad/playbooks/init.yml | 9 ++ .../templates/jobs/mev-commit-geth.nomad.j2 | 100 +++++++++++++++++- .../playbooks/templates/services/nomad.hcl.j2 | 3 + 6 files changed, 189 insertions(+), 18 deletions(-) diff --git a/infrastructure/nomad/cluster.sh b/infrastructure/nomad/cluster.sh index 605520fc7..afab4c8ae 100755 --- a/infrastructure/nomad/cluster.sh +++ b/infrastructure/nomad/cluster.sh @@ -10,6 +10,7 @@ no_logs_collection_flag=false force_build_templates_flag=false skip_certificates_setup_flag=false release_flag=false +backup_flag=false deploy_version="HEAD" environment_name="devenv" profile_name="devnet" @@ -21,7 +22,7 @@ help() { echo "Usage:" echo "$0 [init [--environment ] [--skip-certificates-setup] [--debug]]" echo "$0 [deploy [version=HEAD] [--environment ] [--profile ] [--force-build-templates] [--no-logs-collection] [--datadog-key ] [--l1-rpc-url ] [--otel-collector-endpoint-url ] [--release] [--debug]]" - echo "$0 [destroy [--debug]] [--help]" + echo "$0 [destroy [--backup] [--debug]]" echo "$0 --help" echo echo "Parameters:" @@ -42,7 +43,8 @@ help() { echo " --debug Enable debug mode for detailed output." echo echo " destroy Destroy the whole cluster." - echo " --debug Enable debug mode for detailed output." + echo " --backup Create a backup before destroying the environment." + echo " --debug Enable debug mode for detailed output." echo echo " --help Display this help message." 
echo @@ -71,8 +73,8 @@ help() { echo " Deploy with a specific version, environment, profile in debug mode with disabled logs collection, Datadog API key, L1 RPC URL, and OpenTememetry Collector Endpoint URL:" echo " $0 deploy v0.1.0 --environment devenv --profile testnet --no-logs-collection --datadog-key your_datadog_key --l1-rpc-url your_rpc_url --otel-collector-endpoint-url your_otel_url --debug" echo - echo " Destroy with specific environment and debug mode:" - echo " $0 destroy --environment devenv --debug" + echo " Destroy all jobs but backup before do so:" + echo " $0 destroy --backup --debug" exit 1 } @@ -80,7 +82,7 @@ usage() { echo "Usage:" echo "$0 [init [--environment ] [--skip-certificates-setup] [--debug]]" echo "$0 [deploy [version=HEAD] [--environment ] [--profile ] [--force-build-templates] [--no-logs-collection] [--datadog-key ] [--l1-rpc-url ] [--otel-collector-endpoint-url ] [--release] [--debug]]" - echo "$0 [destroy [--debug]] [--help]" + echo "$0 [destroy [--backup] [--debug]]" echo "$0 --help" exit 1 } @@ -243,6 +245,10 @@ parse_args() { destroy) destroy_flag=true shift + if [[ $# -gt 0 && $1 == "--backup" ]]; then + backup_flag=true + shift + fi if [[ $# -gt 0 && $1 == "--debug" ]]; then debug_flag=true shift @@ -288,6 +294,7 @@ main() { ;; "${destroy_flag}") playbook+="destroy.yml" + [[ "${backup_flag}" == true ]] && flags+=("--extra-vars" "backup=true") ;; *) usage diff --git a/infrastructure/nomad/playbooks/deploy.yml b/infrastructure/nomad/playbooks/deploy.yml index 9c4f1531a..de606e524 100644 --- a/infrastructure/nomad/playbooks/deploy.yml +++ b/infrastructure/nomad/playbooks/deploy.yml @@ -84,7 +84,7 @@ when: version is not defined or version == '' or release - name: Set Artifacts Build Version - set_fact: + ansible.builtin.set_fact: build_artifacts: true version: "{{ artifacts_build_version.stdout }}" when: version is not defined or version == '' or release @@ -124,11 +124,11 @@ success_msg: "The profile name is set to: {{ profile 
}}." - name: Set Jobs Definition - set_fact: + ansible.builtin.set_fact: jobs: "{{ profiles[profile].jobs }}" - name: Disable Logs Collection - set_fact: + ansible.builtin.set_fact: jobs: >- {{ jobs @@ -138,7 +138,7 @@ when: no_logs_collection | default(false) | bool - name: Disable OpenTelemetry Trace Collection - set_fact: + ansible.builtin.set_fact: jobs: >- {{ jobs @@ -166,7 +166,7 @@ register: existing_environment - name: Set Existing Scripts Artifact Version as Stale - set_fact: + ansible.builtin.set_fact: build_templates: >- {{ (existing_environment.stdout | from_json) != environments[env] @@ -187,6 +187,16 @@ Build Templates: {{ 'yes' if build_templates | default(false) else 'no' }} tasks: + - name: Determine Cluster Status + ansible.builtin.shell: | + STATUS=$(nomad job status -json) + if [ "${STATUS}" != "No running jobs" ]; then + echo "Cluster has running jobs." + exit 1 + fi + args: + executable: bash + - name: Build keystore-generator ansible.builtin.shell: | BINARY_PATH="{{ dist_dir }}/keystore-generator-{{ environments[env].version }}" @@ -654,6 +664,7 @@ ansible.builtin.shell: | nomad var purge "nomad/jobs" nomad system gc + nomad var put -namespace=default "nomad/jobs" MEV_COMMIT_GETH_CHAIN_BACKUP="false" args: executable: bash diff --git a/infrastructure/nomad/playbooks/destroy.yml b/infrastructure/nomad/playbooks/destroy.yml index b58151b2a..e6801da21 100644 --- a/infrastructure/nomad/playbooks/destroy.yml +++ b/infrastructure/nomad/playbooks/destroy.yml @@ -2,20 +2,67 @@ hosts: nomad_clients gather_facts: no + vars: + nomad_vars_path: "nomad/jobs" + tasks: - - name: Stop and Purge Jobs + - name: Set Backup Var + ansible.builtin.shell: | + nomad var put -force -namespace=default {{ nomad_vars_path }} MEV_COMMIT_GETH_CHAIN_BACKUP="true" + args: + executable: bash + when: backup is defined and backup + + - name: Stop Jobs ansible.builtin.shell: | - for job in $(nomad job status -json | jq -r '.[].Summary.JobID'); do + NOMAD_JOBS=$(nomad job status 
-json | jq -r '.[].Summary.JobID') + for job in $(echo "${NOMAD_JOBS}" | grep -v artifacts); do if [ "${job}" != "null" ]; then - nomad stop -purge "${job}" + nomad stop "${job}" + + TIMEOUT=60 + while true; do + STATUS=$(nomad job status -json "${job}" | jq -r '.[0].Allocations[0].ClientStatus') + case "${STATUS}" in + "failed" | "complete") + break + ;; + *) + sleep 1 + TIMEOUT=$((TIMEOUT - 1)) + if [ "${TIMEOUT}" -eq 0 ]; then + echo "Timeout waiting for ${job} to stop has been exceeded." + exit 1 + fi + ;; + esac + done fi done + + if echo "${NOMAD_JOBS}" | grep -q artifacts; then + nomad stop -yes "artifacts" + fi args: executable: bash - - name: Purge Cluster + - name: Purge Stopped Jobs ansible.builtin.shell: | - nomad var purge "nomad/jobs" + TIMEOUT=30 + while [ "$(nomad job status)" != "No running jobs" ]; do + RUNNING_JOBS=$(nomad job status -json | jq '[.[] | select(.Summary.Summary | to_entries[] | select(.value.Running > 0 or .value.Starting > 0)) | .Summary.JobID]') + if [ "${RUNNING_JOBS}" = "[]" ]; then + break + fi + sleep 1 + TIMEOUT=$((TIMEOUT - 1)) + if [ "${TIMEOUT}" -eq 0 ]; then + echo "Timeout waiting for jobs to stop has been exceeded." 
+ exit 1 + fi + done + + nomad var purge {{ nomad_vars_path }} nomad system gc args: executable: bash diff --git a/infrastructure/nomad/playbooks/init.yml b/infrastructure/nomad/playbooks/init.yml index 4f2335735..b8c3d51d4 100644 --- a/infrastructure/nomad/playbooks/init.yml +++ b/infrastructure/nomad/playbooks/init.yml @@ -115,6 +115,15 @@ become: true become_user: "{{ ansible_user }}" + - name: Ensure "/tmp/{{ env }}" Directory Exists + ansible.builtin.file: + path: "/tmp/{{ env }}" + state: directory + mode: "0744" + recurse: yes + become: true + become_user: "{{ ansible_user }}" + tasks: - name: Add DataDog Repository Key ansible.builtin.apt_key: diff --git a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2 index f2135b00e..607a30788 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2 @@ -36,8 +36,15 @@ job "{{ job.name }}" { {% endfor %} } + volume "tmp-volume" { + type = "host" + source = "tmp-volume" + read_only = false + } + task "node" { driver = "exec" + kill_timeout = "25s" {% if profile == 'testnet' %} resources { @@ -79,7 +86,7 @@ job "{{ job.name }}" { template { data = <<-EOH {%- raw %} - GETH_DATA_DIR="local/data-{{ env "NOMAD_ALLOC_INDEX" }}" + GETH_DATA_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}" {% endraw %} GENESIS_L1_PATH="local/genesis_{{ version }}.json" GETH_BIN_PATH="local/mev-commit-geth" @@ -102,7 +109,7 @@ job "{{ job.name }}" { {% endraw %} {% elif job.env['type'] == 'signer' %} {%- raw %} - GETH_KEYSTORE_DIR="/local/data-{{ env "NOMAD_ALLOC_INDEX" }}/keystore" + GETH_KEYSTORE_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}/keystore" GETH_KEYSTORE_FILENAME="{{ with secret "secret/data/mev-commit" }}{{ .Data.data.{% endraw %}{{ job.artifacts | selectattr('keystore', 'defined') | map(attribute='keystore.name') | first }}{% 
raw %}_filename }}{{ end }}" GETH_KEYSTORE_PASSWORD="{{ with secret "secret/data/mev-commit" }}{{ .Data.data.{% endraw %}{{ job.artifacts | selectattr('keystore', 'defined') | map(attribute='keystore.name') | first }}{% raw %}_password }}{{ end }}" {% endraw %} @@ -150,7 +157,94 @@ job "{{ job.name }}" { {% endif %} chmod +x local/mev-commit-geth local/entrypoint.sh - local/entrypoint.sh + exec local/entrypoint.sh + EOH + destination = "local/run.sh" + perms = "0755" + } + + config { + command = "bash" + args = ["-c", "exec local/run.sh"] + } + } + + task "backup" { + driver = "exec" + + lifecycle { + hook = "poststop" + } + + {% if env == 'testenv' %} + resources { + cores = 6 + memory = 32768 + } + {% endif %} + + volume_mount { + volume = "tmp-volume" + destination = "/local/backups" + read_only = false + } + + {% if env != 'devenv' %} + artifact { + source = "https://primev-infrastructure-artifacts.s3.us-west-2.amazonaws.com/mev-commit-geth_{{ version }}_Linux_{{ target_system_architecture }}.tar.gz" + } + {% else %} + artifact { + source = "http://{{ ansible_facts['default_ipv4']['address'] }}:1111/mev-commit-geth_{{ version }}_Linux_{{ target_system_architecture }}.tar.gz" + } + {% endif %} + + template { + data = <<-EOH + {%- raw %} + GETH_DATA_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}" + {% endraw %} + EOH + destination = "secrets/.env" + env = true + } + + template { + data = <<-EOH + #!/usr/bin/env bash + + {% raw %} + {{ with nomadVar "nomad/jobs" }} + BACKUP="{{ .MEV_COMMIT_GETH_CHAIN_BACKUP }}" + {{ end }} + {% endraw %} + + if [[ "${BACKUP}" != "true" ]]; then + echo "Backup not requested, skipping..." 
+ exit 0 + fi + + BACKUP_FILE="local/backups/{{ version }}_{{ job.name }}-{% raw %}{{ env "NOMAD_ALLOC_INDEX" }}{% endraw %}_$(date +%Y%m%d%H%M%S)" + STATUS=$(nomad alloc status -address="http://{{ ansible_facts['default_ipv4']['address'] }}:4646" -json "${NOMAD_ALLOC_ID}") + NON_ZERO_EXIT_EVENTS=$(echo "$STATUS" | jq -r '.TaskStates.node.Events[] | select(.ExitCode != 0)') + if [[ -n "${NON_ZERO_EXIT_EVENTS}" ]]; then + echo "The main task did not start or finish gracefully." + BACKUP_FILE+="-dirty" + fi + BACKUP_FILE+=".tar.gz" + + chmod +x local/mev-commit-geth + local/mev-commit-geth export \ + --cache 4096 \ + --syncmode "full" \ + --datadir "${GETH_DATA_DIR}" \ + "${BACKUP_FILE}" + + if [[ -f "${BACKUP_FILE}" ]]; then + echo "Backup successful." + else + echo "Backup failed." + fi EOH destination = "local/run.sh" perms = "0755" diff --git a/infrastructure/nomad/playbooks/templates/services/nomad.hcl.j2 b/infrastructure/nomad/playbooks/templates/services/nomad.hcl.j2 index 78c5fdca6..f45abf2e6 100644 --- a/infrastructure/nomad/playbooks/templates/services/nomad.hcl.j2 +++ b/infrastructure/nomad/playbooks/templates/services/nomad.hcl.j2 @@ -28,6 +28,9 @@ client { artifact { decompression_file_count_limit = 8192 } + host_volume "tmp-volume" { + path = "/tmp/{{ env }}" + } {% if env == "devenv" %} host_volume "artifacts-volume" { path = "{{ ansible_user_home }}/{{ env }}/artifacts"