# Patch summary: opt-in chain backup/restore for the Nomad cluster tooling.
#
#   cluster.sh                 adds `destroy --backup`; when set, passes
#                              `--extra-vars backup=true` to destroy.yml.
#   playbooks/deploy.yml       uses the fully qualified ansible.builtin.set_fact,
#                              adds a "Determine Cluster Status" task that exits 1
#                              unless `nomad job status -json` prints
#                              "No running jobs", and writes
#                              MEV_COMMIT_GETH_CHAIN_BACKUP="false" after the purge.
#   playbooks/destroy.yml      writes MEV_COMMIT_GETH_CHAIN_BACKUP="true" when
#                              `backup` is set, stops every job except "artifacts"
#                              and polls its first allocation until it is
#                              failed/complete, stops "artifacts" last, then waits
#                              for running jobs to drain before purging.
#   playbooks/init.yml         creates /tmp/{{ env }}.
#   templates/services/nomad.hcl.j2
#                              exposes /tmp/{{ env }} as host volume "tmp-volume".
#   templates/jobs/mev-commit-geth.nomad.j2
#                              adds a prestart "restore" task and a poststop
#                              "backup" task that share "tmp-volume", and moves the
#                              geth data dir to /alloc/data/node-<alloc index>.
#
# Review fixes folded into the added ('+') lines; hunk line counts are unchanged:
#   * destroy.yml: the two timeout branches used `return 1`. These scripts run
#     as top-level bash (not a function, not sourced), where `return` only
#     prints an error and execution continues; TIMEOUT would then go negative
#     and the wait loops would never terminate. They now use `exit 1`.
#   * mev-commit-geth.nomad.j2: "${BACKUP_FILE}" is quoted in the geth
#     import/export invocations and in `du -h`, matching its other uses.
#
# NOTE(review): the text this was rebuilt from had lost its line breaks and
#   indentation. Line structure was checked against every hunk header count,
#   but the indentation of context lines is a best guess - confirm against the
#   repository (or apply with --ignore-whitespace).
# NOTE(review): the `index` lines are the original ones; the post-image blob
#   ids no longer describe destroy.yml and mev-commit-geth.nomad.j2 exactly.
# NOTE(review): not changed here, please confirm intent:
#   - `destroy` only recognises --backup when it precedes --debug.
#   - the restore script strips "_dirty" while the backup script appends
#     "-dirty", so that sed expression never matches.
#   - mode "0744" on a directory leaves group/other without the search bit.
#
diff --git a/infrastructure/nomad/cluster.sh b/infrastructure/nomad/cluster.sh
index 605520fc7..afab4c8ae 100755
--- a/infrastructure/nomad/cluster.sh
+++ b/infrastructure/nomad/cluster.sh
@@ -10,6 +10,7 @@ no_logs_collection_flag=false
 force_build_templates_flag=false
 skip_certificates_setup_flag=false
 release_flag=false
+backup_flag=false
 deploy_version="HEAD"
 environment_name="devenv"
 profile_name="devnet"
@@ -21,7 +22,7 @@ help() {
     echo "Usage:"
     echo "$0 [init [--environment ] [--skip-certificates-setup] [--debug]]"
     echo "$0 [deploy [version=HEAD] [--environment ] [--profile ] [--force-build-templates] [--no-logs-collection] [--datadog-key ] [--l1-rpc-url ] [--otel-collector-endpoint-url ] [--release] [--debug]]"
-    echo "$0 [destroy [--debug]] [--help]"
+    echo "$0 [destroy [--backup] [--debug]]"
     echo "$0 --help"
     echo
     echo "Parameters:"
@@ -42,7 +43,8 @@ help() {
     echo " --debug Enable debug mode for detailed output."
     echo
     echo " destroy Destroy the whole cluster."
-    echo " --debug Enable debug mode for detailed output."
+    echo " --backup Create a backup before destroying the environment."
+    echo " --debug Enable debug mode for detailed output."
     echo
     echo " --help Display this help message."
     echo
@@ -71,8 +73,8 @@ help() {
     echo " Deploy with a specific version, environment, profile in debug mode with disabled logs collection, Datadog API key, L1 RPC URL, and OpenTememetry Collector Endpoint URL:"
     echo " $0 deploy v0.1.0 --environment devenv --profile testnet --no-logs-collection --datadog-key your_datadog_key --l1-rpc-url your_rpc_url --otel-collector-endpoint-url your_otel_url --debug"
     echo
-    echo " Destroy with specific environment and debug mode:"
-    echo " $0 destroy --environment devenv --debug"
+    echo " Destroy all jobs but backup before do so:"
+    echo " $0 destroy --backup --debug"
     exit 1
 }
 
@@ -80,7 +82,7 @@ usage() {
     echo "Usage:"
     echo "$0 [init [--environment ] [--skip-certificates-setup] [--debug]]"
     echo "$0 [deploy [version=HEAD] [--environment ] [--profile ] [--force-build-templates] [--no-logs-collection] [--datadog-key ] [--l1-rpc-url ] [--otel-collector-endpoint-url ] [--release] [--debug]]"
-    echo "$0 [destroy [--debug]] [--help]"
+    echo "$0 [destroy [--backup] [--debug]]"
     echo "$0 --help"
     exit 1
 }
@@ -243,6 +245,10 @@ parse_args() {
             destroy)
                 destroy_flag=true
                 shift
+                if [[ $# -gt 0 && $1 == "--backup" ]]; then
+                    backup_flag=true
+                    shift
+                fi
                 if [[ $# -gt 0 && $1 == "--debug" ]]; then
                     debug_flag=true
                     shift
@@ -288,6 +294,7 @@ main() {
             ;;
         "${destroy_flag}")
             playbook+="destroy.yml"
+            [[ "${backup_flag}" == true ]] && flags+=("--extra-vars" "backup=true")
             ;;
         *)
             usage
diff --git a/infrastructure/nomad/playbooks/deploy.yml b/infrastructure/nomad/playbooks/deploy.yml
index 9c4f1531a..de606e524 100644
--- a/infrastructure/nomad/playbooks/deploy.yml
+++ b/infrastructure/nomad/playbooks/deploy.yml
@@ -84,7 +84,7 @@
       when: version is not defined or version == '' or release
 
     - name: Set Artifacts Build Version
-      set_fact:
+      ansible.builtin.set_fact:
         build_artifacts: true
         version: "{{ artifacts_build_version.stdout }}"
       when: version is not defined or version == '' or release
@@ -124,11 +124,11 @@
         success_msg: "The profile name is set to: {{ profile }}."
 
     - name: Set Jobs Definition
-      set_fact:
+      ansible.builtin.set_fact:
         jobs: "{{ profiles[profile].jobs }}"
 
     - name: Disable Logs Collection
-      set_fact:
+      ansible.builtin.set_fact:
         jobs: >-
           {{
             jobs
@@ -138,7 +138,7 @@
       when: no_logs_collection | default(false) | bool
 
     - name: Disable OpenTelemetry Trace Collection
-      set_fact:
+      ansible.builtin.set_fact:
         jobs: >-
           {{
             jobs
@@ -166,7 +166,7 @@
       register: existing_environment
 
     - name: Set Existing Scripts Artifact Version as Stale
-      set_fact:
+      ansible.builtin.set_fact:
         build_templates: >-
           {{
             (existing_environment.stdout | from_json) != environments[env]
@@ -187,6 +187,16 @@
           Build Templates: {{ 'yes' if build_templates | default(false) else 'no' }}
 
   tasks:
+    - name: Determine Cluster Status
+      ansible.builtin.shell: |
+        STATUS=$(nomad job status -json)
+        if [ "${STATUS}" != "No running jobs" ]; then
+          echo "Cluster has running jobs."
+          exit 1
+        fi
+      args:
+        executable: bash
+
     - name: Build keystore-generator
       ansible.builtin.shell: |
         BINARY_PATH="{{ dist_dir }}/keystore-generator-{{ environments[env].version }}"
@@ -654,6 +664,7 @@
       ansible.builtin.shell: |
         nomad var purge "nomad/jobs"
         nomad system gc
+        nomad var put -namespace=default "nomad/jobs" MEV_COMMIT_GETH_CHAIN_BACKUP="false"
       args:
         executable: bash
 
diff --git a/infrastructure/nomad/playbooks/destroy.yml b/infrastructure/nomad/playbooks/destroy.yml
index b58151b2a..fe302d0dd 100644
--- a/infrastructure/nomad/playbooks/destroy.yml
+++ b/infrastructure/nomad/playbooks/destroy.yml
@@ -2,20 +2,67 @@
   hosts: nomad_clients
   gather_facts: no
 
+  vars:
+    nomad_vars_path: "nomad/jobs"
+
   tasks:
-    - name: Stop and Purge Jobs
+    - name: Set Backup Var
+      ansible.builtin.shell: |
+        nomad var put -force -namespace=default {{ nomad_vars_path }} MEV_COMMIT_GETH_CHAIN_BACKUP="true"
+      args:
+        executable: bash
+      when: backup is defined and backup
+
+    - name: Stop Jobs
       ansible.builtin.shell: |
-        for job in $(nomad job status -json | jq -r '.[].Summary.JobID'); do
+        NOMAD_JOBS=$(nomad job status -json | jq -r '.[].Summary.JobID')
+        for job in $(echo "${NOMAD_JOBS}" | grep -v artifacts); do
           if [ "${job}" != "null" ]; then
-            nomad stop -purge "${job}"
+            nomad stop "${job}"
+
+            TIMEOUT=600
+            while true; do
+              STATUS=$(nomad job status -json "${job}" | jq -r '.[0].Allocations[0].ClientStatus')
+              case "${STATUS}" in
+                "failed" | "complete")
+                  break
+                  ;;
+                *)
+                  sleep 1
+                  TIMEOUT=$((TIMEOUT - 1))
+                  if [ "${TIMEOUT}" -eq 0 ]; then
+                    echo "Timeout waiting for ${job} to stop has been exceeded."
+                    exit 1
+                  fi
+                  ;;
+              esac
+            done
           fi
         done
+
+        if echo "${NOMAD_JOBS}" | grep -q artifacts; then
+          nomad stop -yes "artifacts"
+        fi
       args:
         executable: bash
 
-    - name: Purge Cluster
+    - name: Purge Stopped Jobs
       ansible.builtin.shell: |
-        nomad var purge "nomad/jobs"
+        TIMEOUT=30
+        while [ "$(nomad job status)" != "No running jobs" ]; do
+          RUNNING_JOBS=$(nomad job status -json | jq '[.[] | select(.Summary.Summary | to_entries[] | select(.value.Running > 0 or .value.Starting > 0)) | .Summary.JobID]')
+          if [ "${RUNNING_JOBS}" = "[]" ]; then
+            break
+          fi
+          sleep 1
+          TIMEOUT=$((TIMEOUT - 1))
+          if [ "${TIMEOUT}" -eq 0 ]; then
+            echo "Timeout waiting for jobs to stop has been exceeded."
+            exit 1
+          fi
+        done
+
+        nomad var purge {{ nomad_vars_path }}
         nomad system gc
       args:
         executable: bash
diff --git a/infrastructure/nomad/playbooks/init.yml b/infrastructure/nomad/playbooks/init.yml
index 4f2335735..b8c3d51d4 100644
--- a/infrastructure/nomad/playbooks/init.yml
+++ b/infrastructure/nomad/playbooks/init.yml
@@ -115,6 +115,15 @@
       become: true
       become_user: "{{ ansible_user }}"
 
+    - name: Ensure "/tmp/{{ env }}" Directory Exists
+      ansible.builtin.file:
+        path: "/tmp/{{ env }}"
+        state: directory
+        mode: "0744"
+        recurse: yes
+      become: true
+      become_user: "{{ ansible_user }}"
+
   tasks:
     - name: Add DataDog Repository Key
       ansible.builtin.apt_key:
diff --git a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2
index f2135b00e..7099164e3 100644
--- a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2
+++ b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2
@@ -36,8 +36,137 @@ job "{{ job.name }}" {
       {% endfor %}
     }
 
+    volume "tmp-volume" {
+      type = "host"
+      source = "tmp-volume"
+      read_only = false
+    }
+
+    task "restore" {
+      driver = "exec"
+
+      lifecycle {
+        hook = "prestart"
+      }
+
+      {% if env == 'testenv' %}
+      resources {
+        cores = 6
+        memory = 32768
+      }
+      {% endif %}
+
+      volume_mount {
+        volume = "tmp-volume"
+        destination = "/local/backups"
+        read_only = false
+      }
+
+      {% if env != 'devenv' %}
+      artifact {
+        source = "https://primev-infrastructure-artifacts.s3.us-west-2.amazonaws.com/genesis_{{ version }}.json"
+      }
+      artifact {
+        source = "https://primev-infrastructure-artifacts.s3.us-west-2.amazonaws.com/mev-commit-geth_{{ version }}_Linux_{{ target_system_architecture }}.tar.gz"
+      }
+      {% else %}
+      artifact {
+        source = "http://{{ ansible_facts['default_ipv4']['address'] }}:1111/genesis_{{ version }}.json"
+      }
+      artifact {
+        source = "http://{{ ansible_facts['default_ipv4']['address'] }}:1111/mev-commit-geth_{{ version }}_Linux_{{ target_system_architecture }}.tar.gz"
+      }
+      {% endif %}
+
+      template {
+        data = <<-EOH
+          {%- raw %}
+          GETH_DATA_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}"
+          {% endraw %}
+          GENESIS_L1_PATH="local/genesis_{{ version }}.json"
+          GETH_LOG_FORMAT="{{ job.env.get('log-format', 'json') }}"
+          GETH_LOG_TAGS="{{
+            job.env['log-tags'].items() | map('join', ':') | join('; ')
+            if job.env['log-tags'] is defined and job.env['log-tags']
+            else 'service:' + job.name + '-{{ env "NOMAD_ALLOC_INDEX" }}'
+          }}"
+        EOH
+        destination = "secrets/.env"
+        env = true
+      }
+
+      template {
+        data = <<-EOH
+          #!/usr/bin/env bash
+
+          {%- raw %}
+          {{- range nomadService "datadog-agent-logs-collector" }}
+          {{ if contains "tcp" .Tags }}
+          exec > >(nc {{ .Address }} {{ .Port }}) 2>&1
+          {{ end }}
+          {{- end }}
+          {% endraw %}
+
+          BACKUP_FILE="local/backups/$(
+            ls -1 local/backups/ |
+            sed -r 's/(.*)_dirty/\1/' |
+            awk -v jobname="{{ job.name }}-{% raw %}{{ env "NOMAD_ALLOC_INDEX" }}{% endraw %}" -F'_' '$2 == jobname {print $3, $0}' |
+            sort -k1,1r |
+            head -n 1 |
+            cut -d' ' -f2
+          )"
+          if [[ ! -f "${BACKUP_FILE}" ]]; then
+            echo "No backup found, skipping restore"
+            exit 0
+          fi
+
+          chmod +x local/mev-commit-geth
+          CHAIN_ID=$(cat "${GENESIS_L1_PATH}" | jq -r .config.chainId)
+
+          if [[ ! -d "${GETH_DATA_DIR}/geth/chaindata" ]]; then
+            echo "Initializing chain data directory"
+            local/mev-commit-geth \
+              --db.engine=pebble \
+              --state.scheme=path \
+              --log.format="${GETH_LOG_FORMAT}" \
+              --log.tags="${GETH_LOG_TAGS}" \
+              --datadir="${GETH_DATA_DIR}" \
+              init "${GENESIS_L1_PATH}"
+          else
+            echo "Chain data directory already exists, skipping restore"
+            exit 0
+          fi
+
+          echo "Restoring from backup file ${BACKUP_FILE} to ${GETH_DATA_DIR}"
+          echo "Backup file size: $(du -h "${BACKUP_FILE}" | cut -f1)"
+          local/mev-commit-geth \
+            --cache=4096 \
+            --verbosity=5 \
+            --log.format="${GETH_LOG_FORMAT}" \
+            --log.tags="${GETH_LOG_TAGS}" \
+            --datadir="${GETH_DATA_DIR}" \
+            import "${BACKUP_FILE}"
+
+          if [[ $? -eq 0 ]]; then
+            echo "Restore successful"
+          else
+            echo "Restore failed"
+            exit 1
+          fi
+        EOH
+        destination = "local/run.sh"
+        perms = "0755"
+      }
+
+      config {
+        command = "bash"
+        args = ["-c", "local/run.sh"]
+      }
+    }
+
     task "node" {
       driver = "exec"
+      kill_timeout = "25s"
 
       {% if profile == 'testnet' %}
       resources {
@@ -79,7 +208,7 @@ job "{{ job.name }}" {
       template {
         data = <<-EOH
           {%- raw %}
-          GETH_DATA_DIR="local/data-{{ env "NOMAD_ALLOC_INDEX" }}"
+          GETH_DATA_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}"
           {% endraw %}
           GENESIS_L1_PATH="local/genesis_{{ version }}.json"
           GETH_BIN_PATH="local/mev-commit-geth"
@@ -102,7 +231,7 @@ job "{{ job.name }}" {
           {% endraw %}
           {% elif job.env['type'] == 'signer' %}
           {%- raw %}
-          GETH_KEYSTORE_DIR="/local/data-{{ env "NOMAD_ALLOC_INDEX" }}/keystore"
+          GETH_KEYSTORE_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}/keystore"
           GETH_KEYSTORE_FILENAME="{{ with secret "secret/data/mev-commit" }}{{ .Data.data.{% endraw %}{{ job.artifacts | selectattr('keystore', 'defined') | map(attribute='keystore.name') | first }}{% raw %}_filename }}{{ end }}"
           GETH_KEYSTORE_PASSWORD="{{ with secret "secret/data/mev-commit" }}{{ .Data.data.{% endraw %}{{ job.artifacts | selectattr('keystore', 'defined') | map(attribute='keystore.name') | first }}{% raw %}_password }}{{ end }}"
           {% endraw %}
@@ -150,7 +279,113 @@ job "{{ job.name }}" {
           {% endif %}
 
           chmod +x local/mev-commit-geth local/entrypoint.sh
-          local/entrypoint.sh
+          exec local/entrypoint.sh
+        EOH
+        destination = "local/run.sh"
+        perms = "0755"
+      }
+
+      config {
+        command = "bash"
+        args = ["-c", "exec local/run.sh"]
+      }
+    }
+
+    task "backup" {
+      driver = "exec"
+
+      lifecycle {
+        hook = "poststop"
+      }
+
+      {% if env == 'testenv' %}
+      resources {
+        cores = 6
+        memory = 32768
+      }
+      {% endif %}
+
+      volume_mount {
+        volume = "tmp-volume"
+        destination = "/local/backups"
+        read_only = false
+      }
+
+      {% if env != 'devenv' %}
+      artifact {
+        source = "https://primev-infrastructure-artifacts.s3.us-west-2.amazonaws.com/mev-commit-geth_{{ version }}_Linux_{{ target_system_architecture }}.tar.gz"
+      }
+      {% else %}
+      artifact {
+        source = "http://{{ ansible_facts['default_ipv4']['address'] }}:1111/mev-commit-geth_{{ version }}_Linux_{{ target_system_architecture }}.tar.gz"
+      }
+      {% endif %}
+
+      template {
+        data = <<-EOH
+          {%- raw %}
+          GETH_DATA_DIR="/alloc/data/node-{{ env "NOMAD_ALLOC_INDEX" }}"
+          {% endraw %}
+          GETH_LOG_FORMAT="{{ job.env.get('log-format', 'json') }}"
+          GETH_LOG_TAGS="{{
+            job.env['log-tags'].items() | map('join', ':') | join('; ')
+            if job.env['log-tags'] is defined and job.env['log-tags']
+            else 'service:' + job.name + '-{{ env "NOMAD_ALLOC_INDEX" }}'
+          }}"
+        EOH
+        destination = "secrets/.env"
+        env = true
+      }
+
+      template {
+        data = <<-EOH
+          #!/usr/bin/env bash
+
+          {%- raw %}
+          {{- range nomadService "datadog-agent-logs-collector" }}
+          {{ if contains "tcp" .Tags }}
+          exec > >(nc {{ .Address }} {{ .Port }}) 2>&1
+          {{ end }}
+          {{- end }}
+          {% endraw %}
+
+          {% raw %}
+          {{ with nomadVar "nomad/jobs" }}
+          BACKUP="{{ .MEV_COMMIT_GETH_CHAIN_BACKUP }}"
+          {{ end }}
+          {% endraw %}
+
+          if [[ "${BACKUP}" != "true" ]]; then
+            echo "Backup not requested, skipping..."
+            exit 0
+          fi
+
+          BACKUP_FILE="local/backups/{{ version }}_{{ job.name }}-{% raw %}{{ env "NOMAD_ALLOC_INDEX" }}{% endraw %}_$(date +%Y%m%d%H%M%S)"
+          STATUS=$(nomad alloc status -address="http://{{ ansible_facts['default_ipv4']['address'] }}:4646" -json "${NOMAD_ALLOC_ID}")
+          NON_ZERO_EXIT_EVENTS=$(echo "$STATUS" | jq -r '.TaskStates.node.Events[] | select(.ExitCode != 0)')
+          if [[ -n "${NON_ZERO_EXIT_EVENTS}" ]]; then
+            echo "The main task did not start or finish gracefully"
+            BACKUP_FILE+="-dirty"
+          fi
+          BACKUP_FILE+=".rlp"
+
+          echo "Exporting chain data from ${GETH_DATA_DIR} to backup file: ${BACKUP_FILE}"
+          chmod +x local/mev-commit-geth
+          local/mev-commit-geth \
+            --cache 4096 \
+            --verbosity=5 \
+            --log.format="$GETH_LOG_FORMAT" \
+            --log.tags="$GETH_LOG_TAGS" \
+            --datadir="${GETH_DATA_DIR}" \
+            export "${BACKUP_FILE}"
+
+          if [[ "$?" -eq 0 ]] && [[ -f "${BACKUP_FILE}" ]]; then
+            echo "Backup successful"
+            echo "Backup file size: $(du -h "${BACKUP_FILE}" | cut -f1)"
+          else
+            echo "Backup failed"
+            exit 1
+          fi
         EOH
         destination = "local/run.sh"
         perms = "0755"
diff --git a/infrastructure/nomad/playbooks/templates/services/nomad.hcl.j2 b/infrastructure/nomad/playbooks/templates/services/nomad.hcl.j2
index 78c5fdca6..f45abf2e6 100644
--- a/infrastructure/nomad/playbooks/templates/services/nomad.hcl.j2
+++ b/infrastructure/nomad/playbooks/templates/services/nomad.hcl.j2
@@ -28,6 +28,9 @@ client {
   artifact {
     decompression_file_count_limit = 8192
   }
+  host_volume "tmp-volume" {
+    path = "/tmp/{{ env }}"
+  }
   {% if env == "devenv" %}
   host_volume "artifacts-volume" {
     path = "{{ ansible_user_home }}/{{ env }}/artifacts"