From 003111e33dfc677eb734a4ee73a57283a9ea59f2 Mon Sep 17 00:00:00 2001 From: mrekucci Date: Mon, 8 Jul 2024 15:34:34 +0200 Subject: [PATCH] feat(infra): fail fast on first deployment error --- .github/workflows/infrastructure.yml | 5 +- infrastructure/nomad/ansible.cfg.example | 5 +- infrastructure/nomad/playbooks/deploy.yml | 59 +++---------------- .../templates/jobs/artifacts.nomad.j2 | 10 ++++ .../jobs/contracts-deployer.nomad.j2 | 36 ++++++++++- .../templates/jobs/datadog-agent.nomad.j2 | 12 ++++ .../templates/jobs/mev-commit-bridge.nomad.j2 | 12 ++++ .../jobs/mev-commit-emulator.nomad.j2 | 12 ++++ .../templates/jobs/mev-commit-faucet.nomad.j2 | 12 ++++ .../templates/jobs/mev-commit-funder.nomad.j2 | 12 ++++ .../templates/jobs/mev-commit-geth.nomad.j2 | 12 ++++ .../templates/jobs/mev-commit-oracle.nomad.j2 | 36 ++++++++--- .../templates/jobs/mev-commit.nomad.j2 | 12 ++++ p2p/examples/provideremulator/main.go | 2 - p2p/integrationtest/provider/main.go | 2 - 15 files changed, 169 insertions(+), 70 deletions(-) diff --git a/.github/workflows/infrastructure.yml b/.github/workflows/infrastructure.yml index 8f616bc91..9517b1b87 100644 --- a/.github/workflows/infrastructure.yml +++ b/.github/workflows/infrastructure.yml @@ -143,8 +143,9 @@ jobs: chmod 700 ~/.ssh cat <<-EOH >> ~/.ssh/config Host * - ControlMaster auto - ControlPath /tmp/ssh_mux_%h_%p_%r + ControlMaster no + ControlPath none + ForwardAgent yes ControlPersist 60m EOH diff --git a/infrastructure/nomad/ansible.cfg.example b/infrastructure/nomad/ansible.cfg.example index ca3b403da..a829d22bb 100644 --- a/infrastructure/nomad/ansible.cfg.example +++ b/infrastructure/nomad/ansible.cfg.example @@ -4,7 +4,7 @@ stdout_callback = yaml callbacks_enabled = profile_tasks forks = 50 retry_files_enabled = False -timeout = 10 +timeout = 600 connection = ssh pipelining = True host_key_checking = False @@ -16,6 +16,3 @@ module_defaults: gather_subset: - '!all' - 'min' - -[ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPersist=60m -o ControlPath=/tmp/ssh_mux_%h_%p_%r -o ForwardAgent=yes diff --git a/infrastructure/nomad/playbooks/deploy.yml b/infrastructure/nomad/playbooks/deploy.yml index e52083f22..555505f89 100644 --- a/infrastructure/nomad/playbooks/deploy.yml +++ b/infrastructure/nomad/playbooks/deploy.yml @@ -627,57 +627,11 @@ - name: Deploy Jobs ansible.builtin.shell: | + set -x + START_TIME="$(date +%s)" nomad run {{ ansible_env.HOME }}/{{ env }}/{{ job.name }}.nomad - - TIMEOUT={% if profile == 'ci' %}600{% else %}300{% endif %} - START_TIME=$(date +%s) - RESULT=$(nomad job status -json "{{ job.name }}") - if [ $? -ne 0 ]; then - echo "Failed to get job status for {{ job.name }}:" - echo "${RESULT}" - exit 1 - fi - JOB_TYPE=$(echo "${RESULT}" | jq -r '.[0].Allocations[0].JobType') - - while true; do - STATUS=$(echo "${RESULT}" | jq -r '.[0].Allocations[0].ClientStatus') - - case "${JOB_TYPE}" in - service) - if [ "${STATUS}" = "running" ]; then - break - fi - ;; - batch) - if [ "${STATUS}" = "complete" ]; then - break - fi - ;; - *) - {% if env != 'devenv' %} - break - {% else %} - echo "Unknown job type: ${JOB_TYPE}" - exit 1 - {% endif %} - ;; - esac - - CURRENT_TIME="$(date +%s)" - ELAPSED_TIME="$(( CURRENT_TIME - START_TIME ))" - if [ ${ELAPSED_TIME} -ge ${TIMEOUT} ]; then - echo "Deploy timed out for {{ job.name }}, current status: ${STATUS}" - exit 1 - fi - - sleep 1 - RESULT=$(nomad job status -json "{{ job.name }}") - if [ $? -ne 0 ]; then - echo "Failed to get job status for {{ job.name }}:" - echo "${RESULT}" - exit 1 - fi - done + END_TIME="$(date +%s)" + echo "Deployment of {{ job.name }} took $(date -ud "@$((END_TIME - START_TIME))" +'%H:%M:%S')" args: executable: bash loop: "{{ jobs }}" @@ -685,8 +639,9 @@ label: "{{ item.name }}" vars: job: "{{ item }}" - register: result - failed_when: result.rc != 0 + register: job_result + failed_when: job_result.rc != 0 + when: job_result is not defined or job_result.rc == 0 - name: Post Deployment Info ansible.builtin.debug: diff --git a/infrastructure/nomad/playbooks/templates/jobs/artifacts.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/artifacts.nomad.j2 index eaf875527..a5a30ec05 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/artifacts.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/artifacts.nomad.j2 @@ -45,6 +45,16 @@ job "{% if env != 'devenv' %}{{ environments[env].version }}{% else %}artifacts- group "artifacts-group" { count = 1 + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + network { mode = "bridge" diff --git a/infrastructure/nomad/playbooks/templates/jobs/contracts-deployer.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/contracts-deployer.nomad.j2 index 6f75af951..afaadb51c 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/contracts-deployer.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/contracts-deployer.nomad.j2 @@ -5,6 +5,32 @@ job "{{ job.name }}" { group "{{ job.name }}-group" { count = {{ job.count }} + # This is a special case for CI because the runner machine is not very + # powerful and compiling and deploying contracts can take a long time. + {% if env == 'devenv' and profile == 'ci' %} + update { + healthy_deadline = "20m" + progress_deadline = "30m" + } + {% else %} + update { + healthy_deadline = "10m" + progress_deadline = "15m" + } + {% endif %} + + {% if env == 'devenv' %} + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + {% endif %} + network { dns { servers = {{ (ansible_facts['dns']['nameservers'] + ['1.1.1.1']) | tojson }} @@ -31,6 +57,14 @@ job "{{ job.name }}" { port = "{{ port_name }}" tags = ["{{ port_name }}"] provider = "nomad" + + check { + type = "http" + path = "/" + port = "{{ port_name }}" + interval = "10s" + timeout = "5s" + } } {% endfor %} @@ -145,7 +179,7 @@ job "{{ job.name }}" { echo "Failed to transfer ownership!" exit 1 fi - echo "Ownership transfered successfully." + echo "Ownership transferred successfully." python3 -m http.server {{ job.ports[0]['http']['static'] }} --directory /local/www # endtodo diff --git a/infrastructure/nomad/playbooks/templates/jobs/datadog-agent.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/datadog-agent.nomad.j2 index 6d6756db9..0f9cf834f 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/datadog-agent.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/datadog-agent.nomad.j2 @@ -5,6 +5,18 @@ job "{{ job.name }}" { group "{{ job.name }}-group" { count = {{ job.count }} + {% if env == 'devenv' %} + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + {% endif %} + network { mode = "bridge" diff --git a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-bridge.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-bridge.nomad.j2 index d6ebbbe4d..d3daeded1 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-bridge.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-bridge.nomad.j2 @@ -5,6 +5,18 @@ job "{{ job.name }}" { group "{{ job.name }}-group" { count = {{ job.count }} + {% if env == 'devenv' %} + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + {% endif %} + network { mode = "bridge" diff --git a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-emulator.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-emulator.nomad.j2 index 1a387c401..afa29d354 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-emulator.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-emulator.nomad.j2 @@ -5,6 +5,18 @@ job "{{ job.name }}" { group "{{ job.name }}-group" { count = {{ job.count }} + {% if env == 'devenv' %} + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + {% endif %} + network { mode = "bridge" diff --git a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-faucet.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-faucet.nomad.j2 index 674abb253..4d4c58479 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-faucet.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-faucet.nomad.j2 @@ -5,6 +5,18 @@ job "{{ job.name }}" { group "{{ job.name }}-group" { count = {{ job.count }} + {% if env == 'devenv' %} + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + {% endif %} + network { mode = "bridge" diff --git a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-funder.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-funder.nomad.j2 index f7f18ec36..6f86b01b5 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-funder.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-funder.nomad.j2 @@ -6,6 +6,18 @@ job "{{ job.name }}" { group "{{ job.name }}-group" { count = {{ job.count }} + {% if env == 'devenv' %} + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + {% endif %} + network { dns { servers = {{ (ansible_facts['dns']['nameservers'] + ['1.1.1.1']) | tojson }} diff --git a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2 index 2f04b0399..c78d8c3ee 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-geth.nomad.j2 @@ -5,6 +5,18 @@ job "{{ job.name }}" { group "{{ job.name }}-group" { count = {{ job.count }} + {% if env == 'devenv' %} + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + {% endif %} + network { mode = "bridge" diff --git a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-oracle.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-oracle.nomad.j2 index 2f232f817..82f45f2b0 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/mev-commit-oracle.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/mev-commit-oracle.nomad.j2 @@ -5,6 +5,18 @@ job "{{ job.name }}" { group "{{ job.name }}-group" { count = {{ job.count }} + {% if env == 'devenv' %} + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + {% endif %} + network { mode = "bridge" @@ -87,16 +99,15 @@ job "{{ job.name }}" { mkdir -p /var/run/postgresql > /dev/null 2>&1 pg_ctl initdb --silent --pgdata="${POSTGRES_DATA}" if [ $? -ne 0 ]; then - echo "Failed to initialize database." + echo "Failed to initialize PostgreSQL." exit 1 fi cp /alloc/data/postgres.env "${POSTGRES_DATA}/.env" - postgres -D "${POSTGRES_DATA}" & - pid=$! - if ! timeout 5m bash -c 'until pg_ctl status --pgdata="${POSTGRES_DATA}" --silent --no-wait; do sleep 1; done'; then - echo "Waiting for PostgreSQL to start..." - sleep 1 + pg_ctl start --pgdata="${POSTGRES_DATA}" --silent --wait --timeout=300 > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "Failed to start PostgreSQL." + exit 1 fi createuser --superuser postgres > /dev/null 2>&1 @@ -109,7 +120,13 @@ job "{{ job.name }}" { GRANT ALL PRIVILEGES ON DATABASE ${POSTGRES_DB} TO ${POSTGRES_USERNAME};" echo "Database initialized and configured successfully." - wait $pid + pg_ctl stop --pgdata="${POSTGRES_DATA}" --silent --wait --timeout=300 > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "Failed to stop PostgreSQL." + exit 1 + fi + + postgres -D "${POSTGRES_DATA}" {% endraw %} EOH destination = "local/run.sh" @@ -304,6 +321,11 @@ job "{{ job.name }}" { export MEV_ORACLE_PG_PASSWORD="${POSTGRES_PASSWORD}" export MEV_ORACLE_PG_DBNAME="${POSTGRES_DB}" + if ! timeout 5m bash -c 'until pg_isready -h ${MEV_ORACLE_PG_HOST} -p ${MEV_ORACLE_PG_PORT} -U ${MEV_ORACLE_PG_USER} -d ${MEV_ORACLE_PG_DBNAME}; do sleep 1; done'; then + echo "Waiting for PostgreSQL to start..." + sleep 1 + fi + chmod +x local/mev-commit-oracle local/mev-commit-oracle start {% endraw %} diff --git a/infrastructure/nomad/playbooks/templates/jobs/mev-commit.nomad.j2 b/infrastructure/nomad/playbooks/templates/jobs/mev-commit.nomad.j2 index c10eeefc9..df0f366ad 100644 --- a/infrastructure/nomad/playbooks/templates/jobs/mev-commit.nomad.j2 +++ b/infrastructure/nomad/playbooks/templates/jobs/mev-commit.nomad.j2 @@ -5,6 +5,18 @@ job "{{ job.name }}" { group "{{ job.name }}-group" { count = {{ job.count }} + {% if env == 'devenv' %} + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + attempts = 0 + unlimited = false + } + {% endif %} + network { mode = "bridge" diff --git a/p2p/examples/provideremulator/main.go b/p2p/examples/provideremulator/main.go index 329279481..2820064d1 100644 --- a/p2p/examples/provideremulator/main.go +++ b/p2p/examples/provideremulator/main.go @@ -54,8 +54,6 @@ func main() { return } - fmt.Printf("connected to provider %s, receiving bids...\n", *serverAddr) - for { select { case bid, more := <-bidS: diff --git a/p2p/integrationtest/provider/main.go b/p2p/integrationtest/provider/main.go index 2e55ca74c..01a0bba6d 100644 --- a/p2p/integrationtest/provider/main.go +++ b/p2p/integrationtest/provider/main.go @@ -131,8 +131,6 @@ func main() { return } - fmt.Printf("connected to provider %s, receiving bids...\n", *serverAddr) - for bid := range bidS { receivedBids.Inc() buf, err := json.Marshal(bid)