Skip to content

Commit

Permalink
feat(infra): fail fast on first deployment error
Browse files Browse the repository at this point in the history
  • Loading branch information
mrekucci committed Jul 18, 2024
1 parent 38a1173 commit d40be37
Show file tree
Hide file tree
Showing 15 changed files with 175 additions and 20 deletions.
5 changes: 2 additions & 3 deletions .github/workflows/infrastructure.yml
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,8 @@ jobs:
chmod 700 ~/.ssh
cat <<-EOH >> ~/.ssh/config
Host *
ControlMaster auto
ControlPath /tmp/ssh_mux_%h_%p_%r
ControlPersist 60m
ControlMaster no
ControlPath none
EOH
ANSIBLE_USER="${USER}"
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/nomad/ansible.cfg.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ stdout_callback = yaml
callbacks_enabled = profile_tasks
forks = 50
retry_files_enabled = False
timeout = 10
timeout = 600
connection = ssh
pipelining = True
host_key_checking = False
Expand Down
18 changes: 14 additions & 4 deletions infrastructure/nomad/playbooks/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -627,9 +627,13 @@

- name: Deploy Jobs
ansible.builtin.shell: |
set -x
START_TIME="$(date +%s)"
nomad run {{ ansible_env.HOME }}/{{ env }}/{{ job.name }}.nomad
END_TIME="$(date +%s)"
echo "Deployment of {{ job.name }} took $(date -ud "@$((END_TIME - START_TIME))" +'%H:%M:%S')"
TIMEOUT={% if profile == 'ci' %}600{% else %}300{% endif %}
TIMEOUT=600
START_TIME=$(date +%s)
RESULT=$(nomad job status -json "{{ job.name }}")
if [ $? -ne 0 ]; then
Expand All @@ -641,6 +645,11 @@
while true; do
STATUS=$(echo "${RESULT}" | jq -r '.[0].Allocations[0].ClientStatus')
if [ "${STATUS}" = "failed" ]; then
echo "Deployment failed for {{ job.name }}:"
echo "${RESULT}"
exit 1
fi
case "${JOB_TYPE}" in
service)
Expand All @@ -666,7 +675,7 @@
CURRENT_TIME="$(date +%s)"
ELAPSED_TIME="$(( CURRENT_TIME - START_TIME ))"
if [ ${ELAPSED_TIME} -ge ${TIMEOUT} ]; then
echo "Deploy timed out for {{ job.name }}, current status: ${STATUS}"
echo "Deployment timed out for {{ job.name }}, current status: ${STATUS}"
exit 1
fi
Expand All @@ -685,8 +694,9 @@
label: "{{ item.name }}"
vars:
job: "{{ item }}"
register: result
failed_when: result.rc != 0
register: job_result
failed_when: job_result.rc != 0
when: job_result is not defined or job_result.rc == 0

- name: Post Deployment Info
ansible.builtin.debug:
Expand Down
10 changes: 10 additions & 0 deletions infrastructure/nomad/playbooks/templates/jobs/artifacts.nomad.j2
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@ job "{% if env != 'devenv' %}{{ environments[env].version }}{% else %}artifacts-
group "artifacts-group" {
count = 1

restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}

network {
mode = "bridge"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,32 @@ job "{{ job.name }}" {
group "{{ job.name }}-group" {
count = {{ job.count }}

# This is a special case for CI because the runner machine is not very
# powerful and compiling and deploying contracts can take a long time.
{% if env == 'devenv' and profile == 'ci' %}
update {
healthy_deadline = "20m"
progress_deadline = "30m"
}
{% else %}
update {
healthy_deadline = "10m"
progress_deadline = "15m"
}
{% endif %}

{% if env == 'devenv' %}
restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}
{% endif %}

network {
dns {
servers = {{ (ansible_facts['dns']['nameservers'] + ['1.1.1.1']) | tojson }}
Expand All @@ -31,6 +57,14 @@ job "{{ job.name }}" {
port = "{{ port_name }}"
tags = ["{{ port_name }}"]
provider = "nomad"

check {
type = "http"
path = "/"
port = "{{ port_name }}"
interval = "10s"
timeout = "5s"
}
}
{% endfor %}

Expand Down Expand Up @@ -145,7 +179,7 @@ job "{{ job.name }}" {
echo "Failed to transfer ownership!"
exit 1
fi
echo "Ownership transfered successfully."
echo "Ownership transferred successfully."

python3 -m http.server {{ job.ports[0]['http']['static'] }} --directory /local/www
# endtodo
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ job "{{ job.name }}" {
group "{{ job.name }}-group" {
count = {{ job.count }}

{% if env == 'devenv' %}
restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}
{% endif %}

network {
mode = "bridge"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ job "{{ job.name }}" {
group "{{ job.name }}-group" {
count = {{ job.count }}

{% if env == 'devenv' %}
restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}
{% endif %}

network {
mode = "bridge"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ job "{{ job.name }}" {
group "{{ job.name }}-group" {
count = {{ job.count }}

{% if env == 'devenv' %}
restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}
{% endif %}

network {
mode = "bridge"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ job "{{ job.name }}" {
group "{{ job.name }}-group" {
count = {{ job.count }}

{% if env == 'devenv' %}
restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}
{% endif %}

network {
mode = "bridge"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@ job "{{ job.name }}" {
group "{{ job.name }}-group" {
count = {{ job.count }}

{% if env == 'devenv' %}
restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}
{% endif %}

network {
dns {
servers = {{ (ansible_facts['dns']['nameservers'] + ['1.1.1.1']) | tojson }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ job "{{ job.name }}" {
group "{{ job.name }}-group" {
count = {{ job.count }}

{% if env == 'devenv' %}
restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}
{% endif %}

network {
mode = "bridge"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ job "{{ job.name }}" {
group "{{ job.name }}-group" {
count = {{ job.count }}

{% if env == 'devenv' %}
restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}
{% endif %}

network {
mode = "bridge"

Expand Down Expand Up @@ -87,16 +99,15 @@ job "{{ job.name }}" {
mkdir -p /var/run/postgresql > /dev/null 2>&1
pg_ctl initdb --silent --pgdata="${POSTGRES_DATA}"
if [ $? -ne 0 ]; then
echo "Failed to initialize database."
echo "Failed to initialize PostgreSQL."
exit 1
fi
cp /alloc/data/postgres.env "${POSTGRES_DATA}/.env"
postgres -D "${POSTGRES_DATA}" &
pid=$!

if ! timeout 5m bash -c 'until pg_ctl status --pgdata="${POSTGRES_DATA}" --silent --no-wait; do sleep 1; done'; then
echo "Waiting for PostgreSQL to start..."
sleep 1
pg_ctl start --pgdata="${POSTGRES_DATA}" --silent --wait --timeout=300 > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "Failed to start PostgreSQL."
exit 1
fi

createuser --superuser postgres > /dev/null 2>&1
Expand All @@ -109,7 +120,13 @@ job "{{ job.name }}" {
GRANT ALL PRIVILEGES ON DATABASE ${POSTGRES_DB} TO ${POSTGRES_USERNAME};"
echo "Database initialized and configured successfully."

wait $pid
pg_ctl stop --pgdata="${POSTGRES_DATA}" --silent --wait --timeout=300 > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "Failed to stop PostgreSQL."
exit 1
fi

postgres -D "${POSTGRES_DATA}"
{% endraw %}
EOH
destination = "local/run.sh"
Expand Down Expand Up @@ -304,6 +321,11 @@ job "{{ job.name }}" {
export MEV_ORACLE_PG_PASSWORD="${POSTGRES_PASSWORD}"
export MEV_ORACLE_PG_DBNAME="${POSTGRES_DB}"

# Block until PostgreSQL accepts connections. If it is still unreachable
# after 5 minutes, abort the task instead of launching the oracle without
# a database, so the failure surfaces immediately.
echo "Waiting for PostgreSQL to start..."
if ! timeout 5m bash -c 'until pg_isready -h ${MEV_ORACLE_PG_HOST} -p ${MEV_ORACLE_PG_PORT} -U ${MEV_ORACLE_PG_USER} -d ${MEV_ORACLE_PG_DBNAME}; do sleep 1; done'; then
  echo "Timed out waiting for PostgreSQL to start."
  exit 1
fi

chmod +x local/mev-commit-oracle
local/mev-commit-oracle start
{% endraw %}
Expand Down
12 changes: 12 additions & 0 deletions infrastructure/nomad/playbooks/templates/jobs/mev-commit.nomad.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ job "{{ job.name }}" {
group "{{ job.name }}-group" {
count = {{ job.count }}

{% if env == 'devenv' %}
restart {
attempts = 0
mode = "fail"
}

reschedule {
attempts = 0
unlimited = false
}
{% endif %}

network {
mode = "bridge"

Expand Down
2 changes: 0 additions & 2 deletions p2p/examples/provideremulator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,6 @@ func main() {
return
}

fmt.Printf("connected to provider %s, receiving bids...\n", *serverAddr)

for {
select {
case bid, more := <-bidS:
Expand Down
2 changes: 0 additions & 2 deletions p2p/integrationtest/provider/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,6 @@ func main() {
return
}

fmt.Printf("connected to provider %s, receiving bids...\n", *serverAddr)

for bid := range bidS {
receivedBids.Inc()
buf, err := json.Marshal(bid)
Expand Down

0 comments on commit d40be37

Please sign in to comment.