Skip to content

infrastructure

infrastructure #292

# Workflow definition: triggers, manual-dispatch inputs, permissions,
# concurrency and the preamble of the single `cluster` job.
name: infrastructure
on:
  # Reusable entry point; no inputs are declared for this trigger.
  workflow_call:
  # Manual entry point; all inputs below belong to this trigger.
  workflow_dispatch:
    inputs:
      profile:
        description: 'Profile'
        type: choice
        options:
          - devnet
          - testnet
          - stressnet
          - manual-test
        default: 'devnet'
      all_targets:
        description: 'All Arch & Os Targets'
        type: boolean
        default: false
      debug:
        description: 'Debug Deployment'
        type: boolean
        default: false
      logs:
        description: 'Collect Logs'
        type: boolean
        default: false
      target_machine:
        description: 'Target Machine'
        type: choice
        options:
          - lax1
          - chi1
          - mia2
          - mia3
        # Must be one of the options above; the previous value 'lax' was not.
        default: 'lax1'
permissions:
  contents: read
concurrency:
  # One run at a time per workflow, ref and target machine; a newer run
  # cancels the one in progress.
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event.inputs.target_machine }}
  cancel-in-progress: true
jobs:
  cluster:
    name: Setup and Test Nomad Cluster
    runs-on: ubuntu-24.04
    timeout-minutes: 180
    steps:
# Logs the runner's CPU and memory details at the top of the job output.
- name: Print System Information
  run: |
    printf 'CPU INFO:\n'
    lscpu
    printf '\nMEMORY INFO:\n'
    free -h
# Works out whether this is a manual (workflow_dispatch) run, derives the
# cluster.sh flags and the target address from that, and exports everything
# through GITHUB_ENV so later steps can read it.
- name: Setup Environment
  run: |
    RUNNER_START_TIME="$(date +%s)"

    # Append the DNS records to /etc/hosts and restart the resolver
    # (presumably so the new records are served to `dig` below - confirm).
    echo "${{ secrets.INFRASTRUCTURE_DNS_RECORDS }}" | sudo tee -a /etc/hosts
    sudo systemctl restart systemd-resolved

    # Flags that do not depend on the trigger.
    CLUSTER_ENVIRONMENT_FLAG="--environment devenv"
    CLUSTER_L1_RPC_URL_FLAG="--l1-rpc-url ${{ secrets.L1_RPC_URL }}"

    if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
      IS_MANUAL_DEPLOYMENT=true
      CLUSTER_PROFILE_FLAG="--profile ${{ github.event.inputs.profile }}"
      # Command substitution trims any trailing newline carried by the secret.
      CLUSTER_DATADOG_KEY_FLAG="$(echo "--datadog-key ${{ secrets.DATADOG_API_KEY }}")"
      # `|| true`: a failed lookup yields an empty address instead of
      # aborting this step.
      TARGET_MACHINE_IP="$(dig +short ${{ github.event.inputs.target_machine }} || true)"
    else
      IS_MANUAL_DEPLOYMENT=false
      CLUSTER_PROFILE_FLAG="--profile ci"
      CLUSTER_DATADOG_KEY_FLAG=""
      TARGET_MACHINE_IP="127.0.0.1"
    fi

    # Log collection is switched off only when the input is literally "false";
    # an empty input (non-dispatch run) leaves it on.
    if [ "${{ github.event.inputs.logs }}" == "false" ]; then
      CLUSTER_LOGS_FLAG="--no-logs-collection"
    else
      CLUSTER_LOGS_FLAG=""
    fi

    if [ "${{ github.event.inputs.debug }}" == "true" ]; then
      CLUSTER_DEBUG_FLAG="--debug"
    else
      CLUSTER_DEBUG_FLAG=""
    fi

    {
      echo "RUNNER_START_TIME=${RUNNER_START_TIME}"
      echo "IS_MANUAL_DEPLOYMENT=${IS_MANUAL_DEPLOYMENT}"
      echo "CLUSTER_ENVIRONMENT_FLAG=${CLUSTER_ENVIRONMENT_FLAG}"
      echo "CLUSTER_PROFILE_FLAG=${CLUSTER_PROFILE_FLAG}"
      echo "CLUSTER_LOGS_FLAG=${CLUSTER_LOGS_FLAG}"
      echo "CLUSTER_DATADOG_KEY_FLAG=${CLUSTER_DATADOG_KEY_FLAG}"
      echo "CLUSTER_L1_RPC_URL_FLAG=${CLUSTER_L1_RPC_URL_FLAG}"
      echo "CLUSTER_DEBUG_FLAG=${CLUSTER_DEBUG_FLAG}"
      echo "TARGET_MACHINE_IP=${TARGET_MACHINE_IP}"
    } >> "${GITHUB_ENV}"
# Manual runs only: posts a "deployment initialized" message, with a link to
# this workflow run, to the webhook stored in SLACK_CI_CHANNEL_WEBHOOK_URL.
- name: Notify - Deployment Initialized
  if: ${{ env.IS_MANUAL_DEPLOYMENT == 'true' }}
  run: |
    WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
    # The payload must be strict JSON, so the last element of "fields" has no
    # trailing comma. The EOH terminator stays at the script's left margin
    # because `<<-` strips leading tabs only, not spaces.
    PAYLOAD=$(cat <<-EOH
    {
      "text": "<@${{ github.actor }}> - deployment to <http://${TARGET_MACHINE_IP}:4646/ui|*${{ github.event.inputs.target_machine }}*> has been initialized",
      "attachments": [
        {
          "color": "#0000FF",
          "fields": [
            {
              "title": "Workflow",
              "value": "<${WORKFLOW_URL}|View Workflow Run>",
              "short": false
            }
          ]
        }
      ]
    }
    EOH
    )
    curl -X POST -H 'Content-type: application/json' --data "${PAYLOAD}" "${{ secrets.SLACK_CI_CHANNEL_WEBHOOK_URL }}"
# Checks out the repository together with all nested submodules.
- name: Checkout Code
  uses: actions/checkout@v4
  with:
    # First non-empty of: the triggering workflow_run's head branch, a
    # `branch` dispatch input, or the ref this run was started on.
    # NOTE(review): no `branch` input is declared under workflow_dispatch in
    # this file, so that middle operand is presumably always empty - confirm.
    ref: ${{ github.event.workflow_run.head_branch || github.event.inputs.branch || github.ref }}
    submodules: recursive
# Restores and saves the Go module cache, Go build cache, pipx directory and
# the goreleaser binary, keyed on the hash of every go.work.sum in the tree.
- name: Setup Cache
  uses: actions/cache@v4
  with:
    key: ${{ runner.os }}-go-${{ hashFiles('**/go.work.sum') }}
    # Fallback prefix used when the exact key has no match.
    restore-keys: ${{ runner.os }}-go-
    path: |
      ~/go/pkg/mod
      ~/.cache/go-build
      ~/.local/pipx
      /usr/local/bin/goreleaser
# Installs the Go toolchain, checking for the newest matching release.
- name: Setup Go
  uses: actions/setup-go@v5
  with:
    # Quoted so YAML keeps it a string; an unquoted version such as 1.20
    # would be read as the number 1.2.
    go-version: '1.22'
    check-latest: true
    cache-dependency-path: go.work.sum
# Installs goreleaser and the ethereum package via apt, creates a Python
# virtualenv with boto3/botocore, and injects the same libraries into the
# pipx-managed ansible-core.
- name: Install Required Dependencies
  # NOTE(review): no step visible in this job declares `id: cache`, so
  # `steps.cache.outputs.cache-hit` presumably evaluates to empty and this
  # condition is always true - confirm whether "Setup Cache" was meant to
  # carry that id before relying on the skip.
  if: steps.cache.outputs.cache-hit != 'true'
  run: |
    # Register the extra apt sources, then install from them.
    echo 'deb [trusted=yes] https://repo.goreleaser.com/apt/ /' | sudo tee /etc/apt/sources.list.d/goreleaser.list
    sudo add-apt-repository --yes ppa:ethereum/ethereum
    sudo apt-get update
    sudo apt-get install --yes goreleaser ethereum

    # Python tooling: a local virtualenv plus the ansible-core pipx environment.
    python3 -m venv primevenv
    source primevenv/bin/activate
    pip install boto3 botocore
    pipx inject ansible-core botocore boto3
# Prepares Ansible on the runner: SSH key (manual runs only), AWS credentials,
# ansible.cfg and a one-host inventory. It then checks the target is reachable
# and, unless all targets were requested, records its OS/architecture for the
# artifact build.
- name: Configure Control Machine
  run: |
    # Defaults for a non-manual run: Ansible acts on the runner itself.
    ANSIBLE_USER="${USER}"
    ANSIBLE_CONNECTION="ansible_connection=local"
    if [ "${IS_MANUAL_DEPLOYMENT}" == "true" ]; then
      # Manual run: connect to the remote target as `ubuntu` with the deploy key.
      ANSIBLE_USER="ubuntu"
      ANSIBLE_CONNECTION=""
      export ANSIBLE_HOST_KEY_CHECKING=false
      mkdir -p ~/.ssh
      chmod 700 ~/.ssh
      echo "${{ secrets.INFRASTRUCTURE_DEPLOYMENT_KEY }}" > ~/.ssh/id_ed25519
      chmod 600 ~/.ssh/id_ed25519
    fi
    aws configure set aws_access_key_id ${{ secrets.AWS_ACCESS_KEY_ID }}
    aws configure set aws_secret_access_key ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    aws configure set default.region us-west-2
    cp ansible.cfg.example ansible.cfg
    # The same machine is listed as both Nomad server and Nomad client.
    cat <<-EOH > hosts.ini
    [nomad_servers]
    ${TARGET_MACHINE_IP} ${ANSIBLE_CONNECTION} ansible_user=${ANSIBLE_USER}
    [nomad_clients]
    ${TARGET_MACHINE_IP} ${ANSIBLE_CONNECTION} ansible_user=${ANSIBLE_USER}
    EOH
    # Run the probe inside the `if` condition: the step's default shell is
    # `bash -e`, which would otherwise abort on the failed assignment before
    # the captured output could be printed.
    if ! STDOUT="$(ansible all -o -m command -a 'uname -sm' 2>&1)"; then
      echo "Unable to connect to target machine: ${STDOUT}"
      exit 1
    fi
    if [ "${IS_MANUAL_DEPLOYMENT}" == "false" ] || [ "${{ github.event.inputs.all_targets }}" == "false" ]; then
      # Keep only the text after "stdout) " in Ansible's one-line output,
      # i.e. the `uname -sm` result: "<kernel> <machine>".
      STDOUT="$(echo "${STDOUT}" | awk -F 'stdout\\) ' '{print $2}')"
      OS="$(echo "${STDOUT}" | awk '{print $1}' | tr '[:upper:]' '[:lower:]')"
      ARCH="$(echo "${STDOUT}" | awk '{print $2}' | tr '[:upper:]' '[:lower:]')"
      # Translate uname's x86_64 to the name exported as ARTIFACTS_GOARCH.
      if [ "${ARCH}" == "x86_64" ]; then
        ARCH="amd64"
      fi
      echo "ARTIFACTS_GOOS=${OS}" >> "${GITHUB_ENV}"
      echo "ARTIFACTS_GOARCH=${ARCH}" >> "${GITHUB_ENV}"
    fi
  working-directory: infrastructure/nomad
# Manual runs only: runs `cluster.sh destroy` before deploying.
- name: Destroy Existing Cluster
  if: ${{ env.IS_MANUAL_DEPLOYMENT == 'true' }}
  working-directory: infrastructure/nomad
  run: |
    ./cluster.sh destroy ${CLUSTER_DEBUG_FLAG}
# Non-manual runs only: runs `cluster.sh init` and exports how long it took
# as INIT_DURATION (HH:MM:SS).
- name: Initialize Cluster
  if: ${{ env.IS_MANUAL_DEPLOYMENT == 'false' }}
  working-directory: infrastructure/nomad
  run: |
    STARTED_AT="$(date +%s)"
    ./cluster.sh init ${CLUSTER_ENVIRONMENT_FLAG} ${CLUSTER_DEBUG_FLAG}
    ELAPSED_SECONDS="$(( $(date +%s) - STARTED_AT ))"
    # Format the elapsed seconds as a UTC time of day to get HH:MM:SS.
    echo "INIT_DURATION=$(date -ud "@${ELAPSED_SECONDS}" +'%H:%M:%S')" >> ${GITHUB_ENV}
# Runs `cluster.sh deploy` with every flag computed in "Setup Environment"
# and exports how long it took as DEPLOY_DURATION (HH:MM:SS).
- name: Deploy Cluster
  working-directory: infrastructure/nomad
  run: |
    STARTED_AT="$(date +%s)"
    # Flags are left unquoted on purpose: each holds "--name value" or is empty.
    ./cluster.sh deploy \
      ${CLUSTER_ENVIRONMENT_FLAG} \
      ${CLUSTER_PROFILE_FLAG} \
      ${CLUSTER_LOGS_FLAG} \
      ${CLUSTER_DATADOG_KEY_FLAG} \
      ${CLUSTER_L1_RPC_URL_FLAG} \
      ${CLUSTER_DEBUG_FLAG}
    ELAPSED_SECONDS="$(( $(date +%s) - STARTED_AT ))"
    echo "DEPLOY_DURATION=$(date -ud "@${ELAPSED_SECONDS}" +'%H:%M:%S')" >> ${GITHUB_ENV}
# Manual runs only, when every earlier step succeeded: posts a success
# message with the deploy and total runner durations to the webhook stored in
# SLACK_CI_CHANNEL_WEBHOOK_URL.
- name: Notify - Deployment Successful
  if: ${{ env.IS_MANUAL_DEPLOYMENT == 'true' && success() }}
  run: |
    # Fall back to "N/A" when the deploy step exported no duration.
    : "${DEPLOY_DURATION:=N/A}"
    RUNNER_SECONDS="$(( $(date +%s) - ${RUNNER_START_TIME} ))"
    RUNNER_DURATION=$(date -ud "@${RUNNER_SECONDS}" +'%H:%M:%S')
    WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
    # The "\n" in the duration value is left as-is by the heredoc and becomes
    # a JSON newline escape. EOH stays at the left margin: `<<-` strips tabs only.
    SLACK_MESSAGE=$(cat <<-EOH
    {
      "text": "<@${{ github.actor }}> - deployment to <http://${TARGET_MACHINE_IP}:4646/ui|*${{ github.event.inputs.target_machine }}*> was successful",
      "attachments": [
        {
          "color": "#00FF00",
          "fields": [
            {
              "title": "Workflow",
              "value": "<${WORKFLOW_URL}|View Workflow Run>",
              "short": false
            },
            {
              "title": "Deployment Duration",
              "value": "Deploy: ${DEPLOY_DURATION}\nRunner: ${RUNNER_DURATION}",
              "short": false
            }
          ]
        }
      ]
    }
    EOH
    )
    curl -X POST -H 'Content-type: application/json' --data "${SLACK_MESSAGE}" "${{ secrets.SLACK_CI_CHANNEL_WEBHOOK_URL }}"
# Manual runs only, when an earlier step failed: posts a failure message,
# with a link to this workflow run, to the webhook stored in
# SLACK_CI_CHANNEL_WEBHOOK_URL.
- name: Notify - Deployment Failed
  if: ${{ env.IS_MANUAL_DEPLOYMENT == 'true' && failure() }}
  run: |
    WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
    # The payload must be strict JSON, so the last element of "fields" has no
    # trailing comma. The EOH terminator stays at the script's left margin
    # because `<<-` strips leading tabs only, not spaces.
    PAYLOAD=$(cat <<-EOH
    {
      "text": "<@${{ github.actor }}> - deployment to <http://${TARGET_MACHINE_IP}:4646/ui|*${{ github.event.inputs.target_machine }}*> has failed",
      "attachments": [
        {
          "color": "#FF0000",
          "fields": [
            {
              "title": "Workflow",
              "value": "<${WORKFLOW_URL}|View Workflow Run>",
              "short": false
            }
          ]
        }
      ]
    }
    EOH
    )
    curl -X POST -H 'Content-type: application/json' --data "${PAYLOAD}" "${{ secrets.SLACK_CI_CHANNEL_WEBHOOK_URL }}"
# Non-manual runs only, when an earlier step failed: saves the nomad unit's
# journal, the job list, and the stdout/stderr log of every task in every
# allocation to files in the working directory.
- name: Collect Cluster Logs
  if: ${{ env.IS_MANUAL_DEPLOYMENT == 'false' && failure() }}
  run: |
    NOMAD_SERVER="http://${TARGET_MACHINE_IP}:4646"
    journalctl -u nomad > nomad.log
    curl -s "${NOMAD_SERVER}/v1/jobs" > nomad_jobs.json
    ALLOC_IDS=$(curl -s "${NOMAD_SERVER}/v1/allocations" | jq -r '.[].ID')
    for ALLOC_ID in ${ALLOC_IDS}; do
      # Fetch the allocation once and read both fields from the same response.
      ALLOCATION=$(curl -s "${NOMAD_SERVER}/v1/allocation/${ALLOC_ID}")
      JOB=$(jq -r '.JobID' <<< "${ALLOCATION}")
      # `// {}`: an allocation without TaskStates yields no tasks instead of a
      # jq error that would abort the whole collection under `bash -e`.
      TASKS=$(jq -r '(.TaskStates // {}) | keys[]' <<< "${ALLOCATION}")
      for TASK in ${TASKS}; do
        for STREAM in stdout stderr; do
          RESPONSE=$(curl -s "${NOMAD_SERVER}/v1/client/fs/logs/${ALLOC_ID}?task=${TASK}&type=${STREAM}")
          # Use jq's exit status rather than its printed output: `-e` fails
          # when .Data is null/absent and also when the body is not JSON at
          # all, so error responses land in the failure branch.
          if jq -e '.Data' <<< "${RESPONSE}" > /dev/null 2>&1; then
            jq -r '.Data' <<< "${RESPONSE}" | base64 -d > "${ALLOC_ID}_${JOB}_${TASK}_${STREAM}.log"
          else
            echo "Failed to fetch ${STREAM} log for ${ALLOC_ID}_${JOB}_${TASK}:"
            echo "${RESPONSE}"
          fi
        done
      done
    done
# When an earlier step failed: uploads /tmp/dist/ plus the Nomad journal, job
# list and per-task log files as a single `debug-artifacts` artifact.
- name: Upload Debug Artifacts
  if: ${{ failure() }}
  uses: actions/upload-artifact@v4
  with:
    path: |
      /tmp/dist/
      nomad_jobs.json
      nomad.log
      *_stdout.log
      *_stderr.log
    name: debug-artifacts
# Non-manual runs only, when an earlier step failed: creates a tunshell
# session, posts the failure notice (including the client-side connect
# command) to the webhook stored in SLACK_CI_CHANNEL_WEBHOOK_URL, then starts
# the target side of the session on the runner.
- name: Initialize Debug Shell
  if: ${{ env.IS_MANUAL_DEPLOYMENT == 'false' && failure() }}
  env:
    # Passed through the environment rather than pasted into the script: the
    # event JSON holds free text (e.g. a commit message), and a single quote
    # in it would end the shell string it used to be embedded in.
    WORKFLOW_RUN_JSON: ${{ toJson(github.event.workflow_run) }}
  run: |
    TUNSHELL_KEYS=$(curl -sSf -X POST https://eu.relay.tunshell.com/api/sessions)
    # `\${TUNSHELL_SECRET}` stays literal so the secret itself is never posted;
    # whoever connects supplies it.
    DEBUG_SHELL="sh <(curl -sSf https://lets.tunshell.com/init.sh) L $(echo ${TUNSHELL_KEYS} | jq -r .peer2_key) \${TUNSHELL_SECRET} eu.relay.tunshell.com"
    WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
    # `@json | .[1:-1]` JSON-escapes the commit message (quotes, newlines,
    # backslashes) and drops the surrounding quotes, so it can be placed
    # inside a JSON string below without corrupting the payload.
    PR_TITLE="$(jq -r '(.head_commit.message // "") | @json | .[1:-1]' <<< "${WORKFLOW_RUN_JSON}")"
    PR_URL="$(jq -r '.head_commit.url // ""' <<< "${WORKFLOW_RUN_JSON}")"
    # Optional "Pull Request" field, emitted only when the event has a commit URL.
    PR_FIELD=""
    if [ -n "${PR_URL}" ]; then
      PR_FIELD="{ \"title\": \"Pull Request\", \"value\": \"<${PR_URL}|${PR_TITLE}>\", \"short\": false },"
    fi
    # EOH stays at the script's left margin: `<<-` strips leading tabs only.
    PAYLOAD=$(cat <<-EOH
    {
      "text": "<@${{ github.actor }}> infrastructure workflow has failed:",
      "attachments": [
        {
          "color": "#FF0000",
          "fields": [
            {
              "title": "Workflow",
              "value": "<${WORKFLOW_URL}|View Workflow Run>",
              "short": false
            },
            ${PR_FIELD}
            {
              "title": "Debug Shell",
              "value": "\`\`\`${DEBUG_SHELL}\`\`\`",
              "short": false
            }
          ]
        }
      ]
    }
    EOH
    )
    echo "Debug Shell: ${DEBUG_SHELL}"
    curl -X POST -H 'Content-type: application/json' --data "${PAYLOAD}" "${{ secrets.SLACK_CI_CHANNEL_WEBHOOK_URL }}"
    # Start the target side of the session on the runner.
    curl -sSf https://lets.tunshell.com/init.sh | sh -s -- T $(echo ${TUNSHELL_KEYS} | jq -r .peer1_key) ${{ secrets.TUNSHELL_SECRET }} eu.relay.tunshell.com