Skip to content

Commit

Permalink
feat: collect infrastructure cluster logs on failure (#197)
Browse files Browse the repository at this point in the history
  • Loading branch information
mrekucci authored Jul 5, 2024
1 parent db6f3b7 commit 1abbbfd
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 19 deletions.
42 changes: 42 additions & 0 deletions .github/workflows/infrastructure.yml
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,48 @@ jobs:
)
curl -X POST -H 'Content-type: application/json' --data "${PAYLOAD}" "${{ secrets.SLACK_CI_CHANNEL_WEBHOOK_URL }}"
- name: Collect Cluster Logs
  if: ${{ env.IS_MANUAL_DEPLOYMENT == 'false' && failure() }}
  run: |
    # Best-effort diagnostics: dump the nomad journal, the job list and the
    # stdout/stderr logs of every task of every allocation into the working
    # directory. Individual failures are reported and skipped so that one bad
    # allocation does not prevent the remaining logs from being collected.
    NOMAD_SERVER="http://${TARGET_MACHINE_IP}:4646"

    journalctl -u nomad > nomad.log || echo "Failed to collect the nomad journal"
    curl -s "${NOMAD_SERVER}/v1/jobs" > nomad_jobs.json || echo "Failed to fetch the job list"

    ALLOC_IDS=$(curl -s "${NOMAD_SERVER}/v1/allocations" | jq -r '.[].ID') || ALLOC_IDS=""
    # Word splitting is intentional here: one allocation ID per word.
    for ALLOC_ID in ${ALLOC_IDS}; do
      # Fetch the allocation once; both the job ID and the task names come from it.
      ALLOC=$(curl -s "${NOMAD_SERVER}/v1/allocation/${ALLOC_ID}") || ALLOC=""
      JOB=$(jq -r '.JobID' <<< "${ALLOC}" 2> /dev/null) || JOB="unknown"
      # Job IDs may contain '/' (e.g. periodic or dispatched child jobs), which
      # is not usable inside a file name.
      JOB="${JOB//\//_}"
      # TaskStates can be null (e.g. an allocation that never started); treat
      # that as "no tasks" instead of letting jq fail and abort the script.
      TASKS=$(jq -r '(.TaskStates // {}) | keys[]' <<< "${ALLOC}" 2> /dev/null) || TASKS=""
      for TASK in ${TASKS}; do
        for TYPE in stdout stderr; do
          LOG=$(curl -s "${NOMAD_SERVER}/v1/client/fs/logs/${ALLOC_ID}?task=${TASK}&type=${TYPE}") || LOG=""
          DATA=""
          # jq -e exits non-zero both when the reply is not JSON (Nomad reports
          # errors as plain text) and when .Data is null, so either case is
          # reported as a failure. An empty reply is kept as an empty log file.
          if [ -n "${LOG}" ] && ! DATA=$(jq -er '.Data' <<< "${LOG}" 2> /dev/null); then
            echo "Failed to fetch ${TYPE} log for ${ALLOC_ID}_${JOB}_${TASK}:"
            printf '%s\n' "${LOG}"
          else
            printf '%s' "${DATA}" | base64 -d > "${ALLOC_ID}_${JOB}_${TASK}_${TYPE}.log"
          fi
        done
      done
    done
# Publishes the collected Nomad diagnostics as a single workflow artifact.
- name: Upload Cluster Logs
  # Runs only when IS_MANUAL_DEPLOYMENT is 'false' and an earlier step failed.
  if: env.IS_MANUAL_DEPLOYMENT == 'false' && failure()
  uses: actions/upload-artifact@v4
  with:
    # One pattern per line; the *_stdout.log / *_stderr.log globs match the
    # per-task files written by the "Collect Cluster Logs" step.
    path: |
      nomad.log
      nomad_jobs.json
      *_stderr.log
      *_stdout.log
    name: cluster-logs
- name: Initialize Debug Shell
if: ${{ env.IS_MANUAL_DEPLOYMENT == 'false' && failure() }}
run: |
Expand Down
21 changes: 2 additions & 19 deletions infrastructure/nomad/playbooks/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -571,27 +571,14 @@

- name: Deploy Jobs
ansible.builtin.shell: |
RESULT="$(nomad run {{ ansible_env.HOME }}/{{ env }}/{{ job.name }}.nomad 2>&1)"
if [ $? -ne 0 ]; then
echo "Failed to deploy {{ job.name }}:"
echo "${RESULT}"
echo "{{ job.name }} stdout logs:"
nomad alloc logs -stdout -job "{{ job.name }}" | tail -n 100
echo "{{ job.name }} error logs:"
nomad alloc logs -stderr -job "{{ job.name }}" | tail -n 100
exit 1
fi
nomad run {{ ansible_env.HOME }}/{{ env }}/{{ job.name }}.nomad
TIMEOUT={% if profile == 'ci' %}600{% else %}300{% endif %}
START_TIME=$(date +%s)
RESULT=$(nomad job status -json "{{ job.name }}")
if [ $? -ne 0 ]; then
echo "Failed to get job status for {{ job.name }}:"
echo "${RESULT}"
echo "{{ job.name }} stdout logs:"
nomad alloc logs -stdout -job "{{ job.name }}" | tail -n 100
echo "{{ job.name }} error logs:"
nomad alloc logs -stderr -job "{{ job.name }}" | tail -n 100
exit 1
fi
JOB_TYPE=$(echo "${RESULT}" | jq -r '.[0].Allocations[0].JobType')
Expand Down Expand Up @@ -623,7 +610,7 @@
CURRENT_TIME="$(date +%s)"
ELAPSED_TIME="$(( CURRENT_TIME - START_TIME ))"
if [ ${ELAPSED_TIME} -ge ${TIMEOUT} ]; then
echo "Deploy timed out for {{ job.name }}, current status: ${STATUS}."
echo "Deploy timed out for {{ job.name }}, current status: ${STATUS}"
exit 1
fi
Expand All @@ -632,10 +619,6 @@
if [ $? -ne 0 ]; then
echo "Failed to get job status for {{ job.name }}:"
echo "${RESULT}"
echo "{{ job.name }} stdout logs:"
nomad alloc logs -stdout -job "{{ job.name }}" | tail -n 100
echo "{{ job.name }} error logs:"
nomad alloc logs -stderr -job "{{ job.name }}" | tail -n 100
exit 1
fi
done
Expand Down

0 comments on commit 1abbbfd

Please sign in to comment.