From 21956e3c41871f1325773c83eb85c91e2bb5fa30 Mon Sep 17 00:00:00 2001 From: Timo Wilken Date: Wed, 19 Oct 2022 18:00:23 +0200 Subject: [PATCH] Clean up running CI processes when killed This should make restarting CI processes on Mac much more reliable, as Nomad cannot reliably kill the whole tree itself there. This should also solve the annoyance of CI jobs leaving behind running build containers when they are killed. The behaviour relies on any long-running processes in the CI scripts being wrapped in a short_timeout or long_timeout call, which I think they all are. --- ci/build-helpers.sh | 21 ++++++++++++++++++++- ci/build-loop.sh | 2 +- ci/continuous-builder.sh | 17 +++++++++++++---- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/ci/build-helpers.sh b/ci/build-helpers.sh index 3069fd9f..6d5337f9 100755 --- a/ci/build-helpers.sh +++ b/ci/build-helpers.sh @@ -195,7 +195,13 @@ function short_timeout () { general_timeout "$TIMEOUT" "$@"; } function long_timeout () { general_timeout "$LONG_TIMEOUT" "$@"; } function general_timeout () { local ret=0 short_cmd - $timeout_exec -s9 "$@" || ret=$? + # Beware: backgrounding $timeout_exec means that it cannot read stdin! + $timeout_exec -s9 "$@" & + # `timeout` makes a new process group. If we're killed now, also kill it. + # shellcheck disable=SC2064 # expanding $! now is intentional + trap "kill $!; trap_handler" INT TERM + wait || ret=$? + trap trap_handler INT TERM # reset traps once `timeout` has exited # 124 if command times out; 137 if command is killed (including by timeout itself) if [ $ret -eq 124 ] || [ $ret -eq 137 ]; then # BASH_{SOURCE,LINENO}[0] is where we're being called from, which is inside @@ -212,3 +218,16 @@ function general_timeout () { fi return $ret } + +# Set this function to be invoked when killed using `trap trap_handler INT TERM`. +# When killed, make sure we exit cleanly by killing all subshells and processes +# in our process group. Long-running commands need special handling as commands +# are invoked by bash in their own process group; see the general_timeout +# function for details. +function trap_handler () { + # Reset trap handler so we don't recurse forever when killing ourselves below. + trap - INT TERM + # Kill everything in our process group (i.e. subshells that bash spawned). + # This does *not* include subprocesses run by bash, only subshells. + kill -- -$$ +} diff --git a/ci/build-loop.sh b/ci/build-loop.sh index 3c15a5eb..3e5ea81f 100755 --- a/ci/build-loop.sh +++ b/ci/build-loop.sh @@ -41,7 +41,7 @@ echo "$HASHES" | tail -n "+$((BUILD_SEQ + 1))" | cat -n | while read -r ahead bt # API request quota too quickly. succeeded) ;; esac - ) < /dev/null # Stop commands from slurping hashes, just in case. + ) & wait # Stop commands from slurping hashes; make "trap" work. done case "$BUILD_TYPE" in diff --git a/ci/continuous-builder.sh b/ci/continuous-builder.sh index 5b5a07e8..4e074e1f 100755 --- a/ci/continuous-builder.sh +++ b/ci/continuous-builder.sh @@ -6,6 +6,10 @@ . build-helpers.sh +# Make sure we exit cleanly and don't leave leftover processes running when +# killed. Bash doesn't do this automatically, unfortunately. +trap trap_handler INT TERM + if [ "$1" != --skip-setup ]; then if [ -r ~/.continuous-builder ]; then # Tell ShellCheck not to check the sourced file here. Assume the .env files are fine. @@ -107,9 +111,12 @@ if [ -n "$HASHES" ]; then # Run the build . build-loop.sh - # Something inside this subshell is reading stdin, which causes some hashes - # from above to be ignored. It shouldn't, so redirect from /dev/null. - ) < /dev/null + # Something inside this subshell is reading stdin, which used to cause + # some hashes from above to be ignored. Let "&" redirect from /dev/null. + # Additionally, use "& wait" so that the trap set above can run properly. + # If we just ran the subshell normally, bash would wait for it to exit + # before invoking the trap handler, which would take too long. + ) & wait # Outside subshell; we're back in the parent directory. # Delete builds older than 2 days, then keep deleting until we've got at @@ -161,7 +168,9 @@ run_duration=$(($(date +%s) - run_start_time)) # shellcheck disable=SC2031 # TIMEOUT was modified in a subshell; that's fine. timeout=$(get_config_value timeout "${TIMEOUT:-120}") if [ "$run_duration" -lt "$timeout" ]; then - sleep $((timeout - run_duration)) || : + # Wrap this in short_timeout so that sleep gets killed properly when the outer + # script is killed. + TIMEOUT=$timeout short_timeout sleep $((timeout - run_duration)) fi # Re-exec ourselves. This lets us update pick up updates to this script, e.g.