From 989c817f151a5741c76e9f3de1da0502ecd8c79d Mon Sep 17 00:00:00 2001 From: Martin Pitt Date: Mon, 8 Apr 2024 04:25:07 +0200 Subject: [PATCH] tasks: Introduce elastic cloud runner mode As the EC2 bare metal instances are expensive, we don't want them to run permanently, but only in times of high demand. They should then terminate themselves when the queue is running out of work. Define "run out of work" as "the number of job-runner entries in the AMQP queue drops below 10". At that level, our permanent PSI runners can keep up. This is a more robust global criterion than checking if `run-queue` encountered an empty queue, as that check is more prone to terminating only *some* of the instances while some others keep picking up brand new queue entries. Introduce an "idle poweroff" mode in which the `cockpit-tasks` main loop exits with code 100 instead of slumbering when work is running low. Configure the slice to automatically power off the machine once all cockpit-tasks instances have exited cleanly (we don't want this on failures, so that we can ssh in and examine them). Use the `poweroff-immediate` heavy hammer there, to avoid potential hangs on shutdown -- there is nothing to rescue from the instance anyway. Plumb that through the AWS Ansible role and document it. --- ansible/aws/README.md | 2 ++ ansible/roles/tasks-systemd/tasks/main.yml | 1 + tasks/container/cockpit-tasks | 10 ++++++++++ tasks/install-service | 14 ++++++++++++++ 4 files changed, 27 insertions(+) diff --git a/ansible/aws/README.md b/ansible/aws/README.md index df8c50fb..dbdd03c9 100644 --- a/ansible/aws/README.md +++ b/ansible/aws/README.md @@ -57,6 +57,8 @@ Create and configure the instance: If you run more than one at a time, set a custom host name with `-e hostname=cockpit-aws-tasks-2` or similar, so that GitHub test statuses remain useful to identify where a test runs. +There is also an "elastic" mode where the tasks bots keep running until the AMQP queue runs low. 
Use that for situations where AWS instances act as extra high-demand capacity instead of being the primary runners. Enable that mode with `-e idle_poweroff=1`. + Webhook setup ------------- AWS runs our primary webhook. Deploy or update it with: diff --git a/ansible/roles/tasks-systemd/tasks/main.yml b/ansible/roles/tasks-systemd/tasks/main.yml index 855c2fbc..d05afda7 100644 --- a/ansible/roles/tasks-systemd/tasks/main.yml +++ b/ansible/roles/tasks-systemd/tasks/main.yml @@ -146,4 +146,5 @@ export INSTANCES={{ instances | default(1) }} export TEST_NOTIFICATION_MX={{ notification_mx | default('') }} export TEST_NOTIFICATION_TO={{ notification_to | default('') }} + export IDLE_POWEROFF={{ idle_poweroff | default('') }} /run/install-service diff --git a/tasks/container/cockpit-tasks b/tasks/container/cockpit-tasks index af3c22db..4b64393b 100755 --- a/tasks/container/cockpit-tasks +++ b/tasks/container/cockpit-tasks @@ -20,7 +20,17 @@ function update_bots() { } # wait between 1 and 10 minutes, with an override to speed up tests +# in IDLE_POWEROFF mode, also check queue size function slumber() { + if [ -n "${IDLE_POWEROFF:-}" ]; then + # only consider job-runner entries, not statistics or webhook + NUM_JOBS=$(./inspect-queue | grep --count '"job":') + if [ "$NUM_JOBS" -lt 10 ]; then + echo "Job queue running low, exiting" + exit 100 + fi + fi + if [ -n "${SLUMBER:-}" ]; then sleep "$SLUMBER" else diff --git a/tasks/install-service b/tasks/install-service index 5de050d5..56200ab9 100755 --- a/tasks/install-service +++ b/tasks/install-service @@ -39,6 +39,9 @@ After=podman.socket [Service] Slice=cockpittasks.slice Restart=always +# cockpit-tasks exits with 100 in IDLE_POWEROFF mode when queue is running low +SuccessExitStatus=100 +RestartPreventExitStatus=100 RestartSec=60 # give image pull enough time TimeoutStartSec=10min @@ -63,6 +66,7 @@ ExecStart=/usr/bin/podman run --name=cockpit-tasks-%i --hostname=${CONTAINER_HOS 
--env=GIT_AUTHOR_EMAIL=cockpituous@cockpit-project.org \ --env=TEST_NOTIFICATION_MX=${TEST_NOTIFICATION_MX} \ --env=TEST_NOTIFICATION_TO=${TEST_NOTIFICATION_TO} \ + --env=IDLE_POWEROFF=${IDLE_POWEROFF:-} \ ghcr.io/cockpit-project/tasks cockpit-tasks --verbose ExecStop=/usr/bin/podman rm -f cockpit-tasks-%i @@ -70,6 +74,16 @@ ExecStop=/usr/bin/podman rm -f cockpit-tasks-%i WantedBy=multi-user.target EOF +# mode for elastic cloud runners +if [ -n "${IDLE_POWEROFF:-}" ]; then + mkdir -p /etc/systemd/system/cockpittasks.slice.d + cat <<EOF > /etc/systemd/system/cockpittasks.slice.d/poweroff.conf + [Unit] + StopWhenUnneeded=yes + SuccessAction=poweroff-immediate +EOF +fi + systemctl daemon-reload for i in `seq $INSTANCES`; do systemctl enable --now cockpit-tasks@$i; done