diff --git a/.github/workflows/e2e-slurm.yml b/.github/workflows/e2e-slurm.yml new file mode 100644 index 00000000..e98bc090 --- /dev/null +++ b/.github/workflows/e2e-slurm.yml @@ -0,0 +1,30 @@ +--- +name: Slurm + +on: [push] +jobs: + e2e-slurm: + name: Test e2e with SLURM + runs-on: ubuntu-latest + steps: + - name: checkout our repo + uses: actions/checkout@v4 + - name: Install apptainer + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.1.2 + - name: Install Conda + uses: conda-incubator/setup-miniconda@v3 + with: + activate-environment: babs + auto-update-conda: true + python-version: 3.9 + - name: Conda info + shell: bash -el {0} + run: conda info + - name: Install Babs + shell: bash -el {0} + run: ./tests/e2e-slurm/install-babs.sh + - name: Execute e2e with SLURM + shell: bash -el {0} + run: ./tests/e2e-slurm/main.sh diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml new file mode 100644 index 00000000..2cc56d4d --- /dev/null +++ b/.github/workflows/shellcheck.yml @@ -0,0 +1,24 @@ +name: Shellcheck scripts + +on: [push, pull_request] + +jobs: + test: + + runs-on: ubuntu-latest + + steps: + - name: Set up system + shell: bash + run: | + sudo apt-get update -qq + sudo apt-get install shellcheck + - uses: actions/checkout@v4 + - name: Run shellcheck + run: | + shellcheck \ + tests/e2e-slurm/container/babs-user-script.sh \ + tests/e2e-slurm/container/ensure-env.sh \ + tests/e2e-slurm/container/walkthrough-tests.sh \ + tests/e2e-slurm/install-babs.sh \ + tests/e2e-slurm/main.sh diff --git a/.gitignore b/.gitignore index 18792aa0..de79a6da 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,4 @@ build/ # Distribution / packaging dist/ -babs/VERSION \ No newline at end of file +babs/VERSION diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..a6143630 --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +install: + ./tests/e2e-slurm/install-babs.sh + +setup-user: + ./tests/e2e-slurm/setup-user.sh + +e2e: 
clean + ./tests/e2e-slurm/main.sh + +clean: + @ podman stop slurm 2>/dev/null || true + @ podman rm slurm 2>/dev/null || true + @[ -e .testdata/babs_test_project/toybidsapp-container ] && \ + datalad remove -d .testdata/babs_test_project/toybidsapp-container --reckless kill || : + rm -rf .testdata diff --git a/babs/utils.py b/babs/utils.py index e6004e3d..d8103475 100644 --- a/babs/utils.py +++ b/babs/utils.py @@ -1685,7 +1685,13 @@ def submit_one_test_job(analysis_path, type_system, flag_print_message=True): # e.g., on MIT OpenMind: no 1st line from MSI; only 2nd line. else: raise Exception("type system can be slurm or sge") - job_id = int(job_id_str) + + # This is necessary because SLURM commands can fail but still have return code 0 + try: + job_id = int(job_id_str) + except ValueError as e: + raise ValueError(f"Cannot convert {job_id_str!r} into an int: {e}. " + f"That output is a result of running command {cmd} which produced output {msg}.") # log filename: log_filename = job_name + ".*" + job_id_str diff --git a/setup.cfg b/setup.cfg index 05f6ea70..4c167ce1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,17 +24,18 @@ classifiers = [options] python_requires = >=3.7 install_requires = + backoff + datalad >= 0.17.2 + datalad_container >= 1.1.6 + filelock >= 3.8.0 nibabel >=2.2.1 numpy pandas - tqdm pyyaml >= 6.0 - #ruamel.yaml >= 0.17.21 - datalad >= 0.17.2 - datalad_container >= 1.1.6 - regex - filelock >= 3.8.0 qstat >= 0.0.5 + regex + #ruamel.yaml >= 0.17.21 + tqdm packages = find: include_package_data = True diff --git a/tests/e2e-slurm/container/babs-user-script.sh b/tests/e2e-slurm/container/babs-user-script.sh new file mode 100755 index 00000000..1a484a93 --- /dev/null +++ b/tests/e2e-slurm/container/babs-user-script.sh @@ -0,0 +1,117 @@ +#!/bin/bash -i + +SUBPROJECT_NAME=test_project + +set -eu + +echo "==============================================================" +echo "We are now running as user $(whoami)" +echo "DEBUG: MINICONDA_PATH=${MINICONDA_PATH}" +echo 
"DEBUG: TESTDATA=${TESTDATA}" + +# without MINICONDA_PATH set, shellcheck cannot follow +# shellcheck disable=SC1091 +source "$MINICONDA_PATH/etc/profile.d/conda.sh" +conda activate babs + +# record the miniconda path so it can be added to the test env (slurm jobs do not preserve env) +cat > /home/"$USER"/miniconda.env << EOF +. "$MINICONDA_PATH/etc/profile.d/conda.sh" +EOF + + + +git config --global user.name "e2e testuser" +git config --global user.email "testuser@example.com" +echo "Git user: $(git config user.name)" +echo "Git email: $(git config user.email)" + +# TODO switch back to osf project +# Populate input data (divergent from the tutorial because of https://github.com/datalad/datalad-osf/issues/191) +pushd "${TESTDATA}" +echo "Installing Input Data" +datalad install ///dbic/QA + +# Singularity image created by root, then chowned to this user, and datalad must be run as this user +datalad create -D "toy BIDS App" toybidsapp-container +pushd toybidsapp-container +datalad containers-add \ + --url "${PWD}/../toybidsapp-0.0.7.sif" \ + toybidsapp-0-0-7 +popd +rm -f toybidsapp-0.0.7.sif + + +# TODO file an issue about relative paths: --where_project must be an absolute path +babs-init \ + --where_project "${PWD}" \ + --project_name $SUBPROJECT_NAME \ + --input BIDS "${PWD}"/QA \ + --container_ds "${PWD}"/toybidsapp-container \ + --container_name toybidsapp-0-0-7 \ + --container_config_yaml_file "${PWD}"/config_toybidsapp.yaml \ + --type_session multi-ses \ + --type_system slurm + +echo "PASSED: babs-init" +echo "Check setup, without job" +babs-check-setup --project_root "${PWD}"/test_project/ +echo "PASSED: Check setup, without job" + +babs-check-setup --project_root "${PWD}"/test_project/ --job-test +echo "Job submitted: Check setup, with job" + +babs-status --project_root "${PWD}"/test_project/ + +# Wait for all running jobs to finish +while [[ -n $(squeue -u "$USER" -t RUNNING,PENDING --noheader) ]]; do + echo "squeue -u \"$USER\" -t RUNNING,PENDING" + squeue -u "$USER" -t 
RUNNING,PENDING + echo "Waiting for running jobs to finish..." + sleep 5 # Wait for 5 seconds before checking again +done + +echo "No running jobs." + +# TODO make sure this works +# Check for failed jobs TODO state filter doesn't seem to be working as expected +# if sacct -u $USER --state=FAILED --noheader | grep -q "FAILED"; then +if sacct -u "$USER" --noheader | grep -q "FAILED"; then + sacct -u "$USER" + echo "There are failed jobs." + exit 1 # Exit with failure status +else + sacct -u "$USER" + echo "PASSED: No failed jobs." +fi + +babs-submit --project-root "${PWD}/test_project/" + +# # Wait for all running jobs to finish +while [[ -n $(squeue -u "$USER" -t RUNNING,PENDING --noheader) ]]; do + echo "squeue -u \"$USER\" -t RUNNING,PENDING" + squeue -u "$USER" -t RUNNING,PENDING + echo "Waiting for running jobs to finish..." + sleep 5 # Wait for 5 seconds before checking again +done + +echo "=========================================================================" +echo "babs-status:" +babs-status --project_root "${PWD}"/test_project/ +echo "=========================================================================" + +# Check for failed jobs TODO see above +# if sacct -u $USER --state=FAILED --noheader | grep -q "FAILED"; then +if sacct -u "$USER" --noheader | grep -q "FAILED"; then + sacct -u "$USER" + echo "=========================================================================" + echo "There are failed jobs." + exit 1 # Exit with failure status +else + sacct -u "$USER" + echo "=========================================================================" + echo "PASSED: No failed jobs." +fi + +babs-merge --project_root "${PWD}"/test_project/ +echo "PASSED: e2e walkthrough successful!" 
diff --git a/tests/e2e-slurm/container/config_toybidsapp.yaml b/tests/e2e-slurm/container/config_toybidsapp.yaml new file mode 100644 index 00000000..b4839956 --- /dev/null +++ b/tests/e2e-slurm/container/config_toybidsapp.yaml @@ -0,0 +1,21 @@ +# Arguments in `singularity run`: +singularity_run: + --no-zipped: "" + --dummy: "2" + -v: "" + +# Output foldername(s) to be zipped, and the BIDS App version to be included in the zip filename(s): +zip_foldernames: + toybidsapp: "0-0-7" + +# How much cluster resources it needs: +cluster_resources: + interpreting_shell: /bin/bash + hard_memory_limit: 2G + +script_preamble: | + . ~/miniconda.env + conda activate babs + +# Where to run the jobs: +job_compute_space: "/tmp" diff --git a/tests/e2e-slurm/container/ensure-env.sh b/tests/e2e-slurm/container/ensure-env.sh new file mode 100755 index 00000000..352ed3a7 --- /dev/null +++ b/tests/e2e-slurm/container/ensure-env.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# +if [ -z "${MINICONDA_PATH:-}" ]; then + if hash conda; then + # We don't need the return value, we already catch the error + # shellcheck disable=SC2155 + export MINICONDA_PATH=$(/bin/which conda | xargs dirname | xargs dirname) + else + echo "ERROR: must have MINICONDA_PATH set or have 'conda' available" + exit 1 + fi +fi diff --git a/tests/e2e-slurm/container/walkthrough-tests.sh b/tests/e2e-slurm/container/walkthrough-tests.sh new file mode 100755 index 00000000..1c57f2d9 --- /dev/null +++ b/tests/e2e-slurm/container/walkthrough-tests.sh @@ -0,0 +1,50 @@ +#!/bin/bash -i +# Here we perform all actions that must be done as root inside the container and then +# execute the walkthrough as BABS_USER +set -eu + +export TESTDATA=/opt/testdata +BABS_USER=testuser + +# Install singularity inside the container +yum update -y && yum install -y epel-release && yum update -y && yum install -y singularity-runtime apptainer + +# Wait for slurm to be up +max_retries=10 +delay=10 # seconds + +echo "Try connecting to slurm with sacct until it 
succeeds" +set +e # We need to check the error code and allow failures until slurm has started up +export PATH=${PWD}/tests/e2e-slurm/bin/:${PATH} +for ((i=1; i<=max_retries; i++)); do + # Check if the command was successful + if sacct > /dev/null; then + echo "Slurm is up and running!" + break + else + echo "Waiting for Slurm to start... retry $i/$max_retries" + sleep $delay + fi + # exit if max retries reached + if [ "$i" -eq "$max_retries" ]; then + echo "Failed to start Slurm after $max_retries attempts." + exit 1 + fi +done +set -e + +# Currently we are root inside the container. Now we create a user to own the testdata +useradd "$BABS_USER" +# cp rather than use bind directly so it can be owned by the container user and not cause issues outside +mkdir "${TESTDATA}" +cp /opt/outer/* "${TESTDATA}" + + +# We build the singularity container now while we are root, and use it later as testuser +pushd "${TESTDATA}" +singularity build \ + toybidsapp-0.0.7.sif \ + docker://pennlinc/toy_bids_app:0.0.7 + +chown -R "$BABS_USER:$BABS_USER" "${TESTDATA}" +su "${BABS_USER}" "${TESTDATA}/babs-user-script.sh" diff --git a/tests/e2e-slurm/install-babs.sh b/tests/e2e-slurm/install-babs.sh new file mode 100755 index 00000000..a0674743 --- /dev/null +++ b/tests/e2e-slurm/install-babs.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -eu + +. tests/e2e-slurm/container/ensure-env.sh + +conda install -c conda-forge datalad git git-annex -y + +# Optional dependencies, required for e2e-slurm +pip install datalad_container +pip install datalad-osf + +pip install . 
diff --git a/tests/e2e-slurm/main.sh b/tests/e2e-slurm/main.sh
new file mode 100755
index 00000000..f4e35a4b
--- /dev/null
+++ b/tests/e2e-slurm/main.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#
+# Launch the e2e walkthrough inside a podman container started from the
+# centos7-slurm image. Run from the repository root: ensure-env.sh is sourced
+# via a path relative to the current directory, and ${PWD} is bind-mounted.
+#
+set -eux
+
+# Expects: Conda env to be activated
+# Expects: Babs to be installed
+#
+# WIP-NOT-WORKING
+# Reminder :Z for selinux
+
+# TODO switch back to upstream after build
+# Currently using asmacdo, OpenSSL bump upstream, but no new docker build
+# https://github.com/giovtorres/docker-centos7-slurm/pull/49
+REGISTRY=docker.io
+HUBUSER=asmacdo
+# HUBUSER=giovtorres
+REPO=centos7-slurm
+# REPO=docker-centos7-slurm
+TAG=23.11.07 # TODO
+
+FQDN_IMAGE=${REGISTRY}/${HUBUSER}/${REPO}:${TAG}
+# Directory holding this script; its container/ subdirectory is mounted at /opt/outer
+THIS_DIR="$(readlink -f "$0" | xargs dirname )"
+
+# Sets MINICONDA_PATH
+. tests/e2e-slurm/container/ensure-env.sh
+
+# Use ${CONDA_DEFAULT_ENV:-} so that, under `set -u`, running without an activated
+# conda env reaches the explanatory error below instead of "unbound variable".
+if [ "$MINICONDA_PATH/envs/${CONDA_DEFAULT_ENV:-}/bin/babs-init" != "$(which babs-init)" ]; then
+    echo "Error: This script expects to be run inside a conda env with 'babs-init'!" >&2
+    echo " We have not found it in conda env '${CONDA_DEFAULT_ENV:-}' under '$MINICONDA_PATH'" >&2
+    exit 1
+fi
+
+echo "Success, we are in the conda env with babs-init!"
+
+# PWD shared so babs can optionally be installed with develop install
+podman run -it --rm \
+    --name slurm \
+    --hostname slurmctl \
+    -e "MINICONDA_PATH=${MINICONDA_PATH}" \
+    --privileged \
+    -v "${PWD}:${PWD}:ro,Z" \
+    -v "${MINICONDA_PATH}:${MINICONDA_PATH}:Z" \
+    -v "${THIS_DIR}/container:/opt/outer:ro,Z" \
+    "${FQDN_IMAGE}" \
+    /bin/bash -c ". /opt/outer/walkthrough-tests.sh"