From 8d35c613dd4082a68c5152ffb682b23880bdf4fc Mon Sep 17 00:00:00 2001 From: e2e slurm Date: Tue, 2 Jan 2024 09:30:22 -0500 Subject: [PATCH 1/6] alphabetize requirements --- setup.cfg | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index 288e627a..348a7117 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,17 +24,17 @@ classifiers = [options] python_requires = >=3.7 install_requires = + datalad >= 0.17.2 + datalad_container >= 1.1.6 + filelock >= 3.8.0 nibabel >=2.2.1 numpy pandas - tqdm pyyaml >= 6.0 - #ruamel.yaml >= 0.17.21 - datalad >= 0.17.2 - datalad_container >= 1.1.6 - regex - filelock >= 3.8.0 qstat >= 0.0.5 + regex + #ruamel.yaml >= 0.17.21 + tqdm packages = find: include_package_data = True From c0d3479b75b6b1acc99ac01ecd2b17d48354680e Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 13 Dec 2023 12:22:23 -0500 Subject: [PATCH 2/6] Setup e2e tests with a slurm microcluster --- .github/workflows/e2e-slurm.yml | 30 +++++++ .gitignore | 5 +- Makefile | 22 +++++ babs/babs.py | 4 + babs/utils.py | 13 ++- setup.cfg | 1 + tests/e2e-slurm/container/babs-user-script.sh | 88 +++++++++++++++++++ .../container/config_toybidsapp.yaml | 21 +++++ tests/e2e-slurm/container/ensure-env.sh | 11 +++ tests/e2e-slurm/container/rerun.sh | 6 ++ .../e2e-slurm/container/walkthrough-tests.sh | 58 ++++++++++++ tests/e2e-slurm/install-babs.sh | 15 ++++ tests/e2e-slurm/main.sh | 50 +++++++++++ 13 files changed, 322 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/e2e-slurm.yml create mode 100644 Makefile create mode 100755 tests/e2e-slurm/container/babs-user-script.sh create mode 100644 tests/e2e-slurm/container/config_toybidsapp.yaml create mode 100755 tests/e2e-slurm/container/ensure-env.sh create mode 100755 tests/e2e-slurm/container/rerun.sh create mode 100755 tests/e2e-slurm/container/walkthrough-tests.sh create mode 100755 tests/e2e-slurm/install-babs.sh create mode 100755 tests/e2e-slurm/main.sh diff --git a/.github/workflows/e2e-slurm.yml b/.github/workflows/e2e-slurm.yml new file mode 100644 index 00000000..e98bc090 --- /dev/null +++ b/.github/workflows/e2e-slurm.yml @@ -0,0 +1,30 @@ +--- +name: Slurm + +on: [push] +jobs: + e2e-slurm: + name: Test e2e with SLURM + runs-on: ubuntu-latest + steps: + - name: checkout our repo + uses: actions/checkout@v4 + - name: Install apptainer + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.1.2 + - name: Install Conda + uses: conda-incubator/setup-miniconda@v3 + with: + activate-environment: babs + auto-update-conda: true + python-version: 3.9 + - name: Conda info + shell: bash -el {0} + run: conda info + - name: Install Babs + shell: bash -el {0} + run: ./tests/e2e-slurm/install-babs.sh + - name: Execute e2e with SLURM + shell: bash -el {0} + run: ./tests/e2e-slurm/main.sh diff --git a/.gitignore b/.gitignore index 18792aa0..cf103c72 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,7 @@ build/ # Distribution / packaging dist/ -babs/VERSION \ No newline at end of file +babs/VERSION + +# e2e testdata +.testdata* diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..b0375260 --- /dev/null +++ b/Makefile @@ -0,0 +1,22 @@ +install: + ./tests/e2e-slurm/install-babs.sh + +setup-user: + ./tests/e2e-slurm/setup-user.sh + +e2e: clean + ./tests/e2e-slurm/main.sh + +build: clean + podman build -f tests/e2e-slurm/Containerfile . -t testss + +# TODO testdata variable +clean: + podman stop slurm 2>/dev/null || true + podman rm slurm 2>/dev/null || true + [ -e .testdata/babs_test_project/toybidsapp-container ] && \ + datalad remove -d .testdata/babs_test_project/toybidsapp-container --reckless kill || : + rm -rf .testdata + +logs: + cat .testdata/ci-logs/* diff --git a/babs/babs.py b/babs/babs.py index f9719fe1..4c9025a2 100644 --- a/babs/babs.py +++ b/babs/babs.py @@ -2778,10 +2778,14 @@ def generate_job_submit_template(self, yaml_path, babs, system, test=False): env_flags = "-v DSLOCKFILE=" + babs.analysis_path + "/.SGE_datalad_lock" elif system.type == "slurm": submit_head = "sbatch" + # TODO: asmacdo env_flags = "--export=DSLOCKFILE=" + babs.analysis_path + "/.SLURM_datalad_lock" else: warnings.warn("not supporting systems other than sge...") + # TODO: rm asmacdo hack + # env_flags = env_flags + f",MINICONDA_PATH={os.getenv('MINICONDA_PATH')}" + # Check if the bash file already exist: if op.exists(yaml_path): os.remove(yaml_path) # remove it diff --git a/babs/utils.py b/babs/utils.py index e6004e3d..a22aa201 100644 --- a/babs/utils.py +++ b/babs/utils.py @@ -1674,6 +1674,7 @@ def submit_one_test_job(analysis_path, type_system, flag_print_message=True): stdout=subprocess.PIPE) proc_cmd.check_returncode() + print(f"Return code: {proc_cmd.returncode}") msg = proc_cmd.stdout.decode('utf-8') if type_system == "sge": @@ -1685,7 +1686,13 @@ def submit_one_test_job(analysis_path, type_system, flag_print_message=True): # e.g., on MIT OpenMind: no 1st line from MSI; only 2nd line. else: raise Exception("type system can be slurm or sge") - job_id = int(job_id_str) + + # This is necessary SLURM commands can fail but have return code 0 + try: + job_id = int(job_id_str) + except ValueError as e: + raise ValueError(f"Cannot convert {job_id_str!r} into an int: {e}. " + f"That output is a result of running command {cmd} which produced output {msg}.") # log filename: log_filename = job_name + ".*" + job_id_str @@ -2072,8 +2079,12 @@ def get_last_line(fn): # remove spaces at the beginning or the end; remove '\n': last_line = last_line.strip().replace("\n", "") else: + print("empty file") + print(fn) last_line = np.nan else: # e.g., `qw` pending + print("file DNE") + print(fn) last_line = np.nan return last_line diff --git a/setup.cfg b/setup.cfg index 348a7117..698966d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,7 @@ classifiers = [options] python_requires = >=3.7 install_requires = + backoff datalad >= 0.17.2 datalad_container >= 1.1.6 filelock >= 3.8.0 diff --git a/tests/e2e-slurm/container/babs-user-script.sh b/tests/e2e-slurm/container/babs-user-script.sh new file mode 100755 index 00000000..48c9c846 --- /dev/null +++ b/tests/e2e-slurm/container/babs-user-script.sh @@ -0,0 +1,88 @@ +#!/bin/bash -i + +SUBPROJECT_NAME=test_project + +set -eu + +echo "We are now running as user $(whoami)" +echo "DEBUG: MINICONDA_PATH=${MINICONDA_PATH}" +echo "DEBUG: TESTDATA=${TESTDATA}" + +source "$MINICONDA_PATH/etc/profile.d/conda.sh" +conda activate babs + +# record the miniconda path so it can added to the test env (slurm jobs do not preserve env) +cat > /home/"$USER"/miniconda.env << EOF +. "$MINICONDA_PATH/etc/profile.d/conda.sh" +EOF + + + +git config --global user.name "e2e testuser" +git config --global user.email "testuser@example.com" +echo "Git user: $(git config user.name)" +echo "Git email: $(git config user.email)" + +# TODO switch back to osf project +# Populate input data (Divergent from tuturial, bc https://github.com/datalad/datalad-osf/issues/191 +pushd ${TESTDATA} +echo "Installing Input Data" +datalad install ///dbic/QA + +# Singularity image created by root, then chowned to this user, and datalad must be run as this user +datalad create -D "toy BIDS App" toybidsapp-container +pushd toybidsapp-container +datalad containers-add \ + --url ${PWD}/../toybidsapp-0.0.7.sif \ + toybidsapp-0-0-7 +popd +rm -f toybidsapp-0.0.7.sif + + +# TODO File Issue: --where_project must be abspath file issue for relative path +babs-init \ + --where_project "${PWD}" \ + --project_name $SUBPROJECT_NAME \ + --input BIDS "${PWD}"/QA \ + --container_ds "${PWD}"/toybidsapp-container \ + --container_name toybidsapp-0-0-7 \ + --container_config_yaml_file "${PWD}"/config_toybidsapp.yaml \ + --type_session multi-ses \ + --type_system slurm + +echo "PASSED: babs-init" +echo "Check setup, without job" +babs-check-setup --project_root "${PWD}"/test_project/ +echo "PASSED: Check setup, without job" + +babs-check-setup --project_root "${PWD}"/test_project/ --job-test +echo "Job submitted: Check setup, with job" + +babs-status --project_root "${PWD}"/test_project/ +# +# babs-submit --project_root "${PWD}"/test_project/ +# +# babs-status --project_root "${PWD}"/test_project/ +# sleep 30s +# babs-status --project_root "${PWD}"/test_project/ +# +# echo "Print job logs--------------------------------------------" +# find "${PWD}"/test_project/analysis/logs/* -type f -print -exec cat {} \; +# echo "end job logs--------------------------------------------" +# # TODO: babs-check-status-job +# +# # TODO babs-merge +# +# popd +# # /tests/e2e-slurm/babs-tests.sh +# # podman exec \ +# # -e MINICONDA_PATH=${MINICONDA_PATH} \ +# # slurm \ +# # ${PWD}/tests/e2e-slurm/babs-tests.sh +# # +# +# +# echo "--------------------------" +# echo " HUZZZZZZAHHHHHH!!!!!!" +# echo "--------------------------" +# diff --git a/tests/e2e-slurm/container/config_toybidsapp.yaml b/tests/e2e-slurm/container/config_toybidsapp.yaml new file mode 100644 index 00000000..b4839956 --- /dev/null +++ b/tests/e2e-slurm/container/config_toybidsapp.yaml @@ -0,0 +1,21 @@ +# Arguments in `singularity run`: +singularity_run: + --no-zipped: "" + --dummy: "2" + -v: "" + +# Output foldername(s) to be zipped, and the BIDS App version to be included in the zip filename(s): +zip_foldernames: + toybidsapp: "0-0-7" + +# How much cluster resources it needs: +cluster_resources: + interpreting_shell: /bin/bash + hard_memory_limit: 2G + +script_preamble: | + . ~/miniconda.env + conda activate babs + +# Where to run the jobs: +job_compute_space: "/tmp" diff --git a/tests/e2e-slurm/container/ensure-env.sh b/tests/e2e-slurm/container/ensure-env.sh new file mode 100755 index 00000000..67550ae1 --- /dev/null +++ b/tests/e2e-slurm/container/ensure-env.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# +# exported for use in inner-slurm.sh +if [ -z "${MINICONDA_PATH:-}" ]; then + if hash conda; then + export MINICONDA_PATH=$(/bin/which conda | xargs dirname | xargs dirname) + else + echo "ERROR: must have MINICONDA_PATH set or have 'conda' available" + exit 1 + fi +fi diff --git a/tests/e2e-slurm/container/rerun.sh b/tests/e2e-slurm/container/rerun.sh new file mode 100755 index 00000000..23b1d254 --- /dev/null +++ b/tests/e2e-slurm/container/rerun.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +su "testuser" "rm -rf ${TESTDATA}" +cp /opt/outer/* "${TESTDATA}" + +su "${BABS_USER}" "${TESTDATA}/babs-user-script.sh" diff --git a/tests/e2e-slurm/container/walkthrough-tests.sh b/tests/e2e-slurm/container/walkthrough-tests.sh new file mode 100755 index 00000000..df586761 --- /dev/null +++ b/tests/e2e-slurm/container/walkthrough-tests.sh @@ -0,0 +1,58 @@ +#!/bin/bash -i + +set -eu + +# add that outside user +# groupadd --gid "$GID" "$USER" && useradd --uid $UID --gid "$GID" "$USER" + +# Install singularity inside the container +yum update -y && yum install -y epel-release && yum update -y && yum install -y singularity-runtime apptainer +# +# git version +# git config user.name > /dev/null || git config --system user.name "e2e slurm" +# git config user.email > /dev/null || git config --system user.email "fake@example.com" +# git config --system --add safe.directory '*' + +export TESTDATA=/opt/testdata +BABS_USER=testuser + + +# Wait for slurm to be up +max_retries=10 +delay=10 # seconds + +echo "Try connecting to slurm with sacct until it succeeds" +set +e # We need to check the error code and allow failures until slurm has started up +export PATH=${PWD}/tests/e2e-slurm/bin/:${PATH} +for ((i=1; i<=max_retries; i++)); do + # Check if the command was successful + if sacct; then + echo "Slurm is up and running!" + break + else + echo "Waiting for Slurm to start... retry $i/$max_retries" + sleep $delay + fi + # exit if max retries reached + if [ $i -eq $max_retries ]; then + echo "Failed to start Slurm after $max_retries attempts." + exit 1 + fi +done +set -e + +# Currently we are root inside the container. Now we create a user to own the testdata +useradd "$BABS_USER" +# cp rather than use bind directly so it can be owned by the container user and not cause issues outside +mkdir "${TESTDATA}" +cp /opt/outer/* "${TESTDATA}" + + +# We build the singularity container now while we are root, and use it later as testuser +pushd "${TESTDATA}" +singularity build \ + toybidsapp-0.0.7.sif \ + docker://pennlinc/toy_bids_app:0.0.7 + +chown -R "$BABS_USER:$BABS_USER" "${TESTDATA}" +su "${BABS_USER}" "${TESTDATA}/babs-user-script.sh" diff --git a/tests/e2e-slurm/install-babs.sh b/tests/e2e-slurm/install-babs.sh new file mode 100755 index 00000000..b9531fc8 --- /dev/null +++ b/tests/e2e-slurm/install-babs.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -eu + +. tests/e2e-slurm/container/ensure-env.sh + +conda install -c conda-forge datalad git git-annex -y + +# Optional dependencies, required for e2e-slurm +pip install datalad_container +pip install datalad-osf + +# TODO non-dynamic for prod +# pip install . +pip install -e . diff --git a/tests/e2e-slurm/main.sh b/tests/e2e-slurm/main.sh new file mode 100755 index 00000000..101a72de --- /dev/null +++ b/tests/e2e-slurm/main.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# +set -eux + +# Expects: Conda env to be activated +# Expects: Babs to be installed +# +# WIP-NOT-WORKING +# Reminder :Z for selinux + +# TODO switch back to upstream after build +# Currently using asmacdo, OpenSSL bump upstream, but no new docker build +# https://github.com/giovtorres/docker-centos7-slurm/pull/49 +REGISTRY=docker.io +HUBUSER=asmacdo +# HUBUSER=giovtorres +REPO=centos7-slurm +# REPO=docker-centos7-slurm +TAG=23.11.07 # TODO + +FQDN_IMAGE=${REGISTRY}/${HUBUSER}/${REPO}:${TAG} +THIS_DIR="$(readlink -f "$0" | xargs dirname )" +TESTDATA=/opt/testdata + +. tests/e2e-slurm/container/ensure-env.sh + +if [ "$MINICONDA_PATH/envs/$CONDA_DEFAULT_ENV/bin/babs-init" != "$(which babs-init)" ]; then + echo "Error: This script expects to be run inside a conda env with 'babs-init'!" >&2 + echo " We have not found it in conda env '$CONDA_DEFAULT_ENV' under '$MINICONDA_PATH'" >&2 + exit 1 +fi + +stop_container () { + podman stop slurm || true +} + +echo "Success, we are in the conda env with babs-init!" + # Because babs is dev-installed from here. TODO: we can remove if we remove -e from pip install +podman run -it --rm \ + --name slurm \ + --hostname slurmctl \ + -e "MINICONDA_PATH=${MINICONDA_PATH}" \ + --privileged \ + -v "${PWD}:${PWD}:ro,Z" \ + -v "${MINICONDA_PATH}:${MINICONDA_PATH}:Z" \ + -v "${THIS_DIR}/container:/opt/outer:ro,Z" \ + "${FQDN_IMAGE}" \ + /bin/bash -c ". /opt/outer/walkthrough-tests.sh" # TODO keep these logs? + +# trap stop_container EXIT From 4727b55ce9cfa17cd58d5fa29b35ae882c22e99a Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Thu, 11 Jan 2024 11:34:20 -0500 Subject: [PATCH 3/6] Add babs-submit and babs-merge --- tests/e2e-slurm/container/babs-user-script.sh | 65 +++++++++++++++++++ .../e2e-slurm/container/walkthrough-tests.sh | 2 +- tests/e2e-slurm/main.sh | 3 +- 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/tests/e2e-slurm/container/babs-user-script.sh b/tests/e2e-slurm/container/babs-user-script.sh index 48c9c846..60c7c97e 100755 --- a/tests/e2e-slurm/container/babs-user-script.sh +++ b/tests/e2e-slurm/container/babs-user-script.sh @@ -59,6 +59,71 @@ babs-check-setup --project_root "${PWD}"/test_project/ --job-test echo "Job submitted: Check setup, with job" babs-status --project_root "${PWD}"/test_project/ + +# Wait for all running jobs to finish +while [[ -n $(squeue -u $USER -t RUNNING,PENDING --noheader) ]]; do + echo "squeue -u $USER -t RUNNING,PENDING" + squeue -u $USER -t RUNNING,PENDING + echo "Waiting for running jobs to finish..." + sleep 5 # Wait for 60 seconds before checking again +done + +echo "No running jobs." + +# TODO make sure this works +# Check for failed jobs TODO state filter doesnt seem to be working as expected +# if sacct -u $USER --state=FAILED --noheader | grep -q "FAILED"; then +if sacct -u $USER --noheader | grep -q "FAILED"; then + sacct -u $USER + echo "There are failed jobs." + exit 1 # Exit with failure status +else + sacct -u $USER + echo "PASSED: No failed jobs." +fi + +babs-submit --project-root "${PWD}/test_project/" + +# # Wait for all running jobs to finish +while [[ -n $(squeue -u $USER -t RUNNING,PENDING --noheader) ]]; do + echo "squeue -u $USER -t RUNNING,PENDING" + squeue -u $USER -t RUNNING,PENDING + echo "Waiting for running jobs to finish..." + sleep 5 # Wait for 60 seconds before checking again +done + +echo "=========================================================================" +echo "babs-status:" +babs-status --project_root "${PWD}"/test_project/ +echo "=========================================================================" + +# Check for failed jobs TODO state filter doesnt seem to be working as expected +# if sacct -u $USER --state=FAILED --noheader | grep -q "FAILED"; then +if sacct -u $USER --noheader | grep -q "FAILED"; then + sacct -u $USER + echo "=========================================================================" + echo "There are failed jobs." + exit 1 # Exit with failure status +else + sacct -u $USER + echo "=========================================================================" + echo "PASSED: No failed jobs." +fi + +babs-merge --project_root "${PWD}"/test_project/ + + +# TODO: we need to fail if there is a failed job +# fi + +# sleep 10 +# babs-status --project_root "${PWD}"/test_project/ +# sleep 10 +# babs-status --project_root "${PWD}"/test_project/ +# sleep 10 +# babs-status --project_root "${PWD}"/test_project/ +# sleep 10 +# babs-status --project_root "${PWD}"/test_project/ # # babs-submit --project_root "${PWD}"/test_project/ # diff --git a/tests/e2e-slurm/container/walkthrough-tests.sh b/tests/e2e-slurm/container/walkthrough-tests.sh index df586761..27f8b13a 100755 --- a/tests/e2e-slurm/container/walkthrough-tests.sh +++ b/tests/e2e-slurm/container/walkthrough-tests.sh @@ -26,7 +26,7 @@ set +e # We need to check the error code and allow failures until slurm has star export PATH=${PWD}/tests/e2e-slurm/bin/:${PATH} for ((i=1; i<=max_retries; i++)); do # Check if the command was successful - if sacct; then + if sacct > /dev/null; then echo "Slurm is up and running!" break else diff --git a/tests/e2e-slurm/main.sh b/tests/e2e-slurm/main.sh index 101a72de..b4b8125d 100755 --- a/tests/e2e-slurm/main.sh +++ b/tests/e2e-slurm/main.sh @@ -45,6 +45,7 @@ podman run -it --rm \ -v "${MINICONDA_PATH}:${MINICONDA_PATH}:Z" \ -v "${THIS_DIR}/container:/opt/outer:ro,Z" \ "${FQDN_IMAGE}" \ - /bin/bash -c ". /opt/outer/walkthrough-tests.sh" # TODO keep these logs? + /bin/bash -c ". /opt/outer/walkthrough-tests.sh" + #/bin/bash -c ". /opt/outer/walkthrough-tests.sh && bash" # TODO remove, for debug only # trap stop_container EXIT From 35f56120ce5a0408d13e49a7b8fb36553f9c7281 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 12 Jan 2024 10:03:49 -0500 Subject: [PATCH 4/6] remove dev artifacts and codespell --- Makefile | 3 --- babs/utils.py | 4 ---- tests/e2e-slurm/container/babs-user-script.sh | 4 ++-- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index b0375260..03949a2e 100644 --- a/Makefile +++ b/Makefile @@ -7,9 +7,6 @@ setup-user: e2e: clean ./tests/e2e-slurm/main.sh -build: clean - podman build -f tests/e2e-slurm/Containerfile . -t testss - # TODO testdata variable clean: podman stop slurm 2>/dev/null || true diff --git a/babs/utils.py b/babs/utils.py index a22aa201..f6ca0c50 100644 --- a/babs/utils.py +++ b/babs/utils.py @@ -2079,12 +2079,8 @@ def get_last_line(fn): # remove spaces at the beginning or the end; remove '\n': last_line = last_line.strip().replace("\n", "") else: - print("empty file") - print(fn) last_line = np.nan else: # e.g., `qw` pending - print("file DNE") - print(fn) last_line = np.nan return last_line diff --git a/tests/e2e-slurm/container/babs-user-script.sh b/tests/e2e-slurm/container/babs-user-script.sh index 60c7c97e..c9bec9d4 100755 --- a/tests/e2e-slurm/container/babs-user-script.sh +++ b/tests/e2e-slurm/container/babs-user-script.sh @@ -71,7 +71,7 @@ done echo "No running jobs." # TODO make sure this works -# Check for failed jobs TODO state filter doesnt seem to be working as expected +# Check for failed jobs TODO state filter doesn't seem to be working as expected # if sacct -u $USER --state=FAILED --noheader | grep -q "FAILED"; then if sacct -u $USER --noheader | grep -q "FAILED"; then sacct -u $USER @@ -97,7 +97,7 @@ echo "babs-status:" babs-status --project_root "${PWD}"/test_project/ echo "=========================================================================" -# Check for failed jobs TODO state filter doesnt seem to be working as expected +# Check for failed jobs TODO see above # if sacct -u $USER --state=FAILED --noheader | grep -q "FAILED"; then if sacct -u $USER --noheader | grep -q "FAILED"; then sacct -u $USER From 3508d42d8e86d11d173e3af68969d5a6eae5c993 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 12 Jan 2024 10:23:36 -0500 Subject: [PATCH 5/6] Add shellcheck and fix --- .github/workflows/shellcheck.yml | 24 +++++++++++++++ tests/e2e-slurm/container/babs-user-script.sh | 30 ++++++++++--------- tests/e2e-slurm/container/ensure-env.sh | 2 ++ tests/e2e-slurm/container/rerun.sh | 6 ---- .../e2e-slurm/container/walkthrough-tests.sh | 2 +- tests/e2e-slurm/main.sh | 1 - 6 files changed, 43 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/shellcheck.yml delete mode 100755 tests/e2e-slurm/container/rerun.sh diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml new file mode 100644 index 00000000..2cc56d4d --- /dev/null +++ b/.github/workflows/shellcheck.yml @@ -0,0 +1,24 @@ +name: Shellcheck scripts + +on: [push, pull_request] + +jobs: + test: + + runs-on: ubuntu-latest + + steps: + - name: Set up system + shell: bash + run: | + sudo apt-get update -qq + sudo apt-get install shellcheck + - uses: actions/checkout@v4 + - name: Run shellcheck + run: | + shellcheck \ + tests/e2e-slurm/container/babs-user-script.sh \ + tests/e2e-slurm/container/ensure-env.sh \ + tests/e2e-slurm/container/walkthrough-tests.sh \ + tests/e2e-slurm/install-babs.sh \ + tests/e2e-slurm/main.sh diff --git a/tests/e2e-slurm/container/babs-user-script.sh b/tests/e2e-slurm/container/babs-user-script.sh index c9bec9d4..166a178b 100755 --- a/tests/e2e-slurm/container/babs-user-script.sh +++ b/tests/e2e-slurm/container/babs-user-script.sh @@ -8,6 +8,8 @@ echo "We are now running as user $(whoami)" echo "DEBUG: MINICONDA_PATH=${MINICONDA_PATH}" echo "DEBUG: TESTDATA=${TESTDATA}" +# without MINICONDA_PATH set, shellcheck cannot follow +# shellcheck disable=SC1091 source "$MINICONDA_PATH/etc/profile.d/conda.sh" conda activate babs @@ -25,7 +27,7 @@ echo "Git email: $(git config user.email)" # TODO switch back to osf project # Populate input data (Divergent from tuturial, bc https://github.com/datalad/datalad-osf/issues/191 -pushd ${TESTDATA} +pushd "${TESTDATA}" echo "Installing Input Data" datalad install ///dbic/QA @@ -33,7 +35,7 @@ datalad install ///dbic/QA datalad create -D "toy BIDS App" toybidsapp-container pushd toybidsapp-container datalad containers-add \ - --url ${PWD}/../toybidsapp-0.0.7.sif \ + --url "${PWD}/../toybidsapp-0.0.7.sif" \ toybidsapp-0-0-7 popd rm -f toybidsapp-0.0.7.sif @@ -61,9 +63,9 @@ echo "Job submitted: Check setup, with job" babs-status --project_root "${PWD}"/test_project/ # Wait for all running jobs to finish -while [[ -n $(squeue -u $USER -t RUNNING,PENDING --noheader) ]]; do - echo "squeue -u $USER -t RUNNING,PENDING" - squeue -u $USER -t RUNNING,PENDING +while [[ -n $(squeue -u "$USER" -t RUNNING,PENDING --noheader) ]]; do + echo "squeue -u \"$USER\" -t RUNNING,PENDING" + squeue -u "$USER" -t RUNNING,PENDING echo "Waiting for running jobs to finish..." sleep 5 # Wait for 60 seconds before checking again done @@ -73,21 +75,21 @@ echo "No running jobs." # TODO make sure this works # Check for failed jobs TODO state filter doesn't seem to be working as expected # if sacct -u $USER --state=FAILED --noheader | grep -q "FAILED"; then -if sacct -u $USER --noheader | grep -q "FAILED"; then - sacct -u $USER +if sacct -u "$USER" --noheader | grep -q "FAILED"; then + sacct -u "$USER" echo "There are failed jobs." exit 1 # Exit with failure status else - sacct -u $USER + sacct -u "$USER" echo "PASSED: No failed jobs." fi babs-submit --project-root "${PWD}/test_project/" # # Wait for all running jobs to finish -while [[ -n $(squeue -u $USER -t RUNNING,PENDING --noheader) ]]; do - echo "squeue -u $USER -t RUNNING,PENDING" - squeue -u $USER -t RUNNING,PENDING +while [[ -n $(squeue -u "$USER" -t RUNNING,PENDING --noheader) ]]; do + echo "squeue -u \"$USER\" -t RUNNING,PENDING" + squeue -u "$USER" -t RUNNING,PENDING echo "Waiting for running jobs to finish..." sleep 5 # Wait for 60 seconds before checking again done @@ -99,13 +101,13 @@ echo "=========================================================================" # Check for failed jobs TODO see above # if sacct -u $USER --state=FAILED --noheader | grep -q "FAILED"; then -if sacct -u $USER --noheader | grep -q "FAILED"; then - sacct -u $USER +if sacct -u "$USER" --noheader | grep -q "FAILED"; then + sacct -u "$USER" echo "=========================================================================" echo "There are failed jobs." exit 1 # Exit with failure status else - sacct -u $USER + sacct -u "$USER" echo "=========================================================================" echo "PASSED: No failed jobs." fi diff --git a/tests/e2e-slurm/container/ensure-env.sh b/tests/e2e-slurm/container/ensure-env.sh index 67550ae1..fee3162b 100755 --- a/tests/e2e-slurm/container/ensure-env.sh +++ b/tests/e2e-slurm/container/ensure-env.sh @@ -3,6 +3,8 @@ # exported for use in inner-slurm.sh if [ -z "${MINICONDA_PATH:-}" ]; then if hash conda; then + # We don't need the return value, we already catch the error + # shellcheck disable=SC2155 export MINICONDA_PATH=$(/bin/which conda | xargs dirname | xargs dirname) else echo "ERROR: must have MINICONDA_PATH set or have 'conda' available" diff --git a/tests/e2e-slurm/container/rerun.sh b/tests/e2e-slurm/container/rerun.sh deleted file mode 100755 index 23b1d254..00000000 --- a/tests/e2e-slurm/container/rerun.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -su "testuser" "rm -rf ${TESTDATA}" -cp /opt/outer/* "${TESTDATA}" - -su "${BABS_USER}" "${TESTDATA}/babs-user-script.sh" diff --git a/tests/e2e-slurm/container/walkthrough-tests.sh b/tests/e2e-slurm/container/walkthrough-tests.sh index 27f8b13a..4c36f436 100755 --- a/tests/e2e-slurm/container/walkthrough-tests.sh +++ b/tests/e2e-slurm/container/walkthrough-tests.sh @@ -34,7 +34,7 @@ for ((i=1; i<=max_retries; i++)); do sleep $delay fi # exit if max retries reached - if [ $i -eq $max_retries ]; then + if [ "$i" -eq "$max_retries" ]; then echo "Failed to start Slurm after $max_retries attempts." exit 1 fi diff --git a/tests/e2e-slurm/main.sh b/tests/e2e-slurm/main.sh index b4b8125d..c50b002b 100755 --- a/tests/e2e-slurm/main.sh +++ b/tests/e2e-slurm/main.sh @@ -20,7 +20,6 @@ TAG=23.11.07 # TODO FQDN_IMAGE=${REGISTRY}/${HUBUSER}/${REPO}:${TAG} THIS_DIR="$(readlink -f "$0" | xargs dirname )" -TESTDATA=/opt/testdata . tests/e2e-slurm/container/ensure-env.sh From da66a12895208341894e88ede317a704c5ec022d Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Tue, 16 Jan 2024 10:47:41 -0500 Subject: [PATCH 6/6] cleanup --- .gitignore | 3 -- Makefile | 10 ++--- babs/babs.py | 4 -- babs/utils.py | 1 - tests/e2e-slurm/container/babs-user-script.sh | 42 +------------------ tests/e2e-slurm/container/ensure-env.sh | 1 - .../e2e-slurm/container/walkthrough-tests.sh | 16 ++----- tests/e2e-slurm/install-babs.sh | 4 +- tests/e2e-slurm/main.sh | 11 ++--- 9 files changed, 13 insertions(+), 79 deletions(-) diff --git a/.gitignore b/.gitignore index cf103c72..de79a6da 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,3 @@ build/ # Distribution / packaging dist/ babs/VERSION - -# e2e testdata -.testdata* diff --git a/Makefile b/Makefile index 03949a2e..a6143630 100644 --- a/Makefile +++ b/Makefile @@ -7,13 +7,9 @@ setup-user: e2e: clean ./tests/e2e-slurm/main.sh -# TODO testdata variable clean: - podman stop slurm 2>/dev/null || true - podman rm slurm 2>/dev/null || true - [ -e .testdata/babs_test_project/toybidsapp-container ] && \ + @ podman stop slurm 2>/dev/null || true + @ podman rm slurm 2>/dev/null || true + @[ -e .testdata/babs_test_project/toybidsapp-container ] && \ datalad remove -d .testdata/babs_test_project/toybidsapp-container --reckless kill || : rm -rf .testdata - -logs: - cat .testdata/ci-logs/* diff --git a/babs/babs.py b/babs/babs.py index 4c9025a2..f9719fe1 100644 --- a/babs/babs.py +++ b/babs/babs.py @@ -2778,14 +2778,10 @@ def generate_job_submit_template(self, yaml_path, babs, system, test=False): env_flags = "-v DSLOCKFILE=" + babs.analysis_path + "/.SGE_datalad_lock" elif system.type == "slurm": submit_head = "sbatch" - # TODO: asmacdo env_flags = "--export=DSLOCKFILE=" + babs.analysis_path + "/.SLURM_datalad_lock" else: warnings.warn("not supporting systems other than sge...") - # TODO: rm asmacdo hack - # env_flags = env_flags + f",MINICONDA_PATH={os.getenv('MINICONDA_PATH')}" - # Check if the bash file already exist: if op.exists(yaml_path): os.remove(yaml_path) # remove it diff --git a/babs/utils.py b/babs/utils.py index f6ca0c50..d8103475 100644 --- a/babs/utils.py +++ b/babs/utils.py @@ -1674,7 +1674,6 @@ def submit_one_test_job(analysis_path, type_system, flag_print_message=True): stdout=subprocess.PIPE) proc_cmd.check_returncode() - print(f"Return code: {proc_cmd.returncode}") msg = proc_cmd.stdout.decode('utf-8') if type_system == "sge": diff --git a/tests/e2e-slurm/container/babs-user-script.sh b/tests/e2e-slurm/container/babs-user-script.sh index 166a178b..1a484a93 100755 --- a/tests/e2e-slurm/container/babs-user-script.sh +++ b/tests/e2e-slurm/container/babs-user-script.sh @@ -4,6 +4,7 @@ SUBPROJECT_NAME=test_project set -eu +echo "==============================================================" echo "We are now running as user $(whoami)" echo "DEBUG: MINICONDA_PATH=${MINICONDA_PATH}" echo "DEBUG: TESTDATA=${TESTDATA}" @@ -113,43 +114,4 @@ else fi babs-merge --project_root "${PWD}"/test_project/ - - -# TODO: we need to fail if there is a failed job -# fi - -# sleep 10 -# babs-status --project_root "${PWD}"/test_project/ -# sleep 10 -# babs-status --project_root "${PWD}"/test_project/ -# sleep 10 -# babs-status --project_root "${PWD}"/test_project/ -# sleep 10 -# babs-status --project_root "${PWD}"/test_project/ -# -# babs-submit --project_root "${PWD}"/test_project/ -# -# babs-status --project_root "${PWD}"/test_project/ -# sleep 30s -# babs-status --project_root "${PWD}"/test_project/ -# -# echo "Print job logs--------------------------------------------" -# find "${PWD}"/test_project/analysis/logs/* -type f -print -exec cat {} \; -# echo "end job logs--------------------------------------------" -# # TODO: babs-check-status-job -# -# # TODO babs-merge -# -# popd -# # /tests/e2e-slurm/babs-tests.sh -# # podman exec \ -# # -e MINICONDA_PATH=${MINICONDA_PATH} \ -# # slurm \ -# # ${PWD}/tests/e2e-slurm/babs-tests.sh -# # -# -# -# echo "--------------------------" -# echo " HUZZZZZZAHHHHHH!!!!!!" -# echo "--------------------------" -# +echo "PASSED: e2e walkthrough successful!" diff --git a/tests/e2e-slurm/container/ensure-env.sh b/tests/e2e-slurm/container/ensure-env.sh index fee3162b..352ed3a7 100755 --- a/tests/e2e-slurm/container/ensure-env.sh +++ b/tests/e2e-slurm/container/ensure-env.sh @@ -1,6 +1,5 @@ #!/bin/bash # -# exported for use in inner-slurm.sh if [ -z "${MINICONDA_PATH:-}" ]; then if hash conda; then # We don't need the return value, we already catch the error diff --git a/tests/e2e-slurm/container/walkthrough-tests.sh b/tests/e2e-slurm/container/walkthrough-tests.sh index 4c36f436..1c57f2d9 100755 --- a/tests/e2e-slurm/container/walkthrough-tests.sh +++ b/tests/e2e-slurm/container/walkthrough-tests.sh @@ -1,21 +1,13 @@ #!/bin/bash -i - +# Here we perform all actions that must be done as root inside the container and then +# execute the walkthrough as BABS_USER set -eu -# add that outside user -# groupadd --gid "$GID" "$USER" && useradd --uid $UID --gid "$GID" "$USER" - -# Install singularity inside the container -yum update -y && yum install -y epel-release && yum update -y && yum install -y singularity-runtime apptainer -# -# git version -# git config user.name > /dev/null || git config --system user.name "e2e slurm" -# git config user.email > /dev/null || git config --system user.email "fake@example.com" -# git config --system --add safe.directory '*' - export TESTDATA=/opt/testdata BABS_USER=testuser +# Install singularity inside the container +yum update -y && yum install -y epel-release && yum update -y && yum install -y singularity-runtime apptainer # Wait for slurm to be up max_retries=10 diff --git a/tests/e2e-slurm/install-babs.sh b/tests/e2e-slurm/install-babs.sh index b9531fc8..a0674743 100755 --- a/tests/e2e-slurm/install-babs.sh +++ b/tests/e2e-slurm/install-babs.sh @@ -10,6 +10,4 @@ conda install -c conda-forge datalad git git-annex -y pip install datalad_container pip install datalad-osf -# TODO non-dynamic for prod -# pip install . -pip install -e . +pip install . diff --git a/tests/e2e-slurm/main.sh b/tests/e2e-slurm/main.sh index c50b002b..f4e35a4b 100755 --- a/tests/e2e-slurm/main.sh +++ b/tests/e2e-slurm/main.sh @@ -21,6 +21,7 @@ TAG=23.11.07 # TODO FQDN_IMAGE=${REGISTRY}/${HUBUSER}/${REPO}:${TAG} THIS_DIR="$(readlink -f "$0" | xargs dirname )" +# Sets MINICONDA_PATH . tests/e2e-slurm/container/ensure-env.sh if [ "$MINICONDA_PATH/envs/$CONDA_DEFAULT_ENV/bin/babs-init" != "$(which babs-init)" ]; then @@ -29,12 +30,9 @@ if [ "$MINICONDA_PATH/envs/$CONDA_DEFAULT_ENV/bin/babs-init" != "$(which babs-in exit 1 fi -stop_container () { - podman stop slurm || true -} - echo "Success, we are in the conda env with babs-init!" - # Because babs is dev-installed from here. TODO: we can remove if we remove -e from pip install + +# PWD shared so babs can be optionally be installed with develop install podman run -it --rm \ --name slurm \ --hostname slurmctl \ @@ -45,6 +43,3 @@ podman run -it --rm \ -v "${THIS_DIR}/container:/opt/outer:ro,Z" \ "${FQDN_IMAGE}" \ /bin/bash -c ". /opt/outer/walkthrough-tests.sh" - - #/bin/bash -c ". /opt/outer/walkthrough-tests.sh && bash" # TODO remove, for debug only -# trap stop_container EXIT