Skip to content

Commit

Permalink
Merge pull request #169 from asmacdo/e2e-slurm
Browse files Browse the repository at this point in the history
Introducing e2e slurm tests
  • Loading branch information
mattcieslak authored Feb 28, 2024
2 parents a637627 + da66a12 commit a9c1aa7
Show file tree
Hide file tree
Showing 12 changed files with 342 additions and 8 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/e2e-slurm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
# End-to-end test workflow: installs BABS into a conda environment and runs
# the e2e walkthrough script, which exercises BABS against SLURM.
name: Slurm

on: [push]

jobs:
  e2e-slurm:
    name: Test e2e with SLURM
    runs-on: ubuntu-latest
    steps:
      - name: checkout our repo
        uses: actions/checkout@v4
      - name: Install apptainer
        uses: eWaterCycle/setup-apptainer@v2
        with:
          apptainer-version: "1.1.2"
      - name: Install Conda
        uses: conda-incubator/setup-miniconda@v3
        with:
          activate-environment: babs
          auto-update-conda: true
          # Quoted so the version stays a string; an unquoted value such as
          # 3.10 would be loaded as the float 3.1.
          python-version: "3.9"
      # The remaining steps use a login shell (`bash -el`), which
      # setup-miniconda documents as required for the environment it
      # created to be activated.
      - name: Conda info
        shell: bash -el {0}
        run: conda info
      - name: Install Babs
        shell: bash -el {0}
        run: ./tests/e2e-slurm/install-babs.sh
      - name: Execute e2e with SLURM
        shell: bash -el {0}
        run: ./tests/e2e-slurm/main.sh
24 changes: 24 additions & 0 deletions .github/workflows/shellcheck.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
---
# Lint the e2e-slurm shell scripts with shellcheck on every push and pull request.
name: Shellcheck scripts

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Set up system
        shell: bash
        run: |
          sudo apt-get update -qq
          # -y: CI has no terminal to answer a confirmation prompt.
          sudo apt-get install -y shellcheck
      - uses: actions/checkout@v4
      - name: Run shellcheck
        run: |
          shellcheck \
            tests/e2e-slurm/container/babs-user-script.sh \
            tests/e2e-slurm/container/ensure-env.sh \
            tests/e2e-slurm/container/walkthrough-tests.sh \
            tests/e2e-slurm/install-babs.sh \
            tests/e2e-slurm/main.sh
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ build/

# Distribution / packaging
dist/
babs/VERSION
babs/VERSION
15 changes: 15 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Convenience targets for the e2e SLURM tests.
# None of these targets creates a file named after itself, so declare them
# phony: a stray file called `install` or `clean` must not disable the recipe.
.PHONY: install setup-user e2e clean

# Install BABS and the e2e test dependencies.
install:
	./tests/e2e-slurm/install-babs.sh

setup-user:
	./tests/e2e-slurm/setup-user.sh

# Run the end-to-end test, always starting from a clean state.
e2e: clean
	./tests/e2e-slurm/main.sh

# Stop and remove the `slurm` container and delete the test data.
# Failures of the podman commands (e.g. no such container) are ignored.
clean:
	@ podman stop slurm 2>/dev/null || true
	@ podman rm slurm 2>/dev/null || true
	@[ -e .testdata/babs_test_project/toybidsapp-container ] && \
		datalad remove -d .testdata/babs_test_project/toybidsapp-container --reckless kill || :
	rm -rf .testdata
8 changes: 7 additions & 1 deletion babs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1685,7 +1685,13 @@ def submit_one_test_job(analysis_path, type_system, flag_print_message=True):
# e.g., on MIT OpenMind: no 1st line from MSI; only 2nd line.
else:
raise Exception("type system can be slurm or sge")
job_id = int(job_id_str)

# This is necessary because SLURM commands can fail but still have return code 0
try:
job_id = int(job_id_str)
except ValueError as e:
raise ValueError(f"Cannot convert {job_id_str!r} into an int: {e}. "
f"That output is a result of running command {cmd} which produced output {msg}.")

# log filename:
log_filename = job_name + ".*" + job_id_str
Expand Down
13 changes: 7 additions & 6 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,18 @@ classifiers =
[options]
python_requires = >=3.7
install_requires =
backoff
datalad >= 0.17.2
datalad_container >= 1.1.6
filelock >= 3.8.0
nibabel >=2.2.1
numpy
pandas
tqdm
pyyaml >= 6.0
#ruamel.yaml >= 0.17.21
datalad >= 0.17.2
datalad_container >= 1.1.6
regex
filelock >= 3.8.0
qstat >= 0.0.5
regex
#ruamel.yaml >= 0.17.21
tqdm
packages = find:
include_package_data = True

Expand Down
117 changes: 117 additions & 0 deletions tests/e2e-slurm/container/babs-user-script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/bin/bash -i
# End-to-end BABS walkthrough, run as the unprivileged test user.
# Reads MINICONDA_PATH and TESTDATA from the environment.

SUBPROJECT_NAME=test_project
# Seconds to sleep between two squeue polls while waiting for jobs.
POLL_INTERVAL=5

set -eu

# Block until the current user has no RUNNING or PENDING jobs left in the queue.
wait_for_jobs() {
    while [[ -n $(squeue -u "$USER" -t RUNNING,PENDING --noheader) ]]; do
        echo "squeue -u \"$USER\" -t RUNNING,PENDING"
        squeue -u "$USER" -t RUNNING,PENDING
        echo "Waiting for running jobs to finish..."
        sleep "$POLL_INTERVAL"
    done
}

# Print the accounting table of the current user, then exit 1 if any line of it
# contains "FAILED".
#   $1 (optional): separator line printed between the table and the verdict.
# TODO make sure this works: the sacct state filter doesn't seem to be working
# as expected, so the whole table is grepped instead of using
#   sacct -u $USER --state=FAILED --noheader
check_failed_jobs() {
    local separator="${1:-}"
    if sacct -u "$USER" --noheader | grep -q "FAILED"; then
        sacct -u "$USER"
        [ -z "$separator" ] || echo "$separator"
        echo "There are failed jobs."
        exit 1 # Exit with failure status
    else
        sacct -u "$USER"
        [ -z "$separator" ] || echo "$separator"
        echo "PASSED: No failed jobs."
    fi
}

echo "=============================================================="
echo "We are now running as user $(whoami)"
echo "DEBUG: MINICONDA_PATH=${MINICONDA_PATH}"
echo "DEBUG: TESTDATA=${TESTDATA}"

# without MINICONDA_PATH set, shellcheck cannot follow
# shellcheck disable=SC1091
source "$MINICONDA_PATH/etc/profile.d/conda.sh"
conda activate babs

# Record the miniconda path so it can be added to the test env (slurm jobs do not preserve env).
# The heredoc is unquoted on purpose: $MINICONDA_PATH is expanded when the file is written.
cat > /home/"$USER"/miniconda.env << EOF
. "$MINICONDA_PATH/etc/profile.d/conda.sh"
EOF

git config --global user.name "e2e testuser"
git config --global user.email "[email protected]"
echo "Git user: $(git config user.name)"
echo "Git email: $(git config user.email)"

# TODO switch back to osf project
# Populate input data (divergent from the tutorial because of
# https://github.com/datalad/datalad-osf/issues/191)
pushd "${TESTDATA}"
echo "Installing Input Data"
datalad install ///dbic/QA

# Singularity image created by root, then chowned to this user, and datalad must be run as this user
datalad create -D "toy BIDS App" toybidsapp-container
pushd toybidsapp-container
datalad containers-add \
    --url "${PWD}/../toybidsapp-0.0.7.sif" \
    toybidsapp-0-0-7
popd
rm -f toybidsapp-0.0.7.sif

# Root of the BABS project created by babs-init below (we are in ${TESTDATA} here).
PROJECT_ROOT="${PWD}/${SUBPROJECT_NAME}/"

# TODO file an issue: --where_project must be an absolute path; relative paths do not work
babs-init \
    --where_project "${PWD}" \
    --project_name "$SUBPROJECT_NAME" \
    --input BIDS "${PWD}"/QA \
    --container_ds "${PWD}"/toybidsapp-container \
    --container_name toybidsapp-0-0-7 \
    --container_config_yaml_file "${PWD}"/config_toybidsapp.yaml \
    --type_session multi-ses \
    --type_system slurm

echo "PASSED: babs-init"
echo "Check setup, without job"
babs-check-setup --project_root "${PROJECT_ROOT}"
echo "PASSED: Check setup, without job"

babs-check-setup --project_root "${PROJECT_ROOT}" --job-test
echo "Job submitted: Check setup, with job"

babs-status --project_root "${PROJECT_ROOT}"

# Wait for the test job to finish
wait_for_jobs

echo "No running jobs."

check_failed_jobs

# NOTE(review): this is the only call spelled `--project-root`; every other
# call uses `--project_root`. Confirm that babs-submit accepts this spelling.
babs-submit --project-root "${PROJECT_ROOT}"

# Wait for the submitted jobs to finish
wait_for_jobs

echo "========================================================================="
echo "babs-status:"
babs-status --project_root "${PROJECT_ROOT}"
echo "========================================================================="

check_failed_jobs "========================================================================="

babs-merge --project_root "${PROJECT_ROOT}"
echo "PASSED: e2e walkthrough successful!"
21 changes: 21 additions & 0 deletions tests/e2e-slurm/container/config_toybidsapp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Arguments passed to `singularity run`:
singularity_run:
  --no-zipped: ''
  --dummy: '2'
  -v: ''

# Output folder name(s) to be zipped, each mapped to the BIDS App version
# that goes into the zip filename(s):
zip_foldernames:
  toybidsapp: '0-0-7'

# Cluster resources requested for each job:
cluster_resources:
  interpreting_shell: /bin/bash
  hard_memory_limit: 2G

# Commands placed at the start of the job script. ~/miniconda.env is written
# by babs-user-script.sh and sources conda's shell setup.
script_preamble: |
  . ~/miniconda.env
  conda activate babs

# Where to run the jobs:
job_compute_space: '/tmp'
12 changes: 12 additions & 0 deletions tests/e2e-slurm/container/ensure-env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
#
# Make sure MINICONDA_PATH is set and exported. This file is meant to be sourced.
# If the caller did not provide MINICONDA_PATH, derive it from the `conda`
# executable found on PATH: the installation root is taken to be two directory
# levels above that executable.
# Exits with status 1 if neither MINICONDA_PATH nor `conda` is available.
if [ -z "${MINICONDA_PATH:-}" ]; then
    # `type -P` is a bash builtin that searches PATH only, so it does not
    # depend on where `which` is installed and ignores shell functions.
    conda_exe="$(type -P conda || true)"
    if [ -n "$conda_exe" ]; then
        # Nested dirname keeps paths containing whitespace intact.
        MINICONDA_PATH="$(dirname "$(dirname "$conda_exe")")"
        export MINICONDA_PATH
    else
        echo "ERROR: must have MINICONDA_PATH set or have 'conda' available" >&2
        exit 1
    fi
    # Do not leak the helper variable into the sourcing shell.
    unset conda_exe
fi
50 changes: 50 additions & 0 deletions tests/e2e-slurm/container/walkthrough-tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash -i
# Root-side driver of the walkthrough: do everything that needs root inside the
# container, then hand over to BABS_USER for the actual walkthrough.
set -eu

export TESTDATA=/opt/testdata
BABS_USER=testuser

# Install singularity inside the container
yum update -y \
    && yum install -y epel-release \
    && yum update -y \
    && yum install -y singularity-runtime apptainer

# Wait for slurm to be up
retry_limit=10
retry_pause=10 # seconds

echo "Try connecting to slurm with sacct until it succeeds"
# Failures of sacct are expected until slurm has started, so errexit is
# switched off for the duration of the polling loop.
set +e
export PATH=${PWD}/tests/e2e-slurm/bin/:${PATH}
attempt=1
until sacct > /dev/null; do
    echo "Waiting for Slurm to start... retry $attempt/$retry_limit"
    sleep "$retry_pause"
    # Give up once the last allowed attempt has failed
    if [ "$attempt" -eq "$retry_limit" ]; then
        echo "Failed to start Slurm after $retry_limit attempts."
        exit 1
    fi
    attempt=$((attempt + 1))
done
echo "Slurm is up and running!"
set -e

# We are still root inside the container: create the user that will own the testdata
useradd "${BABS_USER}"
# Copy instead of using the bind mount directly, so the files can be owned by
# the container user without causing issues outside the container
mkdir "${TESTDATA}"
cp /opt/outer/* "${TESTDATA}"


# Build the singularity image now, as root; it is used later as the test user
pushd "${TESTDATA}"
singularity build toybidsapp-0.0.7.sif docker://pennlinc/toy_bids_app:0.0.7

chown -R "${BABS_USER}:${BABS_USER}" "${TESTDATA}"
su "${BABS_USER}" "${TESTDATA}/babs-user-script.sh"
13 changes: 13 additions & 0 deletions tests/e2e-slurm/install-babs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Install BABS from the current directory, together with the dependencies the
# e2e-slurm tests need, using conda and pip.

set -o errexit -o nounset

# Sets MINICONDA_PATH (or exits if conda cannot be located)
source tests/e2e-slurm/container/ensure-env.sh

conda install --yes --channel conda-forge datalad git git-annex

# Optional dependencies, required for e2e-slurm
for extra_pkg in datalad_container datalad-osf; do
    pip install "$extra_pkg"
done

pip install .
45 changes: 45 additions & 0 deletions tests/e2e-slurm/main.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
#
# Run the e2e walkthrough inside a podman container started from a SLURM image.
set -eux

# Expects: Conda env to be activated
# Expects: Babs to be installed
#
# WIP-NOT-WORKING
# Reminder :Z for selinux

# TODO switch back to upstream after build
# Currently using asmacdo, OpenSSL bump upstream, but no new docker build
# https://github.com/giovtorres/docker-centos7-slurm/pull/49
REGISTRY=docker.io
HUBUSER=asmacdo
# HUBUSER=giovtorres
REPO=centos7-slurm
# REPO=docker-centos7-slurm
TAG=23.11.07 # TODO

FQDN_IMAGE=${REGISTRY}/${HUBUSER}/${REPO}:${TAG}
# Directory containing this script; nested quoting keeps paths with spaces intact.
THIS_DIR="$(dirname "$(readlink -f "$0")")"

# Sets MINICONDA_PATH
. tests/e2e-slurm/container/ensure-env.sh

# CONDA_DEFAULT_ENV is defaulted to empty so that, with `set -u`, a missing
# conda env produces the explanatory error below instead of an
# "unbound variable" abort.
if [ "$MINICONDA_PATH/envs/${CONDA_DEFAULT_ENV:-}/bin/babs-init" != "$(command -v babs-init)" ]; then
    echo "Error: This script expects to be run inside a conda env with 'babs-init'!" >&2
    echo "       We have not found it in conda env '${CONDA_DEFAULT_ENV:-}' under '$MINICONDA_PATH'" >&2
    exit 1
fi

echo "Success, we are in the conda env with babs-init!"

# PWD is shared so babs can optionally be installed with a develop install.
# The container directory is mounted at /opt/outer, from where the walkthrough is sourced.
podman run -it --rm \
    --name slurm \
    --hostname slurmctl \
    -e "MINICONDA_PATH=${MINICONDA_PATH}" \
    --privileged \
    -v "${PWD}:${PWD}:ro,Z" \
    -v "${MINICONDA_PATH}:${MINICONDA_PATH}:Z" \
    -v "${THIS_DIR}/container:/opt/outer:ro,Z" \
    "${FQDN_IMAGE}" \
    /bin/bash -c ". /opt/outer/walkthrough-tests.sh"

0 comments on commit a9c1aa7

Please sign in to comment.