From 9aedae197f00425ed1fb84c84ebff8257373d58a Mon Sep 17 00:00:00 2001 From: Oksana Shadura Date: Tue, 12 Dec 2023 13:49:06 +0100 Subject: [PATCH 1/2] Update distributed settings (inspired by rwth-aachen) --- docker/Dockerfile.cc-ubuntu | 154 ------------------------------------ docker/dask/dask.yaml | 17 ++++ 2 files changed, 17 insertions(+), 154 deletions(-) delete mode 100644 docker/Dockerfile.cc-ubuntu diff --git a/docker/Dockerfile.cc-ubuntu b/docker/Dockerfile.cc-ubuntu deleted file mode 100644 index a25e3508..00000000 --- a/docker/Dockerfile.cc-ubuntu +++ /dev/null @@ -1,154 +0,0 @@ -ARG TAG="development" -ARG PROJECT="coffea-casa" -ARG NAME="${PROJECT}/cc-base-ubuntu" -ARG REGISTRY="hub.opensciencegrid.org" -FROM ${REGISTRY}/${NAME}:${TAG} - -# https://github.com/jupyter/docker-stacks/blob/master/base-notebook/Dockerfile - -# Fix DL4006 -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -USER root -LABEL maintainer="Oksana Shadura " -# Tag -# Tag -ARG TAG -ARG PROJECT -ARG REGISTRY -ARG DEV -ARG WORKER_IMAGE="${REGISTRY}/${PROJECT}/cc-analysis-ubuntu" -# Secrets -ARG CERT_DIR="/etc/cmsaf-secrets" -# FIX ME AFTER TEST: -ARG BEARER_TOKEN_FILE="/etc/cmsaf-secrets-chown/access_token" -# Configure Labextention Dask Cluster factory -ARG DASK_ROOT_CONFIG="/opt/dask" -ARG LABEXTENTION_CLUSTER="UNL HTCondor Cluster" -ARG LABEXTENTION_FACTORY_CLASS="CoffeaCasaCluster" -ARG LABEXTENTION_FACTORY_MODULE="coffea_casa" -# Condor settings -ARG CONDOR_HOST="red-condor1.unl.edu" -ARG COLLECTOR_NAME="Nebraska T2" -ARG UID_DOMAIN="unl.edu" -ARG SCHEDD_HOST="t3.unl.edu" -# XCACHE -ARG XCACHE_HOST="red-xcache1.unl.edu" - -# Hack for GH Actions -ARG GITHUB_ACTIONS="false" - -# Configure environment -ENV CERT_DIR=$CERT_DIR -ENV TAG=$TAG -ENV XCACHE_HOST=$XCACHE_HOST -ENV WORKER_IMAGE=$WORKER_IMAGE -ENV BEARER_TOKEN_FILE=$BEARER_TOKEN_FILE -ENV DASK_ROOT_CONFIG=$DASK_ROOT_CONFIG -ENV LABEXTENTION_CLUSTER=$LABEXTENTION_CLUSTER -ENV LABEXTENTION_FACTORY_CLASS=$LABEXTENTION_FACTORY_CLASS -ENV LABEXTENTION_FACTORY_MODULE=$LABEXTENTION_FACTORY_MODULE -ENV CONDOR_HOST=$CONDOR_HOST -ENV COLLECTOR_NAME=$COLLECTOR_NAME -ENV UID_DOMAIN=$UID_DOMAIN -ENV SCHEDD_HOST=$SCHEDD_HOST - -USER ${NB_USER} -RUN pip install --no-cache-dir \ - correctionlib \ - funcx \ - pyyaml \ - # visualization - Shapely==1.8.1.post1 \ - descartes==1.1.0 \ - # JSON processor - jq \ - # ML packages - dask-ml \ - prometheus_client \ - comm>=0.1.2 \ - mlflow - -RUN if [[ -z "$DEV" ]] ; then pip install --no-cache-dir coffea_casa -U ; else pip install --no-cache-dir git+https://github.com/CoffeaTeam/coffea-casa.git#egg=coffea_casa; fi - -# ------- xrootd-xcache-plugin ------------------------------- -RUN cd /tmp && \ - git clone -b xcache https://github.com/jthiltges/xrdcl-authz-plugin.git && \ - cd xrdcl-authz-plugin && \ - mkdir build && \ - cd build && \ - cmake /tmp/xrdcl-authz-plugin -DCMAKE_INSTALL_PREFIX=${CONDA_DIR} && \ - make && \ - make install - -ENV XRD_PLUGINCONFDIR="${CONDA_DIR}/etc/xrootd/client.plugins.d/" -ENV XRD_PLUGIN="${CONDA_DIR}/lib/libXrdClXcachePlugin-5.so" - -# REMOVE THIS BLOCK AFTER TEST: -# ------- xrootd-authz-plugin ------------------------------- -#RUN cd /tmp && \ -# # ------- xrdcl-authz-plugin ------------------------------- -# git clone https://github.com/bbockelm/xrdcl-authz-plugin.git && \ -# cd xrdcl-authz-plugin && \ -# mkdir build && \ -# cd build && \ -# cmake /tmp/xrdcl-authz-plugin -DCMAKE_INSTALL_PREFIX=${CONDA_DIR} && \ -# make && \ -# make install && \ -# ln -s ${CONDA_DIR}/lib/libXrdClAuthzPlugin-5.so ${CONDA_DIR}/lib/libXrdClAuthzPlugin.so - -#ENV XRD_PLUGINCONFDIR="${CONDA_DIR}/etc/xrootd/client.plugins.d/" -#ENV XRD_PLUGIN="${CONDA_DIR}/lib/libXrdClAuthzPlugin.so" -# Patching uproot (broken xrootd-authz-plugin with xrootd > 5.2.0 and uproot.MultithreadedXRootDSource) -#COPY uproot/uproot_xrd_source.patch /opt/conda/lib/python3.8/site-packages/uproot -#RUN cd /opt/conda/lib/python3.8/site-packages/uproot && patch < uproot_xrd_source.patch - -RUN chmod 755 /etc/grid-security/certificates -COPY certs/hcc-flatiron.pem /etc/grid-security/certificates/ -RUN ln -s /etc/grid-security/certificates/hcc-flatiron.pem /etc/grid-security/certificates/80d1fda9.0 - -# Coffea_casa - > jobqueue-coffea-casa.yaml -COPY dask/jobqueue-coffea-casa.yaml dask/dask_tls.yaml ${DASK_ROOT_CONFIG}/ - -USER root - -# REMOVE ME AFTER TEST: -# Patching uproot (broken xrootd-authz-plugin with xrootd > 5.2.0 and uproot.MultithreadedXRootDSource) -#COPY uproot/uproot_xrd_source.patch /opt/conda/lib/python3.8/site-packages/uproot -#RUN cd /opt/conda/lib/python3.8/site-packages/uproot && patch < uproot_xrd_source.patch - -# Distributed: we need to install patched version of distributed version -COPY dask/distributed ${CONDA_DIR}/lib/python3.9/site-packages/distributed -RUN cd ${CONDA_DIR}/lib/python3.9/site-packages/distributed && \ - patch -p2 < 0001-Patch-from-bbockelman-adaptive-scaling.patch && \ - patch -p2 < 0002-Allow-scheduler-to-preserve-worker-hostnames.patch && \ - #patch -p2 < 0003-Activate-patch.patch && \ - #patch -p2 < 0004-Add-possibility-to-setup-external_adress-for-schedul.patch && \ - #patch -p2 < 0005-Add-nanny-patch.patch - -# Cleanup -RUN rm -rf /tmp/* \ - && rm -rf $HOME/.cache/.pip/* \ - && mamba clean --all -f -y \ - && jupyter lab clean \ - && jlpm cache clean \ - && npm cache clean --force \ - && find ${CONDA_DIR} -type f -name '*.a' -delete \ - && find ${CONDA_DIR} -type f -name '*.pyc' -delete \ - && find ${CONDA_DIR} -type f -name '*.js.map' -delete \ - && (find ${CONDA_DIR}/lib/python*/site-packages/bokeh/server/static -type f,l -name '*.js' -not -name '*.min.js' -delete || echo "no bokeh static files to cleanup") \ - && rm -rf ${CONDA_DIR}/pkgs - -# FIXME: add better layering for preparation of env -ADD prepare-env/prepare-env-cc.sh /usr/local/bin/prepare-env.sh -RUN chmod ugo+x /usr/local/bin/prepare-env.sh - -# Switch back to cms-jovyan to avoid accidental container runs as root -USER ${NB_UID} -WORKDIR $HOME -ENTRYPOINT ["tini", "-g", "--", "/usr/local/bin/prepare-env.sh"] - -# Extra packages to be installed (apt, pip, conda) and commands to be executed -# Use bash login shell for entrypoint in order -# to automatically source user's .bashrc -CMD ["start-notebook.sh"] diff --git a/docker/dask/dask.yaml b/docker/dask/dask.yaml index 3f289309..18741428 100644 --- a/docker/dask/dask.yaml +++ b/docker/dask/dask.yaml @@ -1,4 +1,21 @@ distributed: + scheduler: + allowed-failures: 10 + bandwidth: 1000000000 + work-stealing: False + worker: + memory: + target: 0.7 + spill: 0.9 + pause: 0.92 + terminate: 0 + profile: + interval: 1d + cycle: 2d + low-level: False + diagnostics: + nvml: False + version: 2 dashboard: From 21907bc4dc4a0d792472d1d8ba3a5852e5987cf3 Mon Sep 17 00:00:00 2001 From: Oksana Shadura Date: Tue, 12 Dec 2023 13:49:57 +0100 Subject: [PATCH 2/2] Trying with enabled nanny back --- docker/k8s-worker/supervisord.conf | 2 +- docker/prepare-env/prepare-env-cc-analysis.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/k8s-worker/supervisord.conf b/docker/k8s-worker/supervisord.conf index 1650f125..7fe1ff31 100644 --- a/docker/k8s-worker/supervisord.conf +++ b/docker/k8s-worker/supervisord.conf @@ -69,7 +69,7 @@ serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket ; Adding (ENV_WORKER_ID) should fixes https://github.com/CoffeaTeam/jhub/issues/37 ; Add later: --nanny-port 8001 --nanny-contact-address tls://%(ENV_WORKER_IP)s:8001 [program:dask-worker] -command=/bin/bash -c "dask-worker tls://%(ENV_HOST_IP)s:8786 --nthreads 2 --tls-ca-file %(ENV_CERT_DIR)s/..data/ca.pem --tls-cert %(ENV_CERT_DIR)s/..data/hostcert.pem --tls-key %(ENV_CERT_DIR)s/..data/hostcert.pem --nanny --listen-address tls://0.0.0.0:8788 --name kubernetes-worker-%(ENV_WORKER_ID)s --contact-address tls://%(ENV_WORKER_IP)s:8788" +command=/bin/bash -c "dask-worker tls://%(ENV_HOST_IP)s:8786 --nthreads 1 --tls-ca-file %(ENV_CERT_DIR)s/..data/ca.pem --tls-cert %(ENV_CERT_DIR)s/..data/hostcert.pem --tls-key %(ENV_CERT_DIR)s/..data/hostcert.pem --nanny --listen-address tls://0.0.0.0:8788 --name kubernetes-worker-%(ENV_WORKER_ID)s --contact-address tls://%(ENV_WORKER_IP)s:8788" startretries=100 autostart=true autorestart=true diff --git a/docker/prepare-env/prepare-env-cc-analysis.sh b/docker/prepare-env/prepare-env-cc-analysis.sh index 03b75bc8..fcd66734 100644 --- a/docker/prepare-env/prepare-env-cc-analysis.sh +++ b/docker/prepare-env/prepare-env-cc-analysis.sh @@ -142,7 +142,7 @@ if [[ ! -v COFFEA_CASA_SIDECAR ]]; then --tls-key $FILE_KEY \ --nthreads $CPUS \ --memory-limit $MEMORY_MB_FORMATTED \ - --no-nanny \ + --nanny \ --death-timeout 60 \ --protocol tls \ --lifetime 7200 \