Reviewer notes (free text ahead of the first "diff --git" header; patch tools skip it):
- dpkg_hold_with_retry now returns 1 once every attempt has failed, so that
  `set -e` aborts the script instead of continuing without the hold. The
  previous loop ended on `sleep 6` and therefore always returned 0.
- The `cmd && break || { ...; }` chain is replaced by an explicit `if`, the
  loop counter is local, and ${MOCK_PKG} is quoted where it is expanded.
- NOTE(review): this patch was re-expanded from a whitespace-collapsed copy.
  Indentation inside the hunks and the position of the blank context line in
  the second hunk are assumptions -- confirm against the target files.
- NOTE(review): the "index" lines are carried over unchanged; their post-image
  hashes no longer match the new content.

diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh
index 010f325f..ea12ba6f 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/hold-lustre-client.sh
@@ -2,5 +2,25 @@
 
 set -exuo pipefail
 
+# Put a package on hold, retrying while the dpkg frontend is locked.
+# Arguments: $1 - name of the package to hold
+# Returns:   0 once the hold is recorded, 1 if every attempt failed
+dpkg_hold_with_retry() {
+    local pkg=$1
+    local -r max_attempts=21 delay=6
+    local attempt
+    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
+        if echo "${pkg} hold" | sudo dpkg --set-selections; then
+            return 0
+        fi
+        # Skip the pointless sleep after the final attempt.
+        (( attempt < max_attempts )) || break
+        echo To retry...
+        sleep "${delay}"
+    done
+    echo "Failed to hold ${pkg} after ${max_attempts} attempts" >&2
+    return 1
+}
+
 # Don't let new lustre client module brings in new kernel.
-echo "lustre-client-modules-aws hold" | sudo dpkg --set-selections
+dpkg_hold_with_retry lustre-client-modules-aws
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh
index 10774032..86a39d5f 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/hotfix/mock-gpu-driver-deb.sh
@@ -14,5 +14,26 @@ apt-cache show ${MOCK_PKG}=${DRV_VERSION}-0ubuntu1 \
     &> ${MOCK_PKG}
 
 equivs-build ${MOCK_PKG}
-dpkg -i ${MOCK_PKG}_*.deb
-echo "${MOCK_PKG} hold" | sudo dpkg --set-selections
+apt install -y -o DPkg::Lock::Timeout=120 ./"${MOCK_PKG}"_*.deb
+
+# Put a package on hold, retrying while the dpkg frontend is locked.
+# Arguments: $1 - name of the package to hold
+# Returns:   0 once the hold is recorded, 1 if every attempt failed
+dpkg_hold_with_retry() {
+    local pkg=$1
+    local -r max_attempts=21 delay=6
+    local attempt
+    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
+        if echo "${pkg} hold" | sudo dpkg --set-selections; then
+            return 0
+        fi
+        # Skip the pointless sleep after the final attempt.
+        (( attempt < max_attempts )) || break
+        echo To retry...
+        sleep "${delay}"
+    done
+    echo "Failed to hold ${pkg} after ${max_attempts} attempts" >&2
+    return 1
+}
+dpkg_hold_with_retry "${MOCK_PKG}"
+