From 22c5858ece29b470fdfb35648c2937d10835063b Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Wed, 18 Dec 2024 12:52:03 +0100
Subject: [PATCH] Add `uv` to install `pip` dependencies faster

This commit also contains some formatting improvements that make the
`Dockerfile` easier to debug, such as indenting the continuation lines
of multi-line commands so that they clearly belong to the unindented
command above; it also sets bash as the default shell and fixes the
`gcloud` CLI installation.
---
 .../transformers/4.47.0/py311/Dockerfile      | 92 +++++++++++--------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/containers/pytorch/training/gpu/2.3.0/transformers/4.47.0/py311/Dockerfile b/containers/pytorch/training/gpu/2.3.0/transformers/4.47.0/py311/Dockerfile
index bcd5f728..d241f282 100644
--- a/containers/pytorch/training/gpu/2.3.0/transformers/4.47.0/py311/Dockerfile
+++ b/containers/pytorch/training/gpu/2.3.0/transformers/4.47.0/py311/Dockerfile
@@ -1,4 +1,5 @@
 FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 
 LABEL maintainer="Hugging Face"
 ARG DEBIAN_FRONTEND=noninteractive
@@ -20,64 +21,76 @@ ARG SENTENCE_TRANSFORMERS="3.3.1"
 ARG DEEPSPEED="0.16.1"
 ARG MAX_JOBS=4
 
-RUN apt-get update && \
+RUN apt-get update -y && \
     apt-get install software-properties-common -y && \
     add-apt-repository ppa:deadsnakes/ppa && \
     apt-get -y upgrade --only-upgrade systemd openssl cryptsetup && \
     apt-get install -y \
-    build-essential \
-    bzip2 \
-    curl \
-    git \
-    git-lfs \
-    tar \
-    gcc \
-    g++ \
-    cmake \
-    libprotobuf-dev \
-    libaio-dev \
-    protobuf-compiler \
-    python3.11 \
-    python3.11-dev \
-    libsndfile1-dev \
-    ffmpeg && \
+        build-essential \
+        bzip2 \
+        curl \
+        git \
+        git-lfs \
+        tar \
+        gcc \
+        g++ \
+        cmake \
+        gnupg \
+        libprotobuf-dev \
+        libaio-dev \
+        protobuf-compiler \
+        python3.11 \
+        python3.11-dev \
+        libsndfile1-dev \
+        ffmpeg && \
     apt-get clean autoremove --yes && \
-    rm -rf /var/lib/{apt,dpkg,cache,log}
+    rm -rf /var/lib/apt/lists/*
 
 # Set Python 3.11 as the default python version
 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
     ln -sf /usr/bin/python3.11 /usr/bin/python
 
-# Install pip from source
+# Bootstrap pip with the get-pip.py script, then upgrade it
 RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
-    rm get-pip.py
+    rm get-pip.py && \
+    pip install --upgrade pip
+
+# Download the latest uv installer script (the URL carries no version, so it is not pinned)
+ADD https://astral.sh/uv/install.sh /uv-installer.sh
+
+# Run the installer, then delete the installer script
+RUN sh /uv-installer.sh && rm /uv-installer.sh
+
+# Ensure the installed binary is on the `PATH`, and use system's Python as default
+ENV PATH="/root/.local/bin/:$PATH" \
+    UV_SYSTEM_PYTHON=1
 
-# Update pip
-RUN pip install --upgrade pip
+# Overwrite /usr/local/bin/pip with a wrapper script that forwards all of its arguments to `uv pip`
+RUN printf '#!/bin/bash\nexec uv pip "$@"\n' > /usr/local/bin/pip && chmod +x /usr/local/bin/pip
 
-# Install latest release PyTorch (PyTorch must be installed before any DeepSpeed c++/cuda ops.)
-RUN pip install --no-cache-dir -U torch==${PYTORCH} torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/${CUDA}
+# Install latest release PyTorch (PyTorch must be installed before any DeepSpeed C++/CUDA ops.)
+RUN pip install --no-cache-dir --upgrade --index-url https://download.pytorch.org/whl/${CUDA} "torch==${PYTORCH}" torchvision torchaudio
 
-# Upgrade FlashAttnV2
+# Install and upgrade Flash Attention 2
 RUN pip install --no-cache-dir packaging ninja
-RUN MAX_JOBS=${MAX_JOBS} pip install flash-attn==${FLASH_ATTN} --no-build-isolation
+RUN MAX_JOBS=${MAX_JOBS} pip install --no-cache-dir --no-build-isolation "flash-attn==${FLASH_ATTN}"
 
 # Install Hugging Face Libraries
 RUN pip install --upgrade --no-cache-dir \
-    transformers[sklearn,sentencepiece,vision]==${TRANSFORMERS} \
-    huggingface_hub[hf_transfer]==${HUGGINGFACE_HUB} \
-    diffusers==${DIFFUSERS} \
-    datasets==${DATASETS} \
-    accelerate==${ACCELERATE} \
-    evaluate==${EVALUATE} \
-    peft==${PEFT} \
-    trl==${TRL} \
-    sentence-transformers==${SENTENCE_TRANSFORMERS} \
-    deepspeed==${DEEPSPEED} \
-    bitsandbytes==${BITSANDBYTES} \
-    tensorboard \
-    jupyter notebook
+        "transformers[sklearn,sentencepiece,vision]==${TRANSFORMERS}" \
+        "huggingface_hub[hf_transfer]==${HUGGINGFACE_HUB}" \
+        "diffusers==${DIFFUSERS}" \
+        "datasets==${DATASETS}" \
+        "accelerate==${ACCELERATE}" \
+        "evaluate==${EVALUATE}" \
+        "peft==${PEFT}" \
+        "trl==${TRL}" \
+        "sentence-transformers==${SENTENCE_TRANSFORMERS}" \
+        "deepspeed==${DEEPSPEED}" \
+        "bitsandbytes==${BITSANDBYTES}" \
+        tensorboard \
+        jupyter notebook
 
 ENV HF_HUB_ENABLE_HF_TRANSFER="1"
 
@@ -95,6 +108,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.
     | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
     curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
     | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
+    touch /var/lib/dpkg/status && \
     apt-get update -y && \
     apt-get install google-cloud-sdk -y && \
     apt-get clean autoremove --yes && \