Merge pull request #1034 from StanfordVL/docker-cuda-install-first

Update Docker to uninstall cuda toolkit & start pushing actions runner image on CI too
StanfordVL · Nov 22, 2024 · bae1015 · bae1015
2 parents 075267e + e4d6891
commit bae1015
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 16 deletions.
diff --git a/.github/workflows/build-push-containers.yml b/.github/workflows/build-push-containers.yml
@@ -22,7 +22,7 @@ jobs:
           sudo rm -rf \
             /usr/share/dotnet /usr/local/lib/android /opt/ghc \
             /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup \
-            /usr/lib/jvm || true
+            /usr/lib/jvm /opt/hostedtoolcache/CodeQL || true
           echo "some directories deleted"
           sudo apt install aptitude -y >/dev/null 2>&1
           sudo aptitude purge aria2 ansible azure-cli shellcheck rpm xorriso zsync \
@@ -97,6 +97,18 @@ jobs:
           tags: |
             type=ref,event=branch
             type=semver,pattern={{version}}
+      -
+        name: Metadata for actions Image
+        id: meta-actions
+        uses: docker/metadata-action@v5
+        # The actions image should only be built if the push is to og-develop
+        if: github.ref == 'refs/heads/og-develop'
+        with:
+          images: |
+            stanfordvl/omnigibson-gha
+          tags: |
+            # We only push to the latest tag for the actions image
+            type=raw,value=latest
       -
         name: Build and push prod image
         id: build-prod
@@ -107,8 +119,8 @@ jobs:
           tags: ${{ steps.meta-prod.outputs.tags }}
           labels: ${{ steps.meta-prod.outputs.labels }}
           file: docker/prod.Dockerfile
-          cache-from: type=registry,ref=stanfordvl/omnigibson:og-develop
-          cache-to: type=inline
+          cache-from: type=registry,ref=stanfordvl/omnigibson:build-cache
+          cache-to: type=registry,ref=stanfordvl/omnigibson:build-cache,mode=max
 
       -
         name: Build and push dev image
@@ -121,8 +133,8 @@ jobs:
           tags: ${{ steps.meta-dev.outputs.tags }}
           labels: ${{ steps.meta-dev.outputs.labels }}
           file: docker/prod.Dockerfile
-          cache-from: type=registry,ref=stanfordvl/omnigibson:og-develop  # OK to share cache here.
-          cache-to: type=inline
+          cache-from: type=registry,ref=stanfordvl/omnigibson:build-cache  # OK to share cache here.
+          cache-to: type=registry,ref=stanfordvl/omnigibson:build-cache,mode=max
 
       - name: Update vscode image Dockerfile with prod image tag
         run: |
@@ -137,5 +149,25 @@ jobs:
           tags: ${{ steps.meta-vscode.outputs.tags }}
           labels: ${{ steps.meta-vscode.outputs.labels }}
           file: docker/vscode.Dockerfile
-          cache-from: type=registry,ref=stanfordvl/omnigibson:og-develop  # OK to share cache here.
-          cache-to: type=inline
+          cache-from: type=registry,ref=stanfordvl/omnigibson:build-cache  # OK to share cache here.
+          cache-to: type=registry,ref=stanfordvl/omnigibson:build-cache,mode=max
+
+      - name: Update actions image Dockerfile with dev image tag
+        # The actions image should only be built if the push is to og-develop
+        if: github.ref == 'refs/heads/og-develop'
+        run: |
+          sed -i "s/omnigibson-dev:og-develop/omnigibson-dev@${{ steps.build-dev.outputs.digest }}/g" docker/gh-actions/Dockerfile && cat docker/gh-actions/Dockerfile
+      -
+        name: Build and push actions image
+        id: build-actions
+        uses: docker/build-push-action@v5
+        # The actions image should only be built if the push is to og-develop
+        if: github.ref == 'refs/heads/og-develop'
+        with:
+          context: docker/gh-actions
+          push: true
+          tags: ${{ steps.meta-actions.outputs.tags }}
+          labels: ${{ steps.meta-actions.outputs.labels }}
+          file: docker/gh-actions/Dockerfile
+          cache-from: type=registry,ref=stanfordvl/omnigibson:build-cache  # OK to share cache here.
+          cache-to: type=registry,ref=stanfordvl/omnigibson:build-cache,mode=max
diff --git a/docker/prod.Dockerfile b/docker/prod.Dockerfile
@@ -30,20 +30,23 @@ RUN micromamba run -n omnigibson micromamba install \
   pytorch torchvision pytorch-cuda=11.8 \
   -c pytorch -c nvidia -c conda-forge
 
-# Install cuda for compiling curobo
-RUN wget -O /cuda.run https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run && \
-  sh /cuda.run --silent --toolkit && rm /cuda.run
-ENV PATH=/usr/local/cuda-11.8/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64:$LD_LIBRARY_PATH
-
 # Install curobo. This can normally be installed when OmniGibson is pip
 # installed, but we need to install it beforehand here so that it doesn't
 # have to happen every time a CI action is run (otherwise it's just
-# very slow)
+# very slow).
+# This also allows us to uninstall the cuda toolkit after curobo is built
+# to save space (meaning curobo cannot be rebuilt at runtime).
 # Here we also compile this such that it is compatible with GPU architectures
 # Turing, Ampere, and Ada, which correspond to 20-, 30-, and 40-series GPUs.
-RUN TORCH_CUDA_ARCH_LIST='7.5;8.0;8.6+PTX' \
-  micromamba run -n omnigibson pip install git+https://github.com/StanfordVL/curobo@06d8c79b660db60c2881e9319e60899cbde5c5b5#egg=nvidia_curobo --no-build-isolation
+# We also suppress the stdout of the curobo pip installation to avoid the log limit.
+RUN wget --no-verbose -O /cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \ 
+  dpkg -i /cuda-keyring.deb && rm /cuda-keyring.deb && apt-get update && \
+  DEBIAN_FRONTEND=noninteractive apt-get install -y cuda-toolkit-11-8 && \
+  TORCH_CUDA_ARCH_LIST='7.5;8.0;8.6+PTX' PATH=/usr/local/cuda-11.8/bin:$PATH LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64:$LD_LIBRARY_PATH \
+    micromamba run -n omnigibson pip install \
+    git+https://github.com/StanfordVL/curobo@06d8c79b660db60c2881e9319e60899cbde5c5b5#egg=nvidia_curobo \
+    --no-build-isolation > /dev/null && \
+  apt-get remove -y cuda-toolkit-11-8 && apt-get autoremove -y && apt-get autoclean -y && rm -rf /var/lib/apt/lists/*
 
 # Make sure isaac gets properly sourced every time omnigibson gets called
 ARG CONDA_ACT_FILE="/micromamba/envs/omnigibson/etc/conda/activate.d/env_vars.sh"