From 8cc38d831eb94a75d6cf263fc31670560e303663 Mon Sep 17 00:00:00 2001
From: Andrew Powers-Holmes
Date: Wed, 27 Sep 2023 22:09:05 +1000
Subject: [PATCH] rework this a bit

---
 .github/workflows/build-push-gradient.yaml |  7 +-
 docker-bake.hcl                            | 98 ++++++++++++++++------
 docker/Dockerfile.base                     | 57 +++++++++----
 docker/Dockerfile.gradient                 |  8 +-
 4 files changed, 124 insertions(+), 46 deletions(-)

diff --git a/.github/workflows/build-push-gradient.yaml b/.github/workflows/build-push-gradient.yaml
index 55a9bb0..10b2dd5 100644
--- a/.github/workflows/build-push-gradient.yaml
+++ b/.github/workflows/build-push-gradient.yaml
@@ -56,7 +56,10 @@ jobs:
         include:
           - name: "notebook"
             target: "gradient"
-            torch-ver: "2.0.1"
+            torch-ver: "torch201"
+          - name: "notebook"
+            target: "gradient"
+            torch-ver: "torch210"
 
     steps:
       - name: Checkout
@@ -119,7 +122,7 @@ jobs:
         id: build-push
         uses: docker/bake-action@v3
         with:
-          targets: ${{ matrix.target }}
+          targets: ${{ matrix.target }}-${{ matrix.torch-ver }}
           files: |
             ./docker-bake.hcl
            ${{ steps.meta.outputs.bake-file }}
diff --git a/docker-bake.hcl b/docker-bake.hcl
index 365148c..4f033de 100644
--- a/docker-bake.hcl
+++ b/docker-bake.hcl
@@ -1,49 +1,72 @@
 # docker-bake.hcl for stable-diffusion-webui
 group "default" {
-  targets = ["gradient"]
+  targets = ["gradient-torch201"]
 }
 
-variable "IMAGE_REGISTRY" {
+group torchrc {
+  targets = ["gradient-torch210"]
+}
+
+variable IMAGE_REGISTRY {
   default = "ghcr.io"
 }
 
-variable "IMAGE_NAME" {
+variable IMAGE_NAMESPACE {
   default = "neggles/psychic-paper"
 }
 
-variable "BASE_IMAGE" {
-  default = "nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04"
+variable CUDA_VERSION {
+  default = "12.1.1"
 }
 
-variable "CUDA_VERSION" {
-  default = "12.1"
+variable TORCH_CUDA_ARCH_LIST {
+  default = "7.0;7.5;8.0;8.6;8.9;9.0"
 }
 
-variable "TORCH_VERSION" {
-  default = "torch"
+# removes characters not valid in a target name, useful for other things too
+function stripName {
+  params = [name]
+  result = regex_replace(name, "[^a-zA-Z0-9_-]+", "")
 }
 
-variable "TORCH_INDEX" {
-  default = "https://download.pytorch.org/whl/cu118"
+# convert a CUDA version number and container dev type etc. into an image URI
+function cudaImage {
+  params          = [cudaVer, cudaType]
+  variadic_params = extraVals
+  result = join(":", [
+    "nvidia/cuda",
+    join("-", [cudaVer], extraVals, [cudaType, "ubuntu22.04"])
+  ])
 }
 
-variable "TORCH_CUDA_ARCH_LIST" {
-  default = "7.0;7.5;8.0;8.6;8.9;9.0"
+# convert a CUDA version number into a release number (e.g. 11.2.1 -> 11-2)
+function cudaRelease {
+  params = [version]
+  result = regex_replace(version, "^(\\d+)\\.(\\d).*", "$1-$2")
 }
 
-variable "XFORMERS_VERSION" {
-  default = "xformers==0.0.21"
+# build a tag for an image from this repo
+function repoImage {
+  params          = [imageName]
+  variadic_params = extraVals
+  result = join(":", [
+    join("/", [IMAGE_REGISTRY, IMAGE_NAMESPACE, imageName]),
+    join("-", extraVals)
+  ])
 }
 
+# set to "true" by github actions, used to disable auto-tag
+variable CI { default = "" }
+
 # docker-metadata-action will populate this in GitHub Actions
-target "docker-metadata-action" {}
+target docker-metadata-action {}
 
 # Shared amongst all containers
-target "common" {
+target common {
   context = "./docker"
   args = {
     CUDA_VERSION         = CUDA_VERSION
-    CUDA_RELEASE         = "${regex_replace(CUDA_VERSION, "\\.", "-")}"
+    CUDA_RELEASE         = cudaRelease(CUDA_VERSION)
     TORCH_CUDA_ARCH_LIST = TORCH_CUDA_ARCH_LIST
   }
 
@@ -51,26 +74,51 @@ target "common" {
 }
 
 # Base image with cuda, python, torch, and other dependencies
-target "base" {
+target base-torch201 {
   inherits   = ["common", "docker-metadata-action"]
   dockerfile = "Dockerfile.base"
-  target     = "base"
+  target     = "base-xformers-bin"
   args = {
-    TORCH_INDEX      = TORCH_INDEX
-    TORCH_VERSION    = TORCH_VERSION
+    TORCH_INDEX      = "https://download.pytorch.org/whl/cu118"
+    TORCH_PACKAGE    = "torch"
     EXTRA_PIP_ARGS   = ""
 
-    XFORMERS_VERSION = XFORMERS_VERSION
+    XFORMERS_PACKAGE = "xformers==0.0.21"
+  }
+}
+
+target base-torch210 {
+  inherits   = ["common", "docker-metadata-action"]
+  dockerfile = "Dockerfile.base"
+  target     = "base-xformers-ghcr"
+  args = {
+    TORCH_INDEX      = "https://download.pytorch.org/whl/test/cu121"
+    TORCH_PACKAGE    = "torch"
+    EXTRA_PIP_ARGS   = ""
+
+    XFORMERS_PACKAGE = "ghcr.io/neggles/tensorpods/xformers:v0.0.21-cu121-torch210"
   }
 }
 
 # Paperspace Gradient image
-target "gradient" {
+target gradient-torch201 {
+  inherits   = ["common", "docker-metadata-action"]
+  dockerfile = "Dockerfile.gradient"
+  target     = "gradient"
+  contexts = {
+    base = "target:base-torch201"
+  }
+  args = {
+    NODE_MAJOR = 18
+  }
+}
+
+target gradient-torch210 {
   inherits   = ["common", "docker-metadata-action"]
   dockerfile = "Dockerfile.gradient"
   target     = "gradient"
   contexts = {
-    base = "target:base"
+    base = "target:base-torch210"
   }
   args = {
     NODE_MAJOR = 18
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
index 635694e..ab1f2ae 100644
--- a/docker/Dockerfile.base
+++ b/docker/Dockerfile.base
@@ -8,7 +8,7 @@ ARG BASE_IMAGE=nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
 ARG DEBIAN_FRONTEND=noninteractive
 ARG DEBIAN_PRIORITY=critical
 ARG PIP_PREFER_BINARY=1
-ARG TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0"
+ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0+PTX"
 
 # Build the base image.
 FROM ${BASE_IMAGE} as base
@@ -101,12 +101,15 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     && apt-get clean
 
 # Install TensorRT libraries
+ARG INCLUDE_TRT=true
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
     apt-get update \
-    && apt-get -y install --no-install-recommends \
-        libnvinfer-dev \
-        python3-libnvinfer-dev \
+    && if [ "${INCLUDE_TRT}" = "true" ]; then \
+        apt-get -y install --no-install-recommends \
+            libnvinfer-dev \
+            python3-libnvinfer-dev \
+        ; fi \
     && apt-get clean
 
 # Install other CUDA libraries
@@ -133,22 +136,44 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
 
 # Install PyTorch
 ARG TORCH_INDEX
-ARG TORCH_VERSION
-ARG EXTRA_PIP_ARGS
+ARG TORCH_PACKAGE="torch"
+ARG TRITON_PACKAGE=" "
+ARG EXTRA_PIP_ARGS=" "
 RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
-    python -m pip install ${EXTRA_PIP_ARGS} \
-    --extra-index-url ${TORCH_INDEX} \
-    "${TORCH_VERSION:-torch}" \
+    python -m pip install ${EXTRA_PIP_ARGS:-} \
+    ${TORCH_PACKAGE} \
+    ${TRITON_PACKAGE} \
     torchaudio \
-    torchvision
+    torchvision \
+    --index-url "${TORCH_INDEX}"
 
-# Install xformers
-ARG XFORMERS_VERSION
+# save and enforce a constraint file to lock the torch version
 RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
-    python -m pip install "${XFORMERS_VERSION}"
+    python -m pip freeze | grep -E '(^torch|triton)' > /torch-constraints.txt
+ENV PIP_CONSTRAINT=/torch-constraints.txt
+
+# set work dir
+WORKDIR /workspace
 
-# we do a little entrypoint setup
+
 # CMD ["/bin/bash", "-l"]
 
-# Specific required versions for everything else will be installed in their respective images
-# since this stuff tends to be pretty picky about versioning.
+
+# can use this target if there's a prebuilt wheel available for this torch version
+FROM base as base-xformers-bin
+
+# Install xformers
+ARG XFORMERS_PACKAGE="xformers"
+ARG EXTRA_PIP_ARGS=""
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    python -m pip install ${EXTRA_PIP_ARGS:-} "${XFORMERS_PACKAGE}"
+
+
+# or this one if we're doing mf war crimes
+FROM base AS base-xformers-ghcr
+ARG XFORMERS_PACKAGE='ghcr.io/neggles/tensorpods/xformers:v0.0.21-cu121-torch210'
+ARG EXTRA_PIP_ARGS=""
+
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    --mount=type=bind,from=${XFORMERS_PACKAGE},source=/xformers,dst=/xformers \
+    python -m pip install ${EXTRA_PIP_ARGS:-} /xformers/xformers*.whl
diff --git a/docker/Dockerfile.gradient b/docker/Dockerfile.gradient
index 38989ca..5b50d8e 100644
--- a/docker/Dockerfile.gradient
+++ b/docker/Dockerfile.gradient
@@ -3,7 +3,7 @@
 ARG DEBIAN_FRONTEND=noninteractive
 ARG DEBIAN_PRIORITY=critical
 ARG PIP_PREFER_BINARY=1
-ARG TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0"
+ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0+PTX"
 
 FROM base AS gradient
 
@@ -59,7 +59,9 @@ ENV TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1
 ENV CUDA_MODULE_LOADING=LAZY
 ENV TCMALLOC_AGGRESSIVE_DECOMMIT=t
 
-# we're not changing the entrypoint since nVidia's default one works fine
+# paperspace default working dir
+WORKDIR /notebooks
 
-# default command
+# default command; nvidia's entrypoint.sh is fine so we don't touch ENTRYPOINT
+# n.b. when launching the image on gradient this will be overridden with a jupyter start command
 CMD [ "/usr/bin/env", "bash", "-l" ]
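
Note on trying the new targets locally: with the split into gradient-torch201 / gradient-torch210 (plus the torchrc group), the images can be built straight from docker-bake.hcl without going through the workflow. A minimal sketch, assuming it is run from the repository root next to docker-bake.hcl; the target and group names are the ones defined in this patch, everything else is stock docker buildx:

    # inspect the resolved bake definition (shows how cudaRelease(CUDA_VERSION) etc. expand)
    docker buildx bake --print

    # default group -> gradient-torch201 (torch 2.0.1 on the cu118 index)
    docker buildx bake gradient-torch201

    # torchrc group -> gradient-torch210 (torch 2.1.0 from the cu121 test index)
    docker buildx bake torchrc

In CI the same targets are built by the bake-action step above, which layers the docker-metadata-action bake file on top via the files: input.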