From 8cc38d831eb94a75d6cf263fc31670560e303663 Mon Sep 17 00:00:00 2001
From: Andrew Powers-Holmes
Date: Wed, 27 Sep 2023 22:09:05 +1000
Subject: [PATCH] rework this a bit

---
 .github/workflows/build-push-gradient.yaml |  7 +-
 docker-bake.hcl                            | 98 ++++++++++++++++------
 docker/Dockerfile.base                     | 57 +++++++++----
 docker/Dockerfile.gradient                 |  8 +-
 4 files changed, 124 insertions(+), 46 deletions(-)

diff --git a/.github/workflows/build-push-gradient.yaml b/.github/workflows/build-push-gradient.yaml
index 55a9bb0..10b2dd5 100644
--- a/.github/workflows/build-push-gradient.yaml
+++ b/.github/workflows/build-push-gradient.yaml
@@ -56,7 +56,10 @@ jobs:
         include:
           - name: "notebook"
             target: "gradient"
-            torch-ver: "2.0.1"
+            torch-ver: "torch201"
+          - name: "notebook"
+            target: "gradient"
+            torch-ver: "torch210"
 
     steps:
       - name: Checkout
@@ -119,7 +122,7 @@ jobs:
         id: build-push
         uses: docker/bake-action@v3
         with:
-          targets: ${{ matrix.target }}
+          targets: ${{ matrix.target }}-${{ matrix.torch-ver }}
           files: |
             ./docker-bake.hcl
            ${{ steps.meta.outputs.bake-file }}
diff --git a/docker-bake.hcl b/docker-bake.hcl
index 365148c..4f033de 100644
--- a/docker-bake.hcl
+++ b/docker-bake.hcl
@@ -1,49 +1,72 @@
 # docker-bake.hcl for stable-diffusion-webui
 group "default" {
-  targets = ["gradient"]
+  targets = ["gradient-torch201"]
 }
 
-variable "IMAGE_REGISTRY" {
+group torchrc {
+  targets = ["gradient-torch210"]
+}
+
+variable IMAGE_REGISTRY {
   default = "ghcr.io"
 }
 
-variable "IMAGE_NAME" {
+variable IMAGE_NAMESPACE {
   default = "neggles/psychic-paper"
 }
 
-variable "BASE_IMAGE" {
-  default = "nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04"
+variable CUDA_VERSION {
+  default = "12.1.1"
 }
 
-variable "CUDA_VERSION" {
-  default = "12.1"
+variable TORCH_CUDA_ARCH_LIST {
+  default = "7.0;7.5;8.0;8.6;8.9;9.0"
 }
 
-variable "TORCH_VERSION" {
-  default = "torch"
+# removes characters not valid in a target name, useful for other things too
+function stripName {
+  params = [name]
+  result = regex_replace(name, "[^a-zA-Z0-9_-]+", "")
 }
 
-variable "TORCH_INDEX" {
-  default = "https://download.pytorch.org/whl/cu118"
+# convert a CUDA version number and container dev type etc. into an image URI
+function cudaImage {
+  params          = [cudaVer, cudaType]
+  variadic_params = extraVals
+  result = join(":", [
+    "nvidia/cuda",
+    join("-", [cudaVer], extraVals, [cudaType, "ubuntu22.04"])
+  ])
 }
 
-variable "TORCH_CUDA_ARCH_LIST" {
-  default = "7.0;7.5;8.0;8.6;8.9;9.0"
+# convert a CUDA version number into a release number (e.g. 11.2.1 -> 11-2)
+function cudaRelease {
+  params = [version]
+  result = regex_replace(version, "^(\\d+)\\.(\\d).*", "$1-$2")
 }
 
-variable "XFORMERS_VERSION" {
-  default = "xformers==0.0.21"
+# build a tag for an image from this repo
+function repoImage {
+  params          = [imageName]
+  variadic_params = extraVals
+  result = join(":", [
+    join("/", [IMAGE_REGISTRY, IMAGE_NAMESPACE, imageName]),
+    join("-", extraVals)
+  ])
 }
 
+# set to "true" by github actions, used to disable auto-tag
+variable CI { default = "" }
+
 # docker-metadata-action will populate this in GitHub Actions
-target "docker-metadata-action" {}
+target docker-metadata-action {}
 
 # Shared amongst all containers
-target "common" {
+target common {
   context = "./docker"
   args = {
     CUDA_VERSION         = CUDA_VERSION
-    CUDA_RELEASE         = "${regex_replace(CUDA_VERSION, "\\.", "-")}"
+    CUDA_RELEASE         = cudaRelease(CUDA_VERSION)
     TORCH_CUDA_ARCH_LIST = TORCH_CUDA_ARCH_LIST
   }
 
@@ -51,26 +74,51 @@ target "common" {
 }
 
 # Base image with cuda, python, torch, and other dependencies
-target "base" {
+target base-torch201 {
   inherits   = ["common", "docker-metadata-action"]
   dockerfile = "Dockerfile.base"
-  target     = "base"
+  target     = "base-xformers-bin"
   args = {
-    TORCH_INDEX      = TORCH_INDEX
-    TORCH_VERSION    = TORCH_VERSION
+    TORCH_INDEX      = "https://download.pytorch.org/whl/cu118"
+    TORCH_PACKAGE    = "torch"
     EXTRA_PIP_ARGS   = ""
 
-    XFORMERS_VERSION = XFORMERS_VERSION
+    XFORMERS_PACKAGE = "xformers==0.0.21"
+  }
+}
+
+target base-torch210 {
+  inherits   = ["common", "docker-metadata-action"]
+  dockerfile = "Dockerfile.base"
+  target     = "base-xformers-ghcr"
+  args = {
+    TORCH_INDEX      = "https://download.pytorch.org/whl/test/cu121"
+    TORCH_PACKAGE    = "torch"
+    EXTRA_PIP_ARGS   = ""
+
+    XFORMERS_PACKAGE = "ghcr.io/neggles/tensorpods/xformers:v0.0.21-cu121-torch210"
   }
 }
 
 # Paperspace Gradient image
-target "gradient" {
+target gradient-torch201 {
+  inherits   = ["common", "docker-metadata-action"]
+  dockerfile = "Dockerfile.gradient"
+  target     = "gradient"
+  contexts = {
+    base = "target:base-torch201"
+  }
+  args = {
+    NODE_MAJOR = 18
+  }
+}
+
+target gradient-torch210 {
   inherits   = ["common", "docker-metadata-action"]
   dockerfile = "Dockerfile.gradient"
   target     = "gradient"
   contexts = {
-    base = "target:base"
+    base = "target:base-torch210"
   }
   args = {
     NODE_MAJOR = 18
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
index 635694e..ab1f2ae 100644
--- a/docker/Dockerfile.base
+++ b/docker/Dockerfile.base
@@ -8,7 +8,7 @@ ARG BASE_IMAGE=nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
 ARG DEBIAN_FRONTEND=noninteractive
 ARG DEBIAN_PRIORITY=critical
 ARG PIP_PREFER_BINARY=1
-ARG TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0"
+ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0+PTX"
 
 # Build the base image.
 FROM ${BASE_IMAGE} as base
@@ -101,12 +101,15 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     && apt-get clean
 
 # Install TensorRT libraries
+ARG INCLUDE_TRT=true
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
     apt-get update \
-    && apt-get -y install --no-install-recommends \
-        libnvinfer-dev \
-        python3-libnvinfer-dev \
+    && if [ "${INCLUDE_TRT}" = "true" ]; then \
+        apt-get -y install --no-install-recommends \
+            libnvinfer-dev \
+            python3-libnvinfer-dev \
+        ; fi \
     && apt-get clean
 
 # Install other CUDA libraries
@@ -133,22 +136,44 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
 
 # Install PyTorch
 ARG TORCH_INDEX
-ARG TORCH_VERSION
-ARG EXTRA_PIP_ARGS
+ARG TORCH_PACKAGE="torch"
+ARG TRITON_PACKAGE=" "
+ARG EXTRA_PIP_ARGS=" "
 RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
-    python -m pip install ${EXTRA_PIP_ARGS} \
-    --extra-index-url ${TORCH_INDEX} \
-    "${TORCH_VERSION:-torch}" \
+    python -m pip install ${EXTRA_PIP_ARGS:-} \
+    ${TORCH_PACKAGE} \
+    ${TRITON_PACKAGE} \
     torchaudio \
-    torchvision
+    torchvision \
+    --index-url "${TORCH_INDEX}"
 
-# Install xformers
-ARG XFORMERS_VERSION
+# save and enforce a constraint file to lock the torch version
 RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
-    python -m pip install "${XFORMERS_VERSION}"
+    python -m pip freeze | grep -E '(^torch|triton)' > /torch-constraints.txt
+ENV PIP_CONSTRAINT=/torch-constraints.txt
+
+# set work dir
+WORKDIR /workspace
 
-# we do a little entrypoint setup
+
 # CMD ["/bin/bash", "-l"]
 
-# Specific required versions for everything else will be installed in their respective images
-# since this stuff tends to be pretty picky about versioning.
+
+# can use this target if there's a prebuilt wheel available for this torch version
+FROM base as base-xformers-bin
+
+# Install xformers
+ARG XFORMERS_PACKAGE="xformers"
+ARG EXTRA_PIP_ARGS=""
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    python -m pip install ${EXTRA_PIP_ARGS:-} "${XFORMERS_PACKAGE}"
+
+
+# or this one if we're doing mf war crimes
+FROM base AS base-xformers-ghcr
+ARG XFORMERS_PACKAGE='ghcr.io/neggles/tensorpods/xformers:v0.0.21-cu121-torch210'
+ARG EXTRA_PIP_ARGS=""
+
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    --mount=type=bind,from=${XFORMERS_PACKAGE},source=/xformers,dst=/xformers \
+    python -m pip install ${EXTRA_PIP_ARGS:-} /xformers/xformers*.whl
diff --git a/docker/Dockerfile.gradient b/docker/Dockerfile.gradient
index 38989ca..5b50d8e 100644
--- a/docker/Dockerfile.gradient
+++ b/docker/Dockerfile.gradient
@@ -3,7 +3,7 @@
 ARG DEBIAN_FRONTEND=noninteractive
 ARG DEBIAN_PRIORITY=critical
 ARG PIP_PREFER_BINARY=1
-ARG TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0"
+ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0+PTX"
 
 FROM base AS gradient
 
@@ -59,7 +59,9 @@ ENV TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1
 ENV CUDA_MODULE_LOADING=LAZY
 ENV TCMALLOC_AGGRESSIVE_DECOMMIT=t
 
-# we're not changing the entrypoint since nVidia's default one works fine
+# paperspace default working dir
+WORKDIR /notebooks
 
-# default command
+# default command; nvidia's entrypoint.sh is fine so we don't touch ENTRYPOINT
+# n.b. when launching the image on gradient this will be overridden with a jupyter start command
 CMD [ "/usr/bin/env", "bash", "-l" ]
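
Note on trying the new targets locally: with the split into gradient-torch201 / gradient-torch210 (plus the torchrc group), the images can be built straight from docker-bake.hcl without going through the workflow. A minimal sketch, assuming it is run from the repository root next to docker-bake.hcl; the target and group names are the ones defined in this patch, everything else is stock docker buildx:

    # inspect the resolved bake definition (shows how cudaRelease(CUDA_VERSION) etc. expand)
    docker buildx bake --print

    # default group -> gradient-torch201 (torch 2.0.1 on the cu118 index)
    docker buildx bake gradient-torch201

    # torchrc group -> gradient-torch210 (torch 2.1.0 from the cu121 test index)
    docker buildx bake torchrc

In CI the same targets are built by the bake-action step above, which layers the docker-metadata-action bake file on top via the files: input.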