diff --git a/.github/workflows/build-ngc-xformers.yaml b/.github/workflows/build-ngc-xformers.yaml index c45c0cd..0a1e1a5 100644 --- a/.github/workflows/build-ngc-xformers.yaml +++ b/.github/workflows/build-ngc-xformers.yaml @@ -46,13 +46,14 @@ concurrency: cancel-in-progress: true env: - IMAGE_REGISTRY: ghcr.io IMAGE_NAME: xformers - IMAGE_NAMESPACE: neggles/tensorpods + IMAGE_REGISTRY: ghcr.io + IMAGE_NAMESPACE: ${{ github.repository }} IMAGE_PLATFORMS: "linux/amd64" + BAKEFILE_NAME: docker-bake.xformers.hcl - # sorry pascal users but your cards are no good here - TORCH_CUDA_ARCH_LIST: "7.0;7.5;8.0;8.6;8.9;9.0" + # sorry pascal/volta users but GH only lets me compile for 6 hours + TORCH_CUDA_ARCH_LIST: "7.5;8.0;8.6;8.9;9.0" # for ninja so the runner doesn't explode MAX_JOBS: 1 # NVCC my behated @@ -71,17 +72,21 @@ jobs: max-parallel: 1 matrix: include: - - target: "xformers" + - target: "xformers-v0021-ngc2308" ngc-ver: "23.08" - xformers-ver: "tensorpods-v0.0.21" + xformers-ver: "v0.0.21" - - target: "xformers" + - target: "xformers-v0021-ngc2307" + ngc-ver: "23.07" + xformers-ver: "v0.0.21" + + - target: "xformers-dev-ngc2308" ngc-ver: "23.08" - xformers-ver: "tensorpods" + xformers-ver: "dev" - - target: "xformers" + - target: "xformers-dev-ngc2307" ngc-ver: "23.07" - xformers-ver: "tensorpods" + xformers-ver: "dev" steps: - name: Checkout @@ -103,18 +108,19 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Print bakefile before metadata-action + run: docker buildx bake -f ./${{ env.BAKEFILE_NAME }} --print + - name: Generate docker tags id: meta uses: docker/metadata-action@v4 with: flavor: | - suffix=-ngc${{ matrix.ngc-ver }} + suffix=-${{ matrix.xformers-ver }}-ngc${{ matrix.ngc-ver }} images: | - ${{ env.IMAGE_REGISTRY }}/${{ github.repository }}/${{ matrix.target }} + ${{ env.IMAGE_REGISTRY }}/${{ env.IMAGE_NAMESPACE }}/${{ env.IMAGE_NAME }} tags: | - type=raw,value=latest,enable={{is_default_branch}} - type=raw,value=${{ matrix.xformers-ver }},enable={{is_default_branch}} - type=schedule,pattern={{date 'YYYYMMDD-hhmm' tz='UTC'}},enable={{is_default_branch}} + type=raw,value=${{ matrix.xformers-ver }},enable={{is_default_branch}},suffix=-ngc${{ matrix.ngc-ver }} type=sha,format=short type=ref,event=tag type=ref,event=pr @@ -135,13 +141,10 @@ jobs: - name: Build & Push Image id: build-push uses: docker/bake-action@v3 - env: - NGC_VERSION: ${{ matrix.ngc-ver }} - XFORMERS_REF: ${{ matrix.xformers-ver }} with: targets: ${{ matrix.target }} files: | - ./docker-bake.xformers.hcl + ./${{ env.BAKEFILE_NAME }} ${{ steps.meta.outputs.bake-file }} push: ${{ contains(fromJSON('["push", "schedule"]'), github.event_name) || inputs.force-push }} set: | diff --git a/docker-bake.xformers.hcl b/docker-bake.xformers.hcl index cb31f2c..b844fd1 100644 --- a/docker-bake.xformers.hcl +++ b/docker-bake.xformers.hcl @@ -1,71 +1,140 @@ # docker-bake.hcl for tensorpod builds group "default" { - targets = ["base"] + targets = ["xformers"] } variable "IMAGE_REGISTRY" { default = "ghcr.io" } -variable "IMAGE_NAMESPACE" { +variable IMAGE_NAMESPACE { default = "neggles/tensorpods" } -variable "IMAGE_NAME" { - default = "xformers" +variable TORCH_CUDA_ARCH_LIST { + # sorry pascal users but your cards are no good here + # n.b. in GH builds volta is not available due to compile timeouts + default = "7.0;7.5;8.0;8.6;8.9;9.0" } -variable "NGC_VERSION" { - default = "23.08" +variable MAX_JOBS { + default = "8" } -variable "XFORMERS_REPO" { - default = "https://github.com/neggles/xformers.git" +variable "NVCC_THREADS" { + default = "1" } -variable "XFORMERS_REF" { - default = "tensorpods" +# removes characters not valid in a target name, useful for other things too +function stripName { + params = [name] + result = regex_replace(name, "[^a-zA-Z0-9_-]+", "") } -variable "TORCH_CUDA_ARCH_LIST" { - # sorry pascal users but your cards are no good here - default = "7.0;7.5;8.0;8.6;8.9;9.0" +# convert a CUDA version number and container dev type etc. into an image URI +function cudaImage { + params = [cudaVer, cudaType] + variadic_params = extraVals + result = join(":", [ + "nvidia/cuda", + join("-", [cudaVer], extraVals, [cudaType, "ubuntu22.04"]) + ]) } -variable "MAX_JOBS" { - default = "1" +# convert a CUDA version number into a shortname (e.g. 11.2.1 -> cu112) +function cudaName { + params = [version] + result = regex_replace(version, "^(\\d+)\\.(\\d).*", "cu$1$2") } -function "imagetag" { - params = [imagename, tag] - result = "${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/${imagename}:${tag}-ngc${NGC_VERSION}" +# convert a CUDA version number into a release number (e.g. 11.2.1 -> 11-2) +function cudaRelease { + params = [version] + result = regex_replace(version, "^(\\d+)\\.(\\d).*", "$1-$2") } +# torch version to torch name +function torchName { + params = [version] + result = regex_replace(version, "^(\\d+)\\.(\\d+)\\.(\\d+).*", "torch$1$2$3") +} + +# build a tag for an image from this repo +function repoImage { + params = [imageName] + variadic_params = extraVals + result = join(":", [ + join("/", [IMAGE_REGISTRY, IMAGE_NAMESPACE, imageName]), + join("-", extraVals) + ]) +} + +# set to "true" by github actions, used to disable auto-tag +variable "CI" { default = "" } + # docker-metadata-action will populate this in GitHub Actions target "docker-metadata-action" {} # Shared amongst all containers target "common" { - context = "." - contexts = { - ngc = "docker-image://nvcr.io/nvidia/pytorch:${NGC_VERSION}-py3" - } + context = "." + dockerfile = "Dockerfile" args = { - XFORMERS_IMAGE = "xformers" - BASE_IMAGE = "ngc" + TORCH_CUDA_ARCH_LIST = TORCH_CUDA_ARCH_LIST + MAX_JOBS = MAX_JOBS + NVCC_THREADS = NVCC_THREADS } platforms = ["linux/amd64"] + output = [ + "type=docker", + ] } target "xformers" { - inherits = ["common", "docker-metadata-action"] - context = "docker/xformers" - dockerfile = "Dockerfile" - target = "xformers" + name = stripName("xformers-${xformers.version}-${base.type}${base.version}") + inherits = ["common", "docker-metadata-action"] + context = "docker/xformers" + target = "xformers" + contexts = { + base-ngc = "docker-image://nvcr.io/nvidia/pytorch:${base.version}-py3" + base-torch = "docker-image://${repoImage("base", cudaName(base.cuda), base.type, base.version)}" + } + matrix = { + base = [ + { + type = "ngc" + version = "23.08" + cuda = "12.2.1" + }, + { + type = "ngc" + version = "23.07" + cuda = "12.1.1" + }, + ], + xformers = [ + { + version = "v0.0.21", + repo = "https://github.com/neggles/xformers.git" + ref = "tensorpods-v0.0.21" + buildtype = "release" + }, + { + version = "dev" + repo = "https://github.com/neggles/xformers.git" + ref = "tensorpods" + buildtype = "release" + } + ] + } + tags = [ + notequal("true", CI) ? repoImage("xformers", xformers.version, "${base.type}${base.version}") : "" + ] args = { - XFORMERS_REPO = XFORMERS_REPO - XFORMERS_REF = XFORMERS_REF - TORCH_CUDA_ARCH_LIST = TORCH_CUDA_ARCH_LIST - MAX_JOBS = MAX_JOBS + BASE_IMAGE = "base-${base.type}" + + XFORMERS_REPO = xformers.repo + XFORMERS_REF = xformers.ref + XFORMERS_BUILD_TYPE = xformers.buildtype } }