diff --git a/.github/workflows/build-push-base.yaml b/.github/workflows/build-push-base.yaml
index 1d7c194..3efd85b 100644
--- a/.github/workflows/build-push-base.yaml
+++ b/.github/workflows/build-push-base.yaml
@@ -38,7 +38,6 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  BAKEFILE_NAME: docker-bake.xformers.hcl
   IMAGE_REGISTRY: ghcr.io
   IMAGE_PLATFORMS: "linux/amd64"
 
@@ -69,13 +68,10 @@ jobs:
         torch:
           - torch210
           - torch201
-        include:
-          # - target: base
-          #   cuda: cu121
-          #   torch: torchnightly
-          - target: coreweave
-            cuda: cu118
-            torch: torch210
+        # include:
+        #   - target: base
+        #     cuda: cu121
+        #     torch: torchnightly
 
     steps:
       - name: Checkout
diff --git a/.github/workflows/build-push-coreweave.yaml b/.github/workflows/build-push-coreweave.yaml
new file mode 100644
index 0000000..d357dfa
--- /dev/null
+++ b/.github/workflows/build-push-coreweave.yaml
@@ -0,0 +1,134 @@
+# Bakes the coreweave image per cuda/torch matrix entry; pushes to GHCR on push to main or when force-push is set.
+name: Coreweave Image
+run-name: Coreweave (via ${{ github.event_name }})
+
+on:
+  push:
+    branches:
+      - "main"
+      - "release"
+    paths:
+      - ".github/workflows/build-push-coreweave.yaml"
+      - "docker/base/*"
+      - "docker-bake.hcl"
+      - "docker-compose.base.yml"
+      - "!**.md"
+
+  workflow_dispatch:
+    inputs:
+      force-push:
+        description: "push to GHCR"
+        type: boolean
+        required: true
+        default: false
+
+  pull_request:
+    paths:
+      - ".github/workflows/build-push-coreweave.yaml"
+      - "docker/base/*"
+      - "docker-bake.hcl"
+      - "docker-compose.base.yml"
+      - "!**.md"
+
+defaults:
+  run:
+    shell: bash
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-coreweave
+  cancel-in-progress: true
+
+env:
+  IMAGE_REGISTRY: ghcr.io
+  IMAGE_PLATFORMS: "linux/amd64"
+
+  # sorry pascal/volta users but GH only lets me compile for 6 hours
+  TORCH_CUDA_ARCH_LIST: "7.5;8.0;8.6;8.9;9.0"
+  # for ninja so the runner doesn't explode
+  MAX_JOBS: 1
+  # NVCC my behated
+  NVCC_THREADS: 1
+
+jobs:
+  build:
+    name: Build
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+      contents: read
+
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        target:
+          - coreweave
+        cuda:
+          - cu118
+          - cu120
+        torch:
+          - torch210
+
+    steps:
+      - name: Checkout
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          platforms: ${{ env.IMAGE_PLATFORMS }}
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ env.IMAGE_REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Generate docker tags
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          flavor: |
+            suffix=-${{ matrix.cuda }}-${{ matrix.torch }}
+          images: |
+            ${{ env.IMAGE_REGISTRY }}/${{ github.repository }}/${{ matrix.target }}
+          tags: |
+            type=raw,value=${{ matrix.cuda }}-${{ matrix.torch }},enable={{is_default_branch}},suffix=
+            type=schedule,pattern={{date 'YYYYMMDD-hhmm' tz='UTC'}},enable={{is_default_branch}}
+            type=sha,format=short
+            type=ref,event=branch
+            type=ref,event=tag
+            type=ref,event=pr
+          labels: |
+            org.opencontainers.image.title=tensorpods-base
+            org.opencontainers.image.description=base PyTorch nVidia CUDA image
+            org.opencontainers.image.vendor=neggles.dev
+
+      - name: Free disk space
+        id: free-disk-space
+        run: |
+          df -h .
+          sudo find "$AGENT_TOOLSDIRECTORY" -delete
+          sudo find /usr/share/dotnet -delete
+          sudo find /usr/local/lib/android -delete
+          df -h .
+
+      - name: Build & Push Image
+        id: build-push
+        uses: docker/bake-action@v3
+        with:
+          targets: ${{ matrix.target }}-${{ matrix.cuda }}-${{ matrix.torch }}
+          files: |
+            ./docker-bake.hcl
+            ${{ steps.meta.outputs.bake-file }}
+          push: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/main') || inputs.force-push }}
+          # Give each matrix entry its own GHA cache scope; with the default
+          # shared scope every build overwrites the cache of the previous one.
+          set: |
+            *.cache-from=type=gha,scope=${{ matrix.target }}-${{ matrix.cuda }}-${{ matrix.torch }}
+            *.cache-to=type=gha,mode=min,scope=${{ matrix.target }}-${{ matrix.cuda }}-${{ matrix.torch }}