Skip to content

Commit

Permalink
rework build, split coreweave into its own workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
neggles committed Dec 31, 2023
1 parent d5782f2 commit 1867cbd
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 8 deletions.
12 changes: 4 additions & 8 deletions .github/workflows/build-push-base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ concurrency:
cancel-in-progress: true

env:
BAKEFILE_NAME: docker-bake.xformers.hcl
IMAGE_REGISTRY: ghcr.io
IMAGE_PLATFORMS: "linux/amd64"

Expand Down Expand Up @@ -69,13 +68,10 @@ jobs:
torch:
- torch210
- torch201
include:
# - target: base
# cuda: cu121
# torch: torchnightly
- target: coreweave
cuda: cu118
torch: torch210
# include:
# - target: base
# cuda: cu121
# torch: torchnightly

steps:
- name: Checkout
Expand Down
131 changes: 131 additions & 0 deletions .github/workflows/build-push-coreweave.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
name: Coreweave Image
run-name: Coreweave (via ${{ github.event_name }})

on:
push:
branches:
- "main"
- "release"
paths:
- ".github/workflows/build-push-coreweave.yaml"
- "docker/base/*"
- "docker-bake.hcl"
- "docker-compose.base.yml"
- "!**.md"

workflow_dispatch:
inputs:
force-push:
description: "push to GHCR"
type: boolean
required: true
default: false

pull_request:
paths:
- ".github/workflows/build-push-coreweave.yaml"
- "docker/base/*"
- "docker-bake.hcl"
- "docker-compose.base.yml"
- "!**.md"

defaults:
run:
shell: bash

concurrency:
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-coreweave
cancel-in-progress: true

env:
IMAGE_REGISTRY: ghcr.io
IMAGE_PLATFORMS: "linux/amd64"

# sorry pascal/volta users but GH only lets me compile for 6 hours
TORCH_CUDA_ARCH_LIST: "7.5;8.0;8.6;8.9;9.0"
# for ninja so the runner doesn't explode
MAX_JOBS: 1
# NVCC my behated
NVCC_THREADS: 1

jobs:
build:
name: Build
runs-on: ubuntu-latest
permissions:
packages: write
contents: read

strategy:
fail-fast: false
max-parallel: 1
matrix:
target:
- coreweave
cuda:
- cu118
- cu120
torch:
- torch210

steps:
- name: Checkout
id: checkout
uses: actions/checkout@v3
with:
submodules: recursive

- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v2
with:
platforms: ${{ env.IMAGE_PLATFORMS }}

- name: Log in to GHCR
uses: docker/login-action@v2
with:
registry: ${{ env.IMAGE_REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Generate docker tags
id: meta
uses: docker/metadata-action@v4
with:
flavor: |
suffix=-${{ matrix.cuda }}-${{ matrix.torch }}
images: |
${{ env.IMAGE_REGISTRY }}/${{ github.repository }}/${{ matrix.target }}
tags: |
type=raw,value=${{ matrix.cuda }}-${{ matrix.torch }},enable={{is_default_branch}},suffix=
type=schedule,pattern={{date 'YYYYMMDD-hhmm' tz='UTC'}},enable={{is_default_branch}}
type=sha,format=short
type=ref,event=branch
type=ref,event=tag
type=ref,event=pr
labels: |
org.opencontainers.image.title=tensorpods-base
org.opencontainers.image.description=base PyTorch nVidia CUDA image
org.opencontainers.image.vendor=neggles.dev
- name: Free disk space
id: free-disk-space
run: |
df -h .
sudo find "$AGENT_TOOLSDIRECTORY" -delete
sudo find /usr/share/dotnet -delete
sudo find /usr/local/lib/android -delete
df -h .
- name: Build & Push Image
id: build-push
uses: docker/bake-action@v3
with:
targets: ${{ matrix.target }}-${{ matrix.cuda }}-${{ matrix.torch }}
files: |
./docker-bake.hcl
${{ steps.meta.outputs.bake-file }}
push: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/main') || inputs.force-push }}
set: |
*.cache-from=type=gha
*.cache-to=type=gha,mode=min

0 comments on commit 1867cbd

Please sign in to comment.