# [checkpointio] fix zero optimizer async save memory #8784
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow name and triggers.
name: Build on PR

on:
  # Pull requests that target one of the listed branches AND touch at least
  # one of the listed paths. Entries starting with "!" exclude matches of the
  # preceding patterns (used here to ignore Markdown-only changes).
  pull_request:
    types: [synchronize, opened, reopened, ready_for_review, closed]
    branches:
      - "main"
      - "develop"
      - "feature/**"
    paths:
      - ".github/workflows/build_on_pr.yml"  # run command & env variables change
      - "colossalai/**"  # source code change
      - "!colossalai/**.md"  # ignore doc change
      - "op_builder/**"  # cuda extension change
      - "!op_builder/**.md"  # ignore doc change
      - "requirements/**"  # requirements change
      - "tests/**"  # test change
      - "!tests/**.md"  # ignore doc change
      - "pytest.ini"  # test config change
      - "setup.py"  # install command change
  # Branch/tag creation and deletion events, with no extra filters.
  create:
  delete:
jobs:
  # ---------------------------------------------------------------------------
  # detect: work out which files the pull request changed, so that the build
  # job can be skipped or can reuse its cached CUDA extension build.
  # ---------------------------------------------------------------------------
  detect:
    name: Detect file change
    # Only run for non-draft pull requests against the upstream repository,
    # and only for the actions that bring new code (not for "closed").
    if: |
      github.event_name == 'pull_request' &&
      (
        github.event.action == 'synchronize' ||
        github.event.action == 'opened' ||
        github.event.action == 'reopened' ||
        github.event.action == 'ready_for_review'
      ) &&
      github.event.pull_request.draft == false &&
      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
    outputs:
      # NOTE: "changedExtenisonFiles" is misspelled, but the name is kept as-is
      # because it is part of this job's output interface.
      changedExtenisonFiles: ${{ steps.find-extension-change.outputs.all_changed_files }}
      anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
      changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
      anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
      cancel-in-progress: true
    steps:
      # Full history (fetch-depth: 0) is needed for `git merge-base` below.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.pull_request.head.sha }}

      - name: Locate base commit
        id: locate-base-sha
        # NOTE(review): the merge base is always computed against origin/main,
        # even though PRs may also target "develop" or "feature/**" - confirm
        # this is intended.
        run: |
          curBranch=$(git rev-parse --abbrev-ref HEAD)
          commonCommit=$(git merge-base origin/main "$curBranch")
          echo "$commonCommit"
          echo "baseSHA=$commonCommit" >> "$GITHUB_OUTPUT"

      # NOTE(review): consider pinning this third-party action to a full
      # commit SHA instead of a mutable tag.
      - name: Find the changed extension-related files
        id: find-extension-change
        uses: tj-actions/changed-files@v35
        with:
          base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
          files: |
            op_builder/**
            colossalai/kernel/**
            setup.py

      - name: Find the changed library-related files
        id: find-lib-change
        uses: tj-actions/changed-files@v35
        with:
          base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
          files: |
            **/*.py
            **/*.h
            **/*.cpp
            **/*.cu
            **/*.txt

      - name: List changed files
        # The file lists are passed through environment variables rather than
        # being interpolated into the script text: file names come from the
        # pull request and must not be able to inject shell commands.
        env:
          CHANGED_EXTENSION_FILES: ${{ steps.find-extension-change.outputs.all_changed_files }}
          CHANGED_LIBRARY_FILES: ${{ steps.find-lib-change.outputs.all_changed_files }}
        run: |
          for file in $CHANGED_EXTENSION_FILES; do
            echo "$file was changed"
          done
          for file in $CHANGED_LIBRARY_FILES; do
            echo "$file was changed"
          done

  # ---------------------------------------------------------------------------
  # build: install TensorNVMe and Colossal-AI inside the CUDA container, run
  # the unit tests, and upload a small coverage report artifact.
  # ---------------------------------------------------------------------------
  build:
    name: Build and Test Colossal-AI
    needs: detect
    # Skip the GPU job entirely when no library-related file changed.
    if: needs.detect.outputs.anyLibraryFileChanged == 'true'
    runs-on: [self-hosted, gpu]
    container:
      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
      options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
    timeout-minutes: 90
    defaults:
      run:
        shell: bash
    concurrency:
      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test
      cancel-in-progress: true
    steps:
      - name: Checkout TensorNVMe
        uses: actions/checkout@v4
        with:
          repository: hpcaitech/TensorNVMe
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
          path: TensorNVMe

      - name: Restore TensorNVMe Cache
        run: |
          if [ -d /github/home/tensornvme_cache ] && [ -n "$(ls -A /github/home/tensornvme_cache/)" ]; then
            cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
          fi

      - name: Install TensorNVMe
        run: |
          cd TensorNVMe
          conda install cmake
          pip install -r requirements.txt
          DISABLE_URING=1 pip install -v --no-cache-dir .

      - name: Store TensorNVMe Cache
        run: |
          cd TensorNVMe
          # The restore step treats the cache directory as optional, so make
          # sure it exists before copying into it.
          mkdir -p /github/home/tensornvme_cache
          cp -p -r ./build /github/home/tensornvme_cache/
          cp -p -r ./cmake-build /github/home/tensornvme_cache/

      - name: Checkout Colossal-AI
        uses: actions/checkout@v4
        with:
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}

      - name: Restore Colossal-AI Cache
        # Reuse the cached extension build only when no extension-related
        # file changed in this pull request.
        if: needs.detect.outputs.anyExtensionFileChanged != 'true'
        run: |
          # -p flag is required to preserve the file timestamp to avoid ninja rebuild
          if [ -d /github/home/cuda_ext_cache ] && [ -n "$(ls -A /github/home/cuda_ext_cache/)" ]; then
            cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
          fi

      - name: Install Colossal-AI
        run: |
          BUILD_EXT=1 pip install -v -e .
          pip install --no-cache-dir -r requirements/requirements-test.txt

      - name: Store Colossal-AI Cache
        run: |
          # The restore step treats the cache directory as optional, so make
          # sure it exists before copying into it.
          mkdir -p /github/home/cuda_ext_cache
          # -p flag is required to preserve the file timestamp to avoid ninja rebuild
          cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/

      - name: Execute Unit Testing
        run: |
          CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
          -m "not largedist" \
          --durations=0 \
          --ignore tests/test_analyzer \
          --ignore tests/test_auto_parallel \
          --ignore tests/test_fx \
          --ignore tests/test_autochunk \
          --ignore tests/test_gptq \
          --ignore tests/test_infer_ops \
          --ignore tests/test_legacy \
          --ignore tests/test_smoothquant \
          tests/
        env:
          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
          LLAMA_PATH: /data/scratch/llama-tiny
          MOE_TENSOR_PATH: /data/scratch/moe_tensors

      - name: Collate artifact
        env:
          PR_NUMBER: ${{ github.event.number }}
          changedLibraryFiles: ${{ needs.detect.outputs.changedLibraryFiles }}
          anyLibraryFileChanged: ${{ needs.detect.outputs.anyLibraryFileChanged }}
          changedExtenisonFiles: ${{ needs.detect.outputs.changedExtenisonFiles }}
        run: |
          # -p: do not fail if the directory is left over from a previous run
          mkdir -p report
          echo "$PR_NUMBER" > ./report/pr_number
          # generate coverage.txt and cov_number if coverage data exists
          if [ "$anyLibraryFileChanged" == "true" ] && [ -e .coverage ]; then
            # Join the space-separated changed files into the comma-separated
            # list expected by `coverage report --include`.
            allFiles=""
            for file in $changedLibraryFiles; do
              if [ "$allFiles" == "" ]; then
                allFiles=$file
              else
                allFiles=$allFiles,$file
              fi
            done
            coverage report --data-file .coverage --include "$allFiles" > ./coverage.txt
            # The last line of the report ends with the total percentage.
            # [0-9]+ (not [1-9]*) so that values such as 80% or 100% are
            # captured in full.
            covPercentage=$(tail -n 1 coverage.txt | grep -oE '[0-9]+%$')
            # Strip the trailing '%' to keep only the number.
            covNum=${covPercentage%\%}
            mv coverage.txt ./report
            echo "$covNum" > ./report/cov_number
          else
            echo "No coverage report is generated"
          fi

      - name: Upload test coverage artifact
        uses: actions/upload-artifact@v4
        with:
          name: report
          path: report/