[Feature] The first PR to Add TP inference engine, kv-cache manager and related kernels for our inference system #3132
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Build on PR | |
on: | |
pull_request: | |
types: [synchronize, opened, reopened, ready_for_review, closed, edited] | |
branches: | |
- "main" | |
- "develop" | |
- "feature/**" | |
paths: | |
- ".github/workflows/build_on_pr.yml" # run command & env variables change | |
- "colossalai/**" # source code change | |
- "!colossalai/**.md" # ignore doc change | |
- "op_builder/**" # cuda extension change | |
- "!op_builder/**.md" # ignore doc change | |
- "requirements/**" # requirements change | |
- "tests/**" # test change | |
- "!tests/**.md" # ignore doc change | |
- "pytest.ini" # test config change | |
- "setup.py" # install command change | |
create: | |
delete: | |
jobs: | |
prepare_cache: | |
name: Prepare testmon cache | |
if: | | |
github.event_name == 'create' && | |
github.event.ref_type == 'branch' && | |
github.event.repository.full_name == 'hpcaitech/ColossalAI' | |
runs-on: [self-hosted, gpu] | |
container: | |
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 | |
options: --rm | |
timeout-minutes: 5 | |
defaults: | |
run: | |
shell: bash | |
steps: | |
- name: Copy testmon cache | |
run: | # branch name may contain slash, we need to replace it with space | |
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /") | |
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then | |
cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}" | |
fi | |
env: | |
MAIN_BRANCH: ${{ github.event.master_branch }} | |
prepare_cache_for_pr: | |
name: Prepare testmon cache for PR | |
if: | | |
github.event_name == 'pull_request' && | |
(github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) && | |
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' | |
runs-on: [self-hosted, gpu] | |
container: | |
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 | |
options: --rm | |
timeout-minutes: 5 | |
defaults: | |
run: | |
shell: bash | |
concurrency: | |
group: ${{ github.head_ref }} | |
cancel-in-progress: false | |
steps: | |
- name: Copy testmon cache | |
run: | # branch name may contain slash, we need to replace it with space | |
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") | |
if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then | |
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER} | |
fi | |
env: | |
PR_NUMBER: ${{ github.event.number }} | |
detect: | |
name: Detect file change | |
if: | | |
github.event_name == 'pull_request' && | |
(github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'ready_for_review') && | |
github.event.pull_request.draft == false && | |
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' | |
outputs: | |
changedExtenisonFiles: ${{ steps.find-extension-change.outputs.all_changed_files }} | |
anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }} | |
changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }} | |
anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }} | |
runs-on: ubuntu-latest | |
concurrency: | |
group: ${{ github.head_ref }} | |
cancel-in-progress: false | |
steps: | |
- uses: actions/checkout@v2 | |
with: | |
fetch-depth: 0 | |
ref: ${{ github.event.pull_request.head.sha }} | |
- name: Locate base commit | |
id: locate-base-sha | |
run: | | |
curBranch=$(git rev-parse --abbrev-ref HEAD) | |
commonCommit=$(git merge-base origin/main $curBranch) | |
echo $commonCommit | |
echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT | |
- name: Find the changed extension-related files | |
id: find-extension-change | |
uses: tj-actions/changed-files@v35 | |
with: | |
base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }} | |
files: | | |
op_builder/** | |
colossalai/kernel/** | |
setup.py | |
- name: Find the changed library-related files | |
id: find-lib-change | |
uses: tj-actions/changed-files@v35 | |
with: | |
base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }} | |
files: | | |
**/*.py | |
**/*.h | |
**/*.cpp | |
**/*.cu | |
**/*.txt | |
- name: List changed files | |
run: | | |
for file in ${{ steps.find-extension-change.outputs.all_changed_files }}; do | |
echo "$file was changed" | |
done | |
for file in ${{ steps.find-lib-change.outputs.all_changed_files }}; do | |
echo "$file was changed" | |
done | |
build: | |
name: Build and Test Colossal-AI | |
needs: detect | |
if: needs.detect.outputs.anyLibraryFileChanged == 'true' | |
runs-on: [self-hosted, gpu] | |
container: | |
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 | |
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 | |
timeout-minutes: 60 | |
defaults: | |
run: | |
shell: bash | |
concurrency: | |
group: ${{ github.head_ref }} | |
cancel-in-progress: false | |
steps: | |
- name: Checkout TensorNVMe | |
uses: actions/checkout@v2 | |
with: | |
repository: hpcaitech/TensorNVMe | |
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} | |
path: TensorNVMe | |
- name: Restore TensorNVMe Cache | |
run: | | |
if [ -d /github/home/tensornvme_cache ] && [ ! -z "$(ls -A /github/home/tensornvme_cache/)" ]; then | |
cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe | |
fi | |
- name: Install TensorNVMe | |
run: | | |
cd TensorNVMe | |
conda install cmake | |
pip install -r requirements.txt | |
pip install -v . | |
- name: Store TensorNVMe Cache | |
run: | | |
cd TensorNVMe | |
cp -p -r ./build /github/home/tensornvme_cache/ | |
- name: Checkout Colossal-AI | |
uses: actions/checkout@v2 | |
with: | |
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} | |
- name: Restore Colossal-AI Cache | |
if: needs.detect.outputs.anyExtensionFileChanged != 'true' | |
run: | | |
# -p flag is required to preserve the file timestamp to avoid ninja rebuild | |
if [ -d /github/home/cuda_ext_cache ] && [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ]; then | |
cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/ | |
fi | |
- name: Install Colossal-AI | |
run: | | |
CUDA_EXT=1 pip install -v -e . | |
pip install -r requirements/requirements-test.txt | |
- name: Store Colossal-AI Cache | |
run: | | |
# -p flag is required to preserve the file timestamp to avoid ninja rebuild | |
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/ | |
- name: Restore Testmon Cache | |
run: | | |
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then | |
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/ | |
fi | |
env: | |
PR_NUMBER: ${{ github.event.number }} | |
- name: Execute Unit Testing | |
run: | | |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. --durations=10 tests/ | |
env: | |
DATA: /data/scratch/cifar-10 | |
NCCL_SHM_DISABLE: 1 | |
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 | |
TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt | |
- name: Store Testmon Cache | |
run: | | |
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} | |
cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/ | |
env: | |
PR_NUMBER: ${{ github.event.number }} | |
- name: Collate artifact | |
env: | |
PR_NUMBER: ${{ github.event.number }} | |
changedLibraryFiles: ${{ needs.detect.outputs.changedLibraryFiles }} | |
anyLibraryFileChanged: ${{ needs.detect.outputs.anyLibraryFileChanged }} | |
changedExtenisonFiles: ${{ needs.detect.outputs.changedExtenisonFiles }} | |
run: | | |
mkdir report | |
echo $PR_NUMBER > ./report/pr_number | |
# generate coverage.xml if any | |
if [ "$anyLibraryFileChanged" == "true" ] && [ -e .coverage ]; then | |
allFiles="" | |
for file in $changedLibraryFiles; do | |
if [ "$allFiles" == "" ]; then | |
allFiles=$file | |
else | |
allFiles=$allFiles,$file | |
fi | |
done | |
coverage report --data-file .coverage --include $allFiles > ./coverage.txt | |
covPercentage=$(tail -n 1 coverage.txt | grep -o '[1-9]*%$') | |
covNum=${covPercentage::-1} | |
mv coverage.txt ./report | |
echo $covNum > ./report/cov_number | |
else | |
echo "No coverage report is generated" | |
fi | |
- name: Upload test coverage artifact | |
uses: actions/upload-artifact@v3 | |
with: | |
name: report | |
path: report/ | |
store_cache: | |
name: Store testmon cache for PR | |
if: | | |
github.event_name == 'pull_request' && | |
github.event.action == 'closed' && | |
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' | |
runs-on: [self-hosted, gpu] | |
container: | |
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 | |
options: --rm | |
timeout-minutes: 5 | |
defaults: | |
run: | |
shell: bash | |
steps: | |
- name: Store testmon cache if possible | |
if: github.event.pull_request.merged == true | |
run: | # branch name may contain slash, we need to replace it with space | |
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") | |
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then | |
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/" | |
fi | |
env: | |
PR_NUMBER: ${{ github.event.pull_request.number }} | |
- name: Remove testmon cache | |
run: | | |
rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER} | |
env: | |
PR_NUMBER: ${{ github.event.pull_request.number }} | |
remove_cache: | |
name: Remove testmon cache | |
if: | | |
github.event_name == 'delete' && | |
github.event.ref_type == 'branch' && | |
github.event.repository.full_name == 'hpcaitech/ColossalAI' | |
runs-on: [self-hosted, gpu] | |
container: | |
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 | |
options: --rm | |
timeout-minutes: 5 | |
defaults: | |
run: | |
shell: bash | |
steps: | |
- name: Remove testmon cache | |
run: | # branch name may contain slash, we need to replace it with space | |
export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /") | |
rm -rf "/github/home/testmon_cache/${BASE}" |