From ed69bf5da0d85ba363e9985cd2333a404460c631 Mon Sep 17 00:00:00 2001
From: Stonepia
Date: Thu, 4 Apr 2024 11:17:47 +0800
Subject: [PATCH] Add E2E CI (#582)

* add e2e CI
* modify the card id
* solve the concurrency issue
* add dependency for torchbench
* add dependency for torchbench
* modify the github.workspace path
* activate oneAPI
* git submodule update
* change triton path
* update triton build
* update triton commit
* modify test scope and yml name
* change the install e2e suites path
* change the install e2e suites path
* change use path
* change use path
* change use path
* add bash for e2e install
* add bash for e2e install
* add bash for e2e install
* add composite
* modify the way the e2e tests are run
* add inductor_xpu_test.sh
* modify the E2E test scripts

---------

Co-authored-by: Zhong, Ruijie
---
 .../actions/inductor-xpu-e2e-test/action.yml  | 104 ++++++++++++++++++
 .github/ci_commit_pins/benchmark.txt          |   1 +
 .github/ci_commit_pins/triton.txt             |   1 +
 .github/scripts/inductor-xpu-e2e.sh           |  61 ++++++++++
 .github/scripts/inductor_xpu_test.sh          |  58 ++++++++++
 .github/scripts/install-e2e-suites/action.yml |  68 ++++++++++++
 .github/scripts/results-check.sh              |  17 +++
 .github/workflows/inductor_xpu_e2e_ci.yml     |  81 ++++++++++++++
 8 files changed, 391 insertions(+)
 create mode 100644 .github/actions/inductor-xpu-e2e-test/action.yml
 create mode 100644 .github/ci_commit_pins/benchmark.txt
 create mode 100644 .github/ci_commit_pins/triton.txt
 create mode 100644 .github/scripts/inductor-xpu-e2e.sh
 create mode 100755 .github/scripts/inductor_xpu_test.sh
 create mode 100644 .github/scripts/install-e2e-suites/action.yml
 create mode 100644 .github/scripts/results-check.sh
 create mode 100644 .github/workflows/inductor_xpu_e2e_ci.yml

diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml
new file mode 100644
index 000000000..6fe330fa7
--- /dev/null
+++ b/.github/actions/inductor-xpu-e2e-test/action.yml
@@ -0,0 +1,104 @@
+name: inductor-xpu-e2e-test
+
+inputs:
+  suite:
+    required: true
+    type: string
+    default: "huggingface"
+    description: Dynamo benchmarks test suite, huggingface / timm_models / torchbench
+  dt:
+    required: true
+    type: string
+    default: "float32"
+    description: Data precision of the test, float32 / bfloat16 / float16 / amp_fp16 / amp_bf16
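+  # Note: amp_fp16 / amp_bf16 run through autocast; inductor_xpu_test.sh maps each
+  # dt value to the matching benchmark flags.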
+  mode:
+    required: true
+    type: string
+    default: "inference"
+    description: inference / training test
+  scenario:
+    required: true
+    type: string
+    default: "accuracy"
+    description: accuracy / performance test
+  cards:
+    required: false
+    type: string
+    default: "all"
+    description: which cards can be used in the test
+  expected_pass_num:
+    required: false
+    type: number
+    description: expected number of passing models, used for the result check
+
+runs:
+  using: composite
+  steps:
+    - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
+      shell: bash
+      run: |
+        source activate e2e_ci
+        cp .github/scripts/inductor_xpu_test.sh ../pytorch
+        cd ../pytorch
+        if [[ ${{ inputs.suite }} == "timm_models" ]]; then
+          pip install --no-deps "git+https://github.com/rwightman/pytorch-image-models@b9d43c7dcac1fe05e851dd7be7187b108af593d2"
+        elif [[ ${{ inputs.suite }} == "torchbench" ]]; then
+          pip install transformers==4.38.1 --no-deps
+          pip install timm==0.9.7 --no-deps
+          apt-get update -y
+          apt install libgl1-mesa-glx -y
+          conda install -y git-lfs pyyaml pandas scipy psutil
+          pip install tqdm pandas pyre-extensions torchrec tensorboardX dalle2_pytorch torch_geometric scikit-image matplotlib gym fastNLP doctr opacus python-doctr higher dominate kaldi-io librosa effdet pycocotools diffusers
+          pip uninstall -y pyarrow pandas
+          pip install pyarrow pandas
+
+          cd ..
+          git clone https://github.com/facebookresearch/detectron2.git
+          python -m pip install -e detectron2
+
+          git clone --recursive https://github.com/facebookresearch/multimodal.git multimodal
+          pushd multimodal
+          pip install -e .
+          popd
+        fi
+
+        #TRANSFORMERS_COMMIT=$(cat .ci/docker/ci_commit_pins/huggingface.txt)
+        #pip install --force-reinstall git+https://github.com/huggingface/transformers@${TRANSFORMERS_COMMIT}
+        source /opt/intel/oneapi/setvars.sh
+        #export PYTORCH_ENABLE_XPU_FALLBACK=1
+        rm -rf inductor_log
+        bash inductor_xpu_test.sh ${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }} xpu 3
+    - name: Test Results Overview (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
+      shell: bash
+      run: |
+        set +e
+        cd ../pytorch/inductor_log/${{ inputs.suite }}
+        cd ${{ inputs.dt }}
+        echo -e "============ Summary for ${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }} ============" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
+        # Deduplicate rows in place; the first CSV line is the header, so the model
+        # count is the line count minus one.
+        awk -i inplace '!seen[$0]++' inductor_${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_xpu_${{ inputs.scenario }}.csv
+        csv_lines=$(wc -l < inductor_${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_xpu_${{ inputs.scenario }}.csv)
+        let num_total=csv_lines-1
+        num_passed=$(grep -c "pass" inductor_${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_xpu_${{ inputs.scenario }}.csv)
+        let num_failed=num_total-num_passed
+        pass_rate=`awk 'BEGIN{printf "%.2f%%\n",('$num_passed'/'$num_total')*100}'`
+        echo "num_total: $num_total" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
+        echo "num_passed: $num_passed" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
+        echo "num_failed: $num_failed" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
+        echo "pass_rate: $pass_rate" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
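+        # Copy the logs back into the workspace so the artifact upload step below
+        # can reach them.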
+        cd ${{ github.workspace }} && cp -r ../pytorch/inductor_log .
+    - name: Upload Inductor XPU E2E CI Data (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
+      uses: actions/upload-artifact@v4
+      with:
+        name: Inductor-XPU-E2E-CI-Data-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ github.event.pull_request.number || github.ref }}
+        path: ${{ github.workspace }}/../pytorch/inductor_log
+    - name: Test Results Check (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
+      if: ${{ inputs.expected_pass_num }}
+      shell: bash
+      run: |
+        cd ../pytorch/inductor_log/${{ inputs.suite }}
+        cd ${{ inputs.dt }}
+        # Extract the integer after "num_passed:" from the summary log.
+        num_passed=$(grep "num_passed:" ${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log | sed -e 's/.*://;s/[^0-9.]//g')
+        if [ "$num_passed" -lt ${{ inputs.expected_pass_num }} ]; then
+          echo -e "[ERROR] Inductor E2E CI test for ${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} passed_num < ${{ inputs.expected_pass_num }}"
+          exit 1
+        fi
diff --git a/.github/ci_commit_pins/benchmark.txt b/.github/ci_commit_pins/benchmark.txt
new file mode 100644
index 000000000..cd3a6f25f
--- /dev/null
+++ b/.github/ci_commit_pins/benchmark.txt
@@ -0,0 +1 @@
+fb0dfed4c8c8ab1c9816b02832f7a99d86ee4ca5
diff --git a/.github/scripts/inductor-xpu-e2e.sh b/.github/ci_commit_pins/triton.txt b/.github/ci_commit_pins/triton.txt
new file mode 100644
index 000000000..a8aee01df
--- /dev/null
+++ b/.github/ci_commit_pins/triton.txt
@@ -0,0 +1 @@
+7f56d84c8e70e2aa802ceafb07b5890161508d81
diff --git a/.github/scripts/inductor-xpu-e2e.sh b/.github/scripts/inductor-xpu-e2e.sh
new file mode 100644
index 000000000..91248f7b8
--- /dev/null
+++ b/.github/scripts/inductor-xpu-e2e.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# End-to-end driver: installs suite-specific dependencies, runs the benchmarks,
+# and writes a summary log.
+SUITE=${1:-huggingface}
+DT=${2:-amp_bf16}
+MODE=${3:-inference}
+SCENARIO=${4:-accuracy}
+expected_pass_num=${5:-46}   # unused here; results-check.sh consumes the same argument order
+
+echo -e "========================================================================="
+echo -e "Dependency Install"
+echo -e "========================================================================="
+source activate e2e_ci
+cp .github/scripts/inductor_xpu_test.sh ../pytorch
+cd ../pytorch
+if [[ ${SUITE} == "timm_models" ]]; then
+  pip install --no-deps "git+https://github.com/rwightman/pytorch-image-models@b9d43c7dcac1fe05e851dd7be7187b108af593d2"
+elif [[ ${SUITE} == "torchbench" ]]; then
+  pip install transformers==4.38.1 --no-deps
+  pip install timm==0.9.7 --no-deps
+  apt-get update -y
+  apt install libgl1-mesa-glx -y
+  conda install -y git-lfs pyyaml pandas scipy psutil
+  pip install tqdm pandas pyre-extensions torchrec tensorboardX dalle2_pytorch torch_geometric scikit-image matplotlib gym fastNLP doctr opacus python-doctr higher dominate kaldi-io librosa effdet pycocotools diffusers
+  pip uninstall -y pyarrow pandas
+  pip install pyarrow pandas
+
+  cd ..
+  git clone https://github.com/facebookresearch/detectron2.git
+  python -m pip install -e detectron2
+
+  git clone --recursive https://github.com/facebookresearch/multimodal.git multimodal
+  pushd multimodal
+  pip install -e .
+  popd
+fi
+
+#TRANSFORMERS_COMMIT=$(cat .ci/docker/ci_commit_pins/huggingface.txt)
+#pip install --force-reinstall git+https://github.com/huggingface/transformers@${TRANSFORMERS_COMMIT}
+echo -e "========================================================================="
+echo -e "E2E Test"
+echo -e "========================================================================="
+source /opt/intel/oneapi/setvars.sh
+# The Hugging Face token must be provided by the runner environment (e.g. a CI
+# secret); never commit a real token to the repository.
+export HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:?"set HUGGING_FACE_HUB_TOKEN via a CI secret"}
+#export PYTORCH_ENABLE_XPU_FALLBACK=1
+rm -rf inductor_log
+bash inductor_xpu_test.sh ${SUITE} ${DT} ${MODE} ${SCENARIO} xpu 3
+
+echo -e "========================================================================="
+echo -e "Test Results Summary"
+echo -e "========================================================================="
+cd ../pytorch/inductor_log/${SUITE}
+cd ${DT}
+echo -e "============ Summary for ${SUITE} ${DT} ${MODE} ${SCENARIO} ============" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
+awk -i inplace '!seen[$0]++' inductor_${SUITE}_${DT}_${MODE}_xpu_${SCENARIO}.csv
+csv_lines=$(wc -l < inductor_${SUITE}_${DT}_${MODE}_xpu_${SCENARIO}.csv)
+let num_total=csv_lines-1
+num_passed=$(grep -c "pass" inductor_${SUITE}_${DT}_${MODE}_xpu_${SCENARIO}.csv)
+let num_failed=num_total-num_passed
+#pass_rate=`awk 'BEGIN{printf "%.2f%%\n",('$num_passed'/'$num_total')*100}'`
+echo "num_total: $num_total" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
+echo "num_passed: $num_passed" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
+echo "num_failed: $num_failed" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
+#echo "pass_rate: $pass_rate" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
diff --git a/.github/scripts/inductor_xpu_test.sh b/.github/scripts/inductor_xpu_test.sh
new file mode 100755
index 000000000..d49dc21bd
--- /dev/null
+++ b/.github/scripts/inductor_xpu_test.sh
@@ -0,0 +1,58 @@
+#! /bin/bash
+# This script works for xpu / cuda device inductor tests
+
+SUITE=${1:-huggingface}   # huggingface / torchbench / timm_models
+DT=${2:-float32}          # float32 / float16 / amp_bf16 / amp_fp16
+MODE=${3:-inference}      # inference / training
+SCENARIO=${4:-accuracy}   # accuracy / performance
+DEVICE=${5:-xpu}          # xpu / cuda
+CARD=${6:-0}              # 0 / 1 / 2 / 3 ...
+SHAPE=${7:-static}        # static / dynamic
+NUM_SHARDS=${8}           # num test shards
+SHARD_ID=${9}             # shard id
+MODEL_ONLY=${10}          # GoogleFnet / T5Small
+
+WORKSPACE=`pwd`
+LOG_DIR=$WORKSPACE/inductor_log/${SUITE}/${DT}
+mkdir -p ${LOG_DIR}
+LOG_NAME=inductor_${SUITE}_${DT}_${MODE}_${DEVICE}_${SCENARIO}
+
+Model_only_extra=""
+if [[ -n "$MODEL_ONLY" ]]; then
+  echo "Testing model ${MODEL_ONLY}"
+  Model_only_extra="--only ${MODEL_ONLY}"
+fi
+
+# Use a version sort (-V) so that e.g. 2.10.0 compares greater than 2.0.2.
+Cur_Ver=`pip list | grep "^torch " | awk '{print $2}' | cut -d"+" -f 1`
+if [ $(printf "${Cur_Ver}\n2.0.2" | sort -V | head -1) = "${Cur_Ver}" ]; then
+  Mode_extra=""
+else
+  # For PT 2.1+
+  Mode_extra="--inference --freezing "
+fi
+if [[ $MODE == "training" ]]; then
+  echo "Testing with training mode."
+  Mode_extra="--training "
+fi
+
+Shape_extra=""
+if [[ $SHAPE == "dynamic" ]]; then
+  echo "Testing with dynamic shapes."
+  Shape_extra="--dynamic-shapes --dynamic-batch-only "
+fi
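+# Illustrative usage (values are hypothetical): run shard 0 of 2 for huggingface
+# amp_bf16 inference accuracy on XPU card 1 with static shapes:
+#   bash inductor_xpu_test.sh huggingface amp_bf16 inference accuracy xpu 1 static 2 0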
+
+partition_flags=""
+if [[ -n "$NUM_SHARDS" && -n "$SHARD_ID" ]]; then
+  partition_flags="--total-partitions $NUM_SHARDS --partition-id $SHARD_ID "
+fi
+
+# Raise the open-file limit; large test runs can exhaust the default.
+ulimit -n 1048576
+if [[ $DT == "amp_bf16" ]]; then
+  ZE_AFFINITY_MASK=${CARD} python benchmarks/dynamo/${SUITE}.py --${SCENARIO} --amp -d${DEVICE} -n10 --no-skip --dashboard ${Mode_extra} ${Shape_extra} ${partition_flags} ${Model_only_extra} --backend=inductor --timeout=4800 --output=${LOG_DIR}/${LOG_NAME}.csv 2>&1 | tee ${LOG_DIR}/${LOG_NAME}.log
+elif [[ $DT == "amp_fp16" ]]; then
+  export INDUCTOR_AMP_DT=float16
+  ZE_AFFINITY_MASK=${CARD} python benchmarks/dynamo/${SUITE}.py --${SCENARIO} --amp -d${DEVICE} -n10 --no-skip --dashboard ${Mode_extra} ${Shape_extra} ${partition_flags} ${Model_only_extra} --backend=inductor --timeout=4800 --output=${LOG_DIR}/${LOG_NAME}.csv 2>&1 | tee ${LOG_DIR}/${LOG_NAME}.log
+else
+  ZE_AFFINITY_MASK=${CARD} python benchmarks/dynamo/${SUITE}.py --${SCENARIO} --${DT} -d${DEVICE} -n10 --no-skip --dashboard ${Mode_extra} ${Shape_extra} ${partition_flags} ${Model_only_extra} --backend=inductor --timeout=4800 --output=${LOG_DIR}/${LOG_NAME}.csv 2>&1 | tee ${LOG_DIR}/${LOG_NAME}.log
+fi
diff --git a/.github/scripts/install-e2e-suites/action.yml b/.github/scripts/install-e2e-suites/action.yml
new file mode 100644
index 000000000..8d0c389fe
--- /dev/null
+++ b/.github/scripts/install-e2e-suites/action.yml
@@ -0,0 +1,68 @@
+name: install-e2e-suites
+
+runs:
+  using: composite
+  steps:
+    - name: Torchvision Install
+      shell: bash
+      run: |
+        source activate e2e_ci
+        cd ../pytorch
+        TORCH_VISION_PIN_COMMIT=$(cat .github/ci_commit_pins/vision.txt)
+        cd ..
+        if [ ! -d "vision" ]; then
+          git clone --recursive https://github.com/pytorch/vision.git
+        fi
+        cd vision
+        git checkout ${TORCH_VISION_PIN_COMMIT}
+        conda install -y libpng jpeg
+        # TODO: We pin an older ffmpeg to avoid the video capability issue in torchvision.
+        conda install -y -c conda-forge 'ffmpeg<4.4'
+        python setup.py install
+        cd ..
+    - name: Torchtext Install
+      shell: bash
+      run: |
+        source activate e2e_ci
+        cd ../pytorch
+        TORCH_TEXT_PIN_COMMIT=$(cat .github/ci_commit_pins/text.txt)
+        cd ..
+        if [ ! -d "text" ]; then
+          git clone --recursive https://github.com/pytorch/text.git
+        fi
+        # Torchtext
+        cd text
+        git checkout ${TORCH_TEXT_PIN_COMMIT}
+        python setup.py clean install
+        cd ..
+    - name: Torchaudio Install
+      shell: bash
+      run: |
+        source activate e2e_ci
+        cd ../pytorch
+        TORCH_AUDIO_PIN_COMMIT=$(cat .github/ci_commit_pins/audio.txt)
+        cd ..
+        # Torchaudio
+        if [ ! -d "audio" ]; then
+          git clone --recursive https://github.com/pytorch/audio.git
+        fi
+        cd audio
+        # The pinned commit currently breaks the build, so the checkout is skipped:
+        # git checkout ${TORCH_AUDIO_PIN_COMMIT}
+        python setup.py install
+        cd ..
+    - name: Benchmark Install
+      shell: bash
+      run: |
+        source activate e2e_ci
+        BENCHMARK_PINNED_COMMIT=$(cat .github/ci_commit_pins/benchmark.txt)
+        cd ..
+        if [ ! -d "benchmark" ]; then
+          git clone --recursive https://github.com/weishi-deng/benchmark
+        fi
+        cd benchmark
+        git checkout ${BENCHMARK_PINNED_COMMIT}
+        python install.py
+        pip install -e .
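+        # The editable install lets the dynamo benchmark runner import the torchbench
+        # models directly from this checkout.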
diff --git a/.github/scripts/results-check.sh b/.github/scripts/results-check.sh
new file mode 100644
index 000000000..08353e689
--- /dev/null
+++ b/.github/scripts/results-check.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+SUITE=${1:-huggingface}
+DT=${2:-amp_bf16}
+MODE=${3:-inference}
+SCENARIO=${4:-accuracy}
+expected_pass_num=${5:-46}
+
+echo -e "========================================================================="
+echo -e "Results Check"
+echo -e "========================================================================="
+
+cd ../pytorch/inductor_log/${SUITE}
+cd ${DT}
+# Extract the integer after "num_passed:" from the summary log.
+num_passed=$(grep "num_passed:" ${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log | sed -e 's/.*://;s/[^0-9.]//g')
+if [ "$num_passed" -lt ${expected_pass_num} ]; then
+  echo -e "[ERROR] Inductor E2E CI test for ${SUITE} ${DT} ${MODE} passed_num < ${expected_pass_num}"
+  exit 1
+fi
diff --git a/.github/workflows/inductor_xpu_e2e_ci.yml b/.github/workflows/inductor_xpu_e2e_ci.yml
new file mode 100644
index 000000000..9025eb345
--- /dev/null
+++ b/.github/workflows/inductor_xpu_e2e_ci.yml
@@ -0,0 +1,81 @@
+name: E2E CI Tests
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches: [dev/triton-test-3.0]
+  merge_group:
+    branches: [dev/triton-test-3.0]
+    types: [checks_requested]
+
+permissions: read-all
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  Inductor-XPU-E2E-CI-Tests:
+    runs-on: [self-hosted, Inductor_test]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Prepare Conda ENV
+        run: |
+          which conda
+          # Start from a clean environment: drop any stale e2e_ci env, then recreate it.
+          conda remove -yn e2e_ci --all
+          if conda env list | grep -q "^e2e_ci "; then source activate e2e_ci; else conda create -n e2e_ci python=3.9 cmake ninja -y; fi
+          conda install intel::mkl-static intel::mkl-include -y
+          pip install pandas scipy tqdm
+      - name: Prepare PyTorch
+        run: |
+          source activate e2e_ci
+          pwd
+          cd ../ && rm -rf pytorch
+          git clone -b dev/triton-test-3.0 https://github.com/Stonepia/pytorch.git pytorch
+          cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive
+          conda install -c conda-forge libstdcxx-ng -y
+          pip install pyyaml
+          pip install -r requirements.txt
+          python setup.py develop
+
+      - name: Prepare IPEX
+        run: |
+          source activate e2e_ci
+          source /opt/intel/oneapi/setvars.sh
+          python -c "import torch;print(f'torch version {torch.__version__}')"
+          python -m pip uninstall intel_extension_for_pytorch -y
+          export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
+          cd ${{ github.workspace }}
+          git submodule sync && git submodule update --init --recursive --jobs 0
+          python -m pip install -r requirements.txt
+          python setup.py bdist_wheel
+          pip install --force-reinstall dist/*.whl
+
+      - name: Triton Installation
+        run: |
+          source activate e2e_ci
+          TRITON_PINNED_COMMIT=$(cat .github/ci_commit_pins/triton.txt)
+          cd ${{ github.workspace }}
+          cd ..
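+          # Build and install the Intel XPU backend for Triton at the pinned commit;
+          # the package is built from the python/ subdirectory of the Triton repo.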
+          TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
+          echo ${TRITON_REPO}@${TRITON_PINNED_COMMIT}
+          pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
+
+      - name: E2E dependency install
+        uses: ./.github/scripts/install-e2e-suites
+
+      - name: Huggingface AMP_BF16 Inference Accuracy Test
+        run: |
+          bash .github/scripts/inductor-xpu-e2e.sh
+
+      - name: Upload Triton Inductor E2E CI Data
+        uses: actions/upload-artifact@v4
+        with:
+          name: Triton-Inductor-E2E-CI-Data
+          path: /home/ipex/actions-runner/_work/intel-extension-for-pytorch/pytorch/inductor_log/
+
+      - name: Test Result Check
+        run: |
+          bash .github/scripts/results-check.sh
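+          # The check fails this job when num_passed in the summary log drops below
+          # the expected threshold (46 by default); see results-check.sh above.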