Skip to content

Commit

Permalink
Add E2E CI (#582)
Browse files Browse the repository at this point in the history
* add e2e CI

* modify the card id

* solve the concurrency issue

* add dependency for torchbench

* add dependency for torchbench

* modify the github.workspace path

* activate oneAPI

* git submodule update

* change triton path

* update triton build

* update triton commit

* modify test scope and yml name

* change the install e2e suites path

* change the install e2e suites path

* change use path

* change use path

* change use path

* add bash for e2e install

* add bash for e2e install

* add bash for e2e install

* add composite

* modify the e2e running way

* add inductor_xpu_test.sh

* modify the E2E test scripts

---------

Co-authored-by: Zhong, Ruijie <[email protected]>
  • Loading branch information
Stonepia and RUIJIEZHONG66166 authored Apr 4, 2024
1 parent 89eff3a commit ed69bf5
Show file tree
Hide file tree
Showing 8 changed files with 391 additions and 0 deletions.
104 changes: 104 additions & 0 deletions .github/actions/inductor-xpu-e2e-test/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Composite action: run one Inductor XPU E2E suite/dtype/mode/scenario
# combination, summarize the CSV results, upload them, and optionally
# enforce a minimum pass count.
name: inductor-xpu-e2e-test

inputs:
  suite:
    required: true
    type: string
    default: "huggingface"
    description: Dynamo benchmarks test suite, huggingface / timm_models / torchbench
  dt:
    required: true
    type: string
    default: "float32"
    description: Data precision of the test. float32 / bfloat16 / float16 / amp_fp16 / amp_bf16
  mode:
    required: true
    type: string
    default: "inference"
    description: inference / training test
  scenario:
    required: true
    type: string
    default: "accuracy"
    description: accuracy / performance test
  cards:
    required: false
    type: string
    default: "all"
    description: which cards can be used in the test
  expected_pass_num:
    required: false
    type: number
    description: for result check

runs:
  using: composite
  steps:
    - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
      shell: bash
      run: |
        source activate e2e_ci
        cp .github/scripts/inductor_xpu_test.sh ../pytorch
        cd ../pytorch
        # BUGFIX: the context is `inputs.*`, not `input.*` — the old expression
        # expanded to an empty string, so suite-specific deps never installed.
        if [[ "${{ inputs.suite }}" == "timm_models" ]]; then
          pip install --no-deps "git+https://github.com/rwightman/pytorch-image-models@b9d43c7dcac1fe05e851dd7be7187b108af593d2"
        elif [[ "${{ inputs.suite }}" == "torchbench" ]]; then
          pip install transformers==4.38.1 --no-deps
          pip install timm==0.9.7 --no-deps
          apt-get update -y
          # use apt-get consistently (apt warns about an unstable CLI in scripts)
          apt-get install -y libgl1-mesa-glx
          conda install -y git-lfs pyyaml pandas scipy psutil
          pip install tqdm pandas pyre-extensions torchrec tensorboardX dalle2_pytorch torch_geometric scikit-image matplotlib gym fastNLP doctr matplotlib opacus python-doctr higher opacus dominate kaldi-io librosa effdet pycocotools diffusers
          # reinstall to resolve a pyarrow/pandas ABI conflict pulled in above
          pip uninstall -y pyarrow pandas
          pip install pyarrow pandas
          cd ..
          git clone https://github.com/facebookresearch/detectron2.git
          python -m pip install -e detectron2
          git clone --recursive https://github.com/facebookresearch/multimodal.git multimodal
          pushd multimodal
          pip install -e .
          popd
        fi
        #TRANSFORMERS_COMMIT=$(cat .ci/docker/ci_commit_pins/huggingface.txt)
        #pip install --force-reinstall git+https://github.com/huggingface/transformers@${TRANSFORMERS_COMMIT}
        source /opt/intel/oneapi/setvars.sh
        #export PYTORCH_ENABLE_XPU_FALLBACK=1
        rm -rf inductor_log
        bash inductor_xpu_test.sh ${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }} xpu 3
    - name: Test Results Overview (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
      shell: bash
      run: |
        set +e
        cd ../pytorch/inductor_log/${{ inputs.suite }}
        cd ${{ inputs.dt }}
        echo -e "============ Summary for ${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }} ============" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
        # de-duplicate repeated CSV rows in place before counting
        awk -i inplace '!seen[$0]++' inductor_${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_xpu_${{ inputs.scenario }}.csv
        csv_lines=$(cat inductor_${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_xpu_${{ inputs.scenario }}.csv | wc -l)
        let num_total=csv_lines-1
        # NOTE(review): counts any line containing "pass" — assumes status
        # strings other than pass/fail never embed the substring "pass"
        num_passed=$(grep -c "pass" inductor_${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_xpu_${{ inputs.scenario }}.csv)
        let num_failed=num_total-num_passed
        pass_rate=`awk 'BEGIN{printf "%.2f%%\n",('$num_passed'/'$num_total')*100}'`
        echo "num_total: $num_total" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
        echo "num_passed: $num_passed" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
        echo "num_failed: $num_failed" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
        echo "pass_rate: $pass_rate" | tee -a ./${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log
        cd ${{ github.workspace }} && cp -r ../pytorch/inductor_log .
    - name: Upload Inductor XPU E2E CI Data (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
      uses: actions/upload-artifact@v4
      with:
        name: Inductor-XPU-E2E-CI-Data-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ github.event.pull_request.number || github.ref }}
        path: ${{ github.workspace }}/../pytorch/inductor_log
    - name: Test Results Check (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
      if: ${{ inputs.expected_pass_num }}
      shell: bash
      run: |
        cd ../pytorch/inductor_log/${{ inputs.suite }}
        cd ${{ inputs.dt }}
        # strip everything up to the colon, then ALL non-numeric chars (/g —
        # the old single substitution only removed the first one)
        num_passed=$(grep "num_passed:" ${{ inputs.suite }}_${{ inputs.dt }}_${{ inputs.mode }}_${{ inputs.scenario }}_e2e_summary.log | sed -e 's/.*://;s/[^0-9.]//g')
        # quote to avoid a [ syntax error when the grep finds nothing
        if [ "${num_passed:-0}" -lt ${{ inputs.expected_pass_num }} ]; then
          echo -e "[ERROR] Inductor E2E CI test for ${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} passed_num < ${{ inputs.expected_pass_num }}"
          exit 1
        fi
1 change: 1 addition & 0 deletions .github/ci_commit_pins/benchmark.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
fb0dfed4c8c8ab1c9816b02832f7a99d86ee4ca5
1 change: 1 addition & 0 deletions .github/ci_commit_pins/triton.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
7f56d84c8e70e2aa802ceafb07b5890161508d81
61 changes: 61 additions & 0 deletions .github/scripts/inductor-xpu-e2e.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/bash
# Install E2E benchmark dependencies, run the Inductor XPU E2E test, and
# summarize results into a per-suite log.
#
# Positional args (all optional):
#   $1 SUITE              huggingface / timm_models / torchbench (default: huggingface)
#   $2 DT                 data precision                          (default: amp_bf16)
#   $3 MODE               inference / training                    (default: inference)
#   $4 SCENARIO           accuracy / performance                  (default: accuracy)
#   $5 expected_pass_num  reserved for results-check.sh           (default: 46)
SUITE=${1:-huggingface}
DT=${2:-amp_bf16}
MODE=${3:-inference}
SCENARIO=${4:-accuracy}
expected_pass_num=${5:-46}

echo -e "========================================================================="
echo -e "Dependency Install"
echo -e "========================================================================="
source activate e2e_ci
cp .github/scripts/inductor_xpu_test.sh ../pytorch
cd ../pytorch
# BUGFIX: suite name and repo URL were mangled ("timm_MODEls"), so the timm
# branch never matched and the pip URL pointed at a nonexistent repository.
if [[ ${SUITE} == "timm_models" ]]; then
    pip install --no-deps "git+https://github.com/rwightman/pytorch-image-models@b9d43c7dcac1fe05e851dd7be7187b108af593d2"
elif [[ ${SUITE} == "torchbench" ]]; then
    pip install transformers==4.38.1 --no-deps
    pip install timm==0.9.7 --no-deps
    apt-get update -y
    apt install libgl1-mesa-glx -y
    conda install -y git-lfs pyyaml pandas scipy psutil
    pip install tqdm pandas pyre-extensions torchrec tensorboardX dalle2_pytorch torch_geometric scikit-image matplotlib gym fastNLP doctr matplotlib opacus python-doctr higher opacus dominate kaldi-io librosa effdet pycocotools diffusers
    # reinstall to resolve a pyarrow/pandas ABI conflict pulled in above
    pip uninstall -y pyarrow pandas
    pip install pyarrow pandas

    cd ..
    git clone https://github.com/facebookresearch/detectron2.git
    python -m pip install -e detectron2

    git clone --recursive https://github.com/facebookresearch/multimodal.git multimodal
    pushd multimodal
    pip install -e .
    popd
fi

#TRANSFORMERS_COMMIT=$(cat .ci/docker/ci_commit_pins/huggingface.txt)
#pip install --force-reinstall git+https://github.com/huggingface/transformers@${TRANSFORMERS_COMMIT}
echo -e "========================================================================="
echo -e "E2E Test"
echo -e "========================================================================="
source /opt/intel/oneapi/setvars.sh
# SECURITY: never hard-code tokens in the repository (the previous revision
# committed a live Hugging Face token). Supply it via the CI environment
# (e.g. a repository secret exported as HUGGING_FACE_HUB_TOKEN).
if [[ -z "${HUGGING_FACE_HUB_TOKEN:-}" ]]; then
    echo "[WARN] HUGGING_FACE_HUB_TOKEN is not set; gated model downloads may fail"
fi
#export PYTORCH_ENABLE_XPU_FALLBACK=1
rm -rf inductor_log
bash inductor_xpu_test.sh ${SUITE} ${DT} ${MODE} ${SCENARIO} xpu 3

echo -e "========================================================================="
echo -e "Test Results Summary"
echo -e "========================================================================="
cd ../pytorch/inductor_log/${SUITE}
cd ${DT}
echo -e "============ Summary for ${SUITE} ${DT} ${MODE} ${SCENARIO} ============" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
# de-duplicate repeated CSV rows in place before counting
awk -i inplace '!seen[$0]++' inductor_${SUITE}_${DT}_${MODE}_xpu_${SCENARIO}.csv
csv_lines=$(cat inductor_${SUITE}_${DT}_${MODE}_xpu_${SCENARIO}.csv | wc -l)
let num_total=csv_lines-1
num_passed=$(grep -c "pass" inductor_${SUITE}_${DT}_${MODE}_xpu_${SCENARIO}.csv)
let num_failed=num_total-num_passed
#pass_rate=`awk 'BEGIN{printf "%.2f%%\n",('$num_passed'/'$num_total')*100}'`
echo "num_total: $num_total" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
echo "num_passed: $num_passed" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
echo "num_failed: $num_failed" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
#echo "pass_rate: $pass_rate" | tee -a ./${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log
58 changes: 58 additions & 0 deletions .github/scripts/inductor_xpu_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#! /bin/bash
# This script works for xpu / cuda device inductor tests.
#
# Positional args:
#   $1 SUITE       huggingface / torchbench / timm_models
#   $2 DT          float32 / float16 / amp_bf16 / amp_fp16
#   $3 MODE        inference / training
#   $4 SCENARIO    accuracy / performance
#   $5 DEVICE      xpu / cuda
#   $6 CARD        0 / 1 / 2 / 3 ...
#   $7 SHAPE       static / dynamic
#   $8 NUM_SHARDS  number of test shards (optional)
#   $9 SHARD_ID    shard id (optional)
#   $10 MODEL_ONLY run a single model, e.g. GoogleFnet / T5Small (optional)

SUITE=${1:-huggingface} # huggingface / torchbench / timm_models
DT=${2:-float32} # float32 / float16 / amp_bf16 / amp_fp16
MODE=${3:-inference} # inference / training
SCENARIO=${4:-accuracy} # accuracy / performance
DEVICE=${5:-xpu} # xpu / cuda
CARD=${6:-0} # 0 / 1 / 2 / 3 ...
SHAPE=${7:-static} # static / dynamic
NUM_SHARDS=${8} # num test shards
SHARD_ID=${9} # shard id
MODEL_ONLY=${10} # GoogleFnet / T5Small

WORKSPACE=`pwd`
LOG_DIR=$WORKSPACE/inductor_log/${SUITE}/${DT}
mkdir -p ${LOG_DIR}
LOG_NAME=inductor_${SUITE}_${DT}_${MODE}_${DEVICE}_${SCENARIO}

Model_only_extra=""
if [[ -n "$MODEL_ONLY" ]]; then
    echo "Testing model ${MODEL_ONLY}"
    Model_only_extra="--only ${MODEL_ONLY}"
fi

Cur_Ver=`pip list | grep "^torch " | awk '{print $2}' | cut -d"+" -f 1`
# BUGFIX: use version sort (-V); a plain lexical sort mis-orders versions
# such as 2.10.0 vs 2.2.0. If torch <= 2.0.2, no extra flags; otherwise
# default to inference+freezing (PT 2.1+ behavior).
if [ "$(printf "${Cur_Ver}\n2.0.2" | sort -V | head -1)" = "${Cur_Ver}" ]; then
    Mode_extra="";
else
    # For PT 2.1
    Mode_extra="--inference --freezing ";
fi
if [[ $MODE == "training" ]]; then
    echo "Testing with training mode."
    Mode_extra="--training "
fi

Shape_extra=""
if [[ $SHAPE == "dynamic" ]]; then
    echo "Testing with dynamic shapes."
    Shape_extra="--dynamic-shapes --dynamic-batch-only "
fi

partition_flags=""
if [[ -n "$NUM_SHARDS" && -n "$SHARD_ID" ]]; then
    partition_flags="--total-partitions $NUM_SHARDS --partition-id $SHARD_ID "
fi

ulimit -n 1048576
# Collapse the three near-identical invocations: only the precision flag
# (and the INDUCTOR_AMP_DT export for amp_fp16) differs between branches.
if [[ $DT == "amp_bf16" ]]; then
    DT_flag="--amp"
elif [[ $DT == "amp_fp16" ]]; then
    export INDUCTOR_AMP_DT=float16
    DT_flag="--amp"
else
    DT_flag="--${DT}"
fi
ZE_AFFINITY_MASK=${CARD} python benchmarks/dynamo/${SUITE}.py --${SCENARIO} ${DT_flag} -d${DEVICE} -n10 --no-skip --dashboard ${Mode_extra} ${Shape_extra} ${partition_flags} ${Model_only_extra} --backend=inductor --timeout=4800 --output=${LOG_DIR}/${LOG_NAME}.csv 2>&1 | tee ${LOG_DIR}/${LOG_NAME}.log
68 changes: 68 additions & 0 deletions .github/scripts/install-e2e-suites/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Composite action: build and install the PyTorch domain libraries
# (vision / text / audio) at their pinned commits, plus the benchmark suite,
# into the e2e_ci conda environment. Each step clones into the directory
# ABOVE the workspace and reuses an existing checkout when present.
name: install-e2e-suites

runs:
  using: composite
  steps:
    - name: Torchvision Install
      shell: bash
      run: |
        source activate e2e_ci
        cd ../pytorch
        TORCH_VISION_PIN_COMMIT=$(cat .github/ci_commit_pins/vision.txt)
        cd ..
        if [ ! -d "vision" ]; then
          git clone --recursive https://github.com/pytorch/vision.git
        fi
        cd vision
        git checkout ${TORCH_VISION_PIN_COMMIT}
        conda install -y libpng jpeg
        # TODO: We use an older version ffmpeg to avoid the vision capability issue.
        conda install -y -c conda-forge 'ffmpeg<4.4'
        python setup.py install
        cd ..
    - name: Torchtext Install
      shell: bash
      run: |
        source activate e2e_ci
        cd ../pytorch
        TORCH_TEXT_PIN_COMMIT=$(cat .github/ci_commit_pins/text.txt)
        cd ..
        if [ ! -d "text" ]; then
          git clone --recursive https://github.com/pytorch/text.git
        fi
        # Torchtext
        cd text
        git checkout ${TORCH_TEXT_PIN_COMMIT}
        python setup.py clean install
        cd ..
    - name: Torchaudio Install
      shell: bash
      run: |
        source activate e2e_ci
        cd ../pytorch
        TORCH_AUDIO_PIN_COMMIT=$(cat .github/ci_commit_pins/audio.txt)
        cd ..
        # Torch audio
        if [ ! -d "audio" ]; then
          git clone --recursive https://github.com/pytorch/audio.git
        fi
        cd audio
        # Optionally `git checkout {pinned_commit}`
        # git checkout ${TORCH_AUDIO_PIN_COMMIT} break in pinned_commit
        python setup.py install
        cd ..
    - name: Benchmark Install
      shell: bash
      run: |
        source activate e2e_ci
        BENCHMARK_PINNED_COMMIT=$(cat .github/ci_commit_pins/benchmark.txt)
        cd ..
        if [ ! -d "benchmark" ]; then
          git clone --recursive https://github.com/weishi-deng/benchmark
        fi
        cd benchmark
        git checkout ${BENCHMARK_PINNED_COMMIT}
        python install.py
        pip install -e .
17 changes: 17 additions & 0 deletions .github/scripts/results-check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Check the E2E summary log and fail the job when fewer tests passed than
# expected.
#
# Positional args (all optional):
#   $1 SUITE              benchmark suite        (default: huggingface)
#   $2 DT                 data precision         (default: amp_bf16)
#   $3 MODE               inference / training   (default: inference)
#   $4 SCENARIO           accuracy / performance (default: accuracy)
#   $5 expected_pass_num  minimum pass count     (default: 46)
SUITE=${1:-huggingface}
DT=${2:-amp_bf16}
MODE=${3:-inference}
SCENARIO=${4:-accuracy}
expected_pass_num=${5:-46}

echo -e "========================================================================="
echo -e "Results Check"
echo -e "========================================================================="

cd ../pytorch/inductor_log/${SUITE}
cd ${DT}
# Strip everything up to the colon, then ALL non-numeric characters (/g —
# the old single substitution only removed the first one).
num_passed=$(grep "num_passed:" ${SUITE}_${DT}_${MODE}_${SCENARIO}_e2e_summary.log | sed -e 's/.*://;s/[^0-9.]//g')
# Quote and default to 0 so a missing/empty match fails the check instead of
# producing a `[: -lt: unary operator expected` syntax error.
if [ "${num_passed:-0}" -lt ${expected_pass_num} ]; then
    echo -e "[ERROR] Inductor E2E CI test for ${SUITE} ${DT} ${MODE} passed_num < ${expected_pass_num}"
    exit 1
fi
81 changes: 81 additions & 0 deletions .github/workflows/inductor_xpu_e2e_ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# E2E CI: build PyTorch + IPEX + Triton from pinned sources on a self-hosted
# XPU runner, install the benchmark suites, run the Huggingface amp_bf16
# inference accuracy test, and check the pass count.
name: E2E CI Tests

on:
  workflow_dispatch:
  pull_request:
    branches: [dev/triton-test-3.0]
  merge_group:
    branches: [dev/triton-test-3.0]
    types: [checks_requested]

permissions: read-all

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  Inductor-XPU-E2E-CI-Tests:
    runs-on: [self-hosted, Inductor_test]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Prepare Conda ENV
        run: |
          which conda
          # drop any stale env, then recreate; the grep guard is defensive in
          # case removal was skipped or partially failed
          conda remove -yn e2e_ci --all
          if conda env list | grep -q "^e2e_ci "; then source activate e2e_ci; else conda create -n e2e_ci python=3.9 cmake ninja -y; fi
          conda install intel::mkl-static intel::mkl-include -y
          pip install pandas scipy tqdm
      - name: Prepare Pytorch
        run: |
          source activate e2e_ci
          pwd
          cd ../ && rm -rf pytorch
          git clone -b dev/triton-test-3.0 https://github.com/Stonepia/pytorch.git pytorch
          cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive
          conda install -c conda-forge libstdcxx-ng -y
          pip install pyyaml
          pip install -r requirements.txt
          python setup.py develop
      - name: Prepare IPEX
        run: |
          source activate e2e_ci
          source /opt/intel/oneapi/setvars.sh
          python -c "import torch;print(f'torch version {torch.__version__}')"
          python -m pip uninstall intel_extension_for_pytorch -y
          export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
          cd ${{ github.workspace }}
          git submodule sync && git submodule update --init --recursive --jobs 0
          python -m pip install -r requirements.txt
          python setup.py bdist_wheel
          pip install --force-reinstall dist/*.whl
      - name: Triton Installation
        run: |
          source activate e2e_ci
          cd ${{ github.workspace }}
          TRITON_PINNED_COMMIT=$(cat .github/ci_commit_pins/triton.txt)
          # BUGFIX: define TRITON_REPO before echoing it — the old first echo
          # printed an empty repo; the duplicated echo is dropped.
          TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
          echo ${TRITON_REPO}@${TRITON_PINNED_COMMIT}
          cd ..
          pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
      - name: E2E dependency install
        uses: ./.github/scripts/install-e2e-suites

      - name: Huggingface AMP_BF16 Inference Accuracy Test
        run: |
          bash .github/scripts/inductor-xpu-e2e.sh
      - name: Upload Triton Inductor E2E CI Data
        # v4: v3 is deprecated, and v4 matches the version used by the
        # inductor-xpu-e2e-test action in this repo
        uses: actions/upload-artifact@v4
        with:
          name: Triton-Inductor-E2E-CI-Data
          # workspace-relative instead of a hard-coded runner home path, so the
          # workflow is not tied to one runner's directory layout
          path: ${{ github.workspace }}/../pytorch/inductor_log

      - name: Test Result Check
        run: |
          bash .github/scripts/results-check.sh

0 comments on commit ed69bf5

Please sign in to comment.