From 1b68776e24ed90a5b48edbfca1e85b06128ab625 Mon Sep 17 00:00:00 2001 From: Yeounoh Chung Date: Tue, 26 Sep 2023 16:24:36 -0700 Subject: [PATCH] CI hotfix - Do not parallelize coverage runs (#5648) * Do not parallelize coverage runs * Enable coverage only for master push & pr --- .github/workflows/_coverage.yml | 135 +++++++++++++++++++++++++++ .github/workflows/build_and_test.yml | 4 +- 2 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/_coverage.yml diff --git a/.github/workflows/_coverage.yml b/.github/workflows/_coverage.yml new file mode 100644 index 00000000000..4643e225314 --- /dev/null +++ b/.github/workflows/_coverage.yml @@ -0,0 +1,135 @@ +name: xla-test-coverage +on: + workflow_call: + inputs: + docker-image: + required: true + type: string + description: Image to test on + runner: + required: false + type: string + description: Runner type for the test + default: linux.12xlarge + collect-coverage: + required: false + type: boolean + description: Set to true to collect coverage information + default: false + timeout-minutes: + required: false + type: number + default: 270 + description: | + Set the maximum (in minutes) how long the workflow should take to finish + disable-pjrt: + required: false + type: string + default: 0 + description: Whether to disable PJRT tests + disable-xrt: + required: false + type: string + default: 0 + description: Whether to disable XRT tests + test-script: + required: false + type: string + default: test.sh + description: Which test script to run + + secrets: + gcloud-service-key: + required: true + description: Secret to access Bazel build cache +jobs: + test: + runs-on: ${{ inputs.runner }} + timeout-minutes: ${{ inputs.timeout-minutes }} + env: + DOCKER_IMAGE: ${{ inputs.docker-image }} + WORKDIR: /var/lib/jenkins/workspace + GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }} + USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }} + XLA_SKIP_XRT_TESTS: ${{ inputs.disable-xrt }} + XLA_SKIP_TORCH_OP_TESTS: ${{ inputs.disable-pjrt }} + XLA_SKIP_MP_OP_TESTS: ${{ inputs.disable-pjrt }} + steps: + - name: Setup Linux + uses: pytorch/test-infra/.github/actions/setup-linux@main + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + instructions: | + Tests are done inside the container, to start an interactive session run: + docker exec -it $(docker container ps --format '{{.ID}}') bash + - name: Install gcloud CLI + if: ${{ inputs.collect-coverage }} + shell: bash + run: | + sudo tee -a /etc/yum.repos.d/google-cloud-sdk.repo << EOM + [google-cloud-cli] + name=Google Cloud CLI + baseurl=https://packages.cloud.google.com/yum/repos/cloud-sdk-el8-x86_64 + enabled=1 + gpgcheck=1 + repo_gpgcheck=0 + gpgkey=https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg + EOM + sudo yum install -y google-cloud-cli + - name: Auth to GCR + if: ${{ inputs.collect-coverage }} + shell: bash + run: | + echo "${GCLOUD_SERVICE_KEY}" | gcloud auth activate-service-account --key-file=- + - name: Download and run docker image from GCR + shell: bash + run: | + echo "DOCKER_IMAGE: ${DOCKER_IMAGE}" + docker pull "${DOCKER_IMAGE}" + pid=$(docker run --shm-size=16g ${GPU_FLAG:-} -e USE_COVERAGE -e XLA_SKIP_XRT_TESTS -e XLA_SKIP_TORCH_OP_TESTS -e XLA_SKIP_MP_OP_TESTS -t -d -w "$WORKDIR" "${DOCKER_IMAGE}") + echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> /tmp/pytorch/xla/default_credentials.json" + echo "pid=${pid}" >> "${GITHUB_ENV}" + - name: Test + shell: bash + run: | + docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}' + - name: Upload coverage results + if: ${{ inputs.collect-coverage }} + shell: bash + env: + CIRCLE_WORKFLOW_ID: ${{ github.run_id }} + CIRCLE_BUILD_NUM: ${{ github.run_number }} + run: | + docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}" + if [ -n "${GPU_FLAG:-}" ]; then + # Python + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage.out + + # CPP + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage.out + else + # Python + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage.out + + # CPP + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage.out + + ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' + echo $ABS_METADATA > abs_metadata.json + gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json + + INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' + echo $INC_METADATA > inc_metadata.json + gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json + fi + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8367dc86065..83277c8c96a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -51,7 +51,7 @@ jobs: test-cpu-coverage: name: "Collect CPU test coverage" if: github.event_name == 'push' && github.event.ref == 'refs/heads/master' - uses: ./.github/workflows/_test.yml + uses: ./.github/workflows/_coverage.yml needs: build with: docker-image: ${{ needs.build.outputs.docker-image }} @@ -64,7 +64,7 @@ jobs: test-gpu-coverage: name: "Collect GPU test coverage" if: github.event_name == 'push' && github.event.ref == 'refs/heads/master' - uses: ./.github/workflows/_test.yml + uses: ./.github/workflows/_coverage.yml needs: build with: docker-image: ${{ needs.build.outputs.docker-image }}