From 9a1f121b4ecb78dedfd4cdccc99e552d5bcb46df Mon Sep 17 00:00:00 2001 From: Yifei Teng Date: Wed, 23 Oct 2024 22:53:01 +0000 Subject: [PATCH] Add nightly C++11 ABI builds Initially, PyTorch/XLA was built with the C++11 ABI, which is the more modern ABI (it enables, e.g., small string optimizations). However, that conflicted with the ABI of upstream PyTorch, so https://github.com/pytorch/xla/pull/5650 disabled the C++11 ABI. It turns out that the C++11 ABI improves LazyTensor tracing performance drastically for some large models. We can't switch the default builds back to the C++11 ABI because of the conflict with upstream, so this PR adds additional nightly build variants that enable the C++11 ABI alongside the existing wheels and Docker images. The Docker images will be tagged like "nightly_3.10_tpuvm_cxx11_20241023" instead of "nightly_3.10_tpuvm_20241023". The wheels will be named like "torch_xla-2.6.0.dev20241023.cxx11-cp310-cp310-linux_x86_64.whl" instead of "torch_xla-2.6.0.dev20241023-cp310-cp310-linux_x86_64.whl". This PR also adds support for building C++11 ABI variants for stable branches, but we don't activate it yet. This PR also removes the Python 3.8 nightly builds. Fixes #8306. 
--- infra/ansible/config/env.yaml | 2 +- infra/ansible/config/vars.yaml | 2 ++ .../ansible/roles/build_srcs/tasks/main.yaml | 36 ++++++++++++++++++- infra/tpu-pytorch-releases/README.md | 6 +++- .../artifacts.auto.tfvars | 19 +++++++++- .../tpu-pytorch-releases/artifacts_builds.tf | 18 ++++++---- 6 files changed, 73 insertions(+), 10 deletions(-) diff --git a/infra/ansible/config/env.yaml b/infra/ansible/config/env.yaml index 909e96a5189..75032fed101 100644 --- a/infra/ansible/config/env.yaml +++ b/infra/ansible/config/env.yaml @@ -32,7 +32,7 @@ build_env: XLA_SANDBOX_BUILD: 1 BAZEL_REMOTE_CACHE: "{{ lookup('env', 'BAZEL_REMOTE_CACHE') | default('1', True) }}" SILO_NAME: "cache-silo-{{ arch }}-{{ accelerator }}-{{ clang_version }}{{ cache_suffix }}" - _GLIBCXX_USE_CXX11_ABI: 0 + _GLIBCXX_USE_CXX11_ABI: "{{ '1' if (cxx11_abi | bool) else '0' }}" GIT_VERSIONED_XLA_BUILD: "{{ nightly_release or git_versioned_xla_build }}" amd64: diff --git a/infra/ansible/config/vars.yaml b/infra/ansible/config/vars.yaml index 2b641fc4ac4..a77f88b2c07 100644 --- a/infra/ansible/config/vars.yaml +++ b/infra/ansible/config/vars.yaml @@ -18,3 +18,5 @@ cache_suffix: "" build_cpp_tests: 0 # Whether to tag wheels with git hash, e.g. X.Y.Z+git123abc git_versioned_xla_build: false +# Whether to use C++11 ABI when building torch and torch_xla. 
+cxx11_abi: false diff --git a/infra/ansible/roles/build_srcs/tasks/main.yaml b/infra/ansible/roles/build_srcs/tasks/main.yaml index d95656209ca..e0a6f2f107d 100644 --- a/infra/ansible/roles/build_srcs/tasks/main.yaml +++ b/infra/ansible/roles/build_srcs/tasks/main.yaml @@ -82,7 +82,7 @@ # group 1: prefix stop at first -: `torch_xla` or `torch`; # group 2: anything after first - before next + or -: `2.5.0`; # group 3 (can be empty): any thing after + before next -: `+gitac7fd44`; - # group 4 (can be empty): group 3 without +: `gitac7fd441`; + # group 4 (can be empty): group 3 without +: `gitac7fd441`; # group 5: anything from next - to the end: `cp310-cp310-linux_x86_64.whl`. rename -v "s/^(.*?)\-(.*?)(\+([^ -]+))?\-(.+)/\1-\2.dev-\5/" *.whl @@ -101,6 +101,32 @@ - { dir: "{{ (src_root, 'pytorch/xla/dist') | path_join }}", prefix: "torch_xla" } when: nightly_release +- name: Add cxx11 suffix to wheels built with C++11 ABI + ansible.builtin.shell: | + # Given names like + # + # torch_xla-2.5.0.dev20240819-cp310-cp310-linux_x86_64.whl + # torch_xla-2.4.0-cp311-cp311-manylinux_2_28_x86_64.whl + # torch-2.5.0+libtpu-cp310-cp310-linux_x86_64.whl + # torch-2.5.0-cp311-cp311-linux_x86_64.whl + # + # we want to rename it to + # + # torch_xla-2.5.0.dev20240819.cxx11-cp310-cp310-linux_x86_64.whl + # torch_xla-2.4.0.cxx11-cp311-cp311-manylinux_2_28_x86_64.whl + # torch-2.5.0+libtpu.cxx11-cp310-cp310-linux_x86_64.whl + # torch-2.5.0.cxx11-cp311-cp311-linux_x86_64.whl + # + # essentially adding .cxx11 before the -cp39, -cp310, -cp311 etc identifiers. 
+ rename -v "s/^(.+?)(-cp\d+)/\1.cxx11\2/" *.whl + args: + executable: /bin/bash + chdir: "{{ item.dir }}" + loop: + - { dir: "{{ (src_root, 'pytorch/dist') | path_join }}", prefix: "torch" } + - { dir: "{{ (src_root, 'pytorch/xla/dist') | path_join }}", prefix: "torch_xla" } + when: cxx11_abi | bool + - name: Copy wheels to /dist ansible.builtin.shell: "cp {{ item }}/*.whl /dist" args: @@ -141,6 +167,14 @@ chdir: /dist environment: "{{ env_vars }}" +- name: Add cxx11 suffix to Torchvision wheels built with C++11 ABI + ansible.builtin.shell: | + rename -v "s/^torchvision(.+?)(-cp\d+)/torchvision\1.cxx11\2/" *.whl + args: + executable: /bin/bash + chdir: "/dist" + when: cxx11_abi | bool + - name: Find Torchvision *.whl files in /dist ansible.builtin.find: path: /dist diff --git a/infra/tpu-pytorch-releases/README.md b/infra/tpu-pytorch-releases/README.md index 26d1ff5cbc0..f173b3ee857 100644 --- a/infra/tpu-pytorch-releases/README.md +++ b/infra/tpu-pytorch-releases/README.md @@ -49,6 +49,8 @@ consists of the following fields. * `arch` (optional, "amd64"|"aarch64", default = "amd64") - Architecture affects installed dependencies and build process, see [apt.yaml](../ansible/config/apt.yaml) and [pip.yaml](../ansible/config/pip.yaml). +* `cxx11_abi` (optional, "0"|"1", default = "0") - Whether to use C++11 ABI or + pre-C++11 ABI. To modify default values see `variable "versioned_builds"` in [artifacts_builds.tf](./artifacts_builds.tf). Modifying default values will modify @@ -101,6 +103,8 @@ consists of the following fields. Used only if `accelerator` is set to "cuda" * `arch` (optional, "amd64"|"aarch64", default = "amd64") - Architecture influences installed dependencies and build process. +* `cxx11_abi` (optional, "0"|"1", default = "0") - Whether to use C++11 ABI or + pre-C++11 ABI. Additionally, **`package_version` of all nightly builds** is configured through a separate `nightly_package_version` variable. @@ -215,4 +219,4 @@ unset properties of existing triggers. 
b) apply manually only the newly created resource, see - [Enforce only selected resource](https://github.com/pytorch/xla/blob/master/infra/Terraform.md#enforce-only-selected-resource) (this requires appropriate permissions in GCP). \ No newline at end of file + [Enforce only selected resource](https://github.com/pytorch/xla/blob/master/infra/Terraform.md#enforce-only-selected-resource) (this requires appropriate permissions in GCP). diff --git a/infra/tpu-pytorch-releases/artifacts.auto.tfvars b/infra/tpu-pytorch-releases/artifacts.auto.tfvars index b10b97fa96c..1acaa81be02 100644 --- a/infra/tpu-pytorch-releases/artifacts.auto.tfvars +++ b/infra/tpu-pytorch-releases/artifacts.auto.tfvars @@ -2,18 +2,35 @@ nightly_package_version = "2.6.0" # Built once a day from master. nightly_builds = [ - { accelerator = "tpu" }, { accelerator = "tpu" python_version = "3.9" + cxx11_abi = "0" }, { accelerator = "tpu" python_version = "3.10" + cxx11_abi = "0" }, { accelerator = "tpu" python_version = "3.11" + cxx11_abi = "0" + }, + { + accelerator = "tpu" + python_version = "3.9" + cxx11_abi = "1" + }, + { + accelerator = "tpu" + python_version = "3.10" + cxx11_abi = "1" + }, + { + accelerator = "tpu" + python_version = "3.11" + cxx11_abi = "1" }, { accelerator = "cuda" diff --git a/infra/tpu-pytorch-releases/artifacts_builds.tf b/infra/tpu-pytorch-releases/artifacts_builds.tf index b10eb5e5c18..3fc50a1ae66 100644 --- a/infra/tpu-pytorch-releases/artifacts_builds.tf +++ b/infra/tpu-pytorch-releases/artifacts_builds.tf @@ -10,6 +10,7 @@ variable "nightly_builds" { python_version = optional(string, "3.8") arch = optional(string, "amd64") bundle_libtpu = optional(string, "0") + cxx11_abi = optional(string, "0") }) ) @@ -44,6 +45,7 @@ variable "versioned_builds" { cuda_version = optional(string, "11.8") arch = optional(string, "amd64") bundle_libtpu = optional(string, "0") + cxx11_abi = optional(string, "0") }) ) @@ -53,9 +55,10 @@ variable "versioned_builds" { locals { 
nightly_builds_dict = { for b in var.nightly_builds : - format("%s_%s", + format("%s_%s%s", b.python_version, - b.accelerator == "tpu" ? "tpuvm" : format("cuda_%s", b.cuda_version) + b.accelerator == "tpu" ? "tpuvm" : format("cuda_%s", b.cuda_version), + b.cxx11_abi == "1" ? "_cxx11" : "" ) => b } @@ -71,10 +74,11 @@ locals { versioned_builds_dict = { for b in var.versioned_builds : - format("r%s_%s_%s", + format("r%s_%s_%s%s", replace(b.package_version, "+", "_"), b.python_version, - b.accelerator == "tpu" ? "tpuvm" : format("cuda_%s", b.cuda_version) + b.accelerator == "tpu" ? "tpuvm" : format("cuda_%s", b.cuda_version), + b.cxx11_abi == "1" ? "_cxx11" : "" ) => b } } @@ -131,7 +135,8 @@ module "xrt_versioned_builds" { for_each = local.xrt_versioned_builds_dict ansible_vars = merge(each.value, { - xla_git_rev = "$COMMIT_SHA" + xla_git_rev = "$COMMIT_SHA", + cxx11_abi = each.value.cxx11_abi }) trigger_on_schedule = { schedule = "0 0 * * *", branch = "xrt" } @@ -177,7 +182,8 @@ module "versioned_builds" { // Override `pytorch_git_rev` set in each value of `versioned_builds_dict` // if it's left empty. pytorch_git_rev = coalesce(each.value.pytorch_git_rev, each.value.git_tag) - xla_git_rev = each.value.git_tag + xla_git_rev = each.value.git_tag, + cxx11_abi = each.value.cxx11_abi }) # Use Ansible setup from master branch for versioned release, because source