From ee7c18f28df207155882376035a6d38628055a7a Mon Sep 17 00:00:00 2001 From: Yifei Teng Date: Fri, 6 Dec 2024 03:40:46 +0000 Subject: [PATCH] Improve wheel naming logic in ansible This fixes #8406. The existing "Rename and append +YYYYMMDD suffix to nightly wheels" ansible action is pretty confusing since it operates on files in both pytorch/xla/dist and /tmp/staging-wheels. Inadvertently this causes the next "Add cxx11 suffix to wheels built with C++11 ABI" action to miss renaming "torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl", which means we're uploading a C++11 ABI wheel to a non-C++11 location. I've refactored the ansible actions to only operate under /tmp/staging-wheels. Under local ansible test runs: When cxx11_abi=0, ansible creates these files under /dist: torch-2.6.0.dev-cp310-cp310-linux_x86_64.whl torch-2.6.0.dev20241206-cp310-cp310-linux_x86_64.whl torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl torch_xla-2.6.0.dev20241206-cp310-cp310-linux_x86_64.whl torchvision-0.19.0a0+d23a6e1-cp310-cp310-linux_x86_64.whl When cxx11_abi=1, ansible creates these files under /dist: torch-2.6.0.dev.cxx11-cp310-cp310-linux_x86_64.whl torch-2.6.0.dev20241206.cxx11-cp310-cp310-linux_x86_64.whl torch_xla-2.6.0.dev.cxx11-cp310-cp310-linux_x86_64.whl torch_xla-2.6.0.dev20241206.cxx11-cp310-cp310-linux_x86_64.whl torchvision-0.19.0a0+d23a6e1.cxx11-cp310-cp310-linux_x86_64.whl The files under /dist are then uploaded to GCS. I also added documentation about C++11 ABI wheels to the README. 
--- README.md | 22 +++++- .../ansible/roles/build_srcs/tasks/main.yaml | 68 +++++++++++++------ 2 files changed, 68 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index eac298af575..488a361dca8 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ Our comprehensive user guides are available at: ## Reference implementations -The [AI-Hypercomputer/tpu-recipies](https://github.com/AI-Hypercomputer/tpu-recipes) +The [AI-Hypercomputer/tpu-recipes](https://github.com/AI-Hypercomputer/tpu-recipes) repo. contains examples for training and serving many LLM and diffusion models. ## Available docker images and wheels @@ -195,6 +195,25 @@ pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/to The torch wheel version `2.6.0.dev20240925+cpu` can be found at https://download.pytorch.org/whl/nightly/torch/. +#### Use nightly build with C++11 ABI after 10/28/2024 + +By default, `torch` is built with the pre-C++11 version of the ABI (see https://github.com/pytorch/pytorch/issues/51039). +`torch_xla` follows that and ships pre-C++11 builds by default. However, the lazy +tensor tracing performance can be improved by building the code with the C++11 ABI. +As a result, we provide C++11 ABI builds for interested users to try, especially +if you find your model's performance bottlenecked by Python lazy tensor tracing. + +You can add `.cxx11` after `yyyymmdd` to get the C++11 ABI variant of a +specific nightly wheel. Here is an example to install nightly builds from +10/28/2024: + +``` +pip3 install torch==2.6.0.dev20241028+cpu.cxx11.abi --index-url https://download.pytorch.org/whl/nightly +pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241028.cxx11-cp310-cp310-linux_x86_64.whl +``` + +The torch wheel version `2.6.0.dev20241028+cpu.cxx11.abi` can be found at https://download.pytorch.org/whl/nightly/torch/. +
older versions @@ -240,6 +259,7 @@ The torch wheel version `2.6.0.dev20240925+cpu` can be found at https://download | 2.2 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.2.0_3.10_tpuvm` | | 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_tpuvm` | | nightly python | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm` | +| nightly python (C++11 ABI) | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_cxx11` | To use the above dockers, please pass `--privileged --net host --shm-size=16G` along. Here is an example: ```bash diff --git a/infra/ansible/roles/build_srcs/tasks/main.yaml b/infra/ansible/roles/build_srcs/tasks/main.yaml index bfc5cdf4d58..f3117cabecc 100644 --- a/infra/ansible/roles/build_srcs/tasks/main.yaml +++ b/infra/ansible/roles/build_srcs/tasks/main.yaml @@ -65,16 +65,40 @@ state: directory mode: '0755' +- name: Delete any existing /dist directory + ansible.builtin.file: + path: /dist + state: absent + mode: '0755' + - name: Create /dist directory for exported wheels ansible.builtin.file: path: /dist state: directory mode: '0755' -- name: Rename and append +YYYYMMDD suffix to nightly wheels +- name: Copy wheels to staging ansible.builtin.shell: | pushd /tmp/staging-wheels cp {{ item.dir }}/*.whl . + popd + args: + executable: /bin/bash + chdir: "{{ item.dir }}" + loop: + - { dir: "{{ (src_root, 'pytorch/dist') | path_join }}", prefix: "torch" } + - { dir: "{{ (src_root, 'pytorch/xla/dist') | path_join }}", prefix: "torch_xla" } + +- name: Rename and append +YYYYMMDD suffix to nightly wheels + ansible.builtin.shell: | + # For both torch and torch_xla, we would like to have one wheel without + # date, and another copy of the same wheel with a date in the file name. 
+ # This script normalizes wheel names to: + # + # torch_xla-2.5.0.dev-cp310-cp310-linux_x86_64.whl + # torch_xla-2.5.0.dev20240819-cp310-cp310-linux_x86_64.whl (extra copy) + # + # and so on. # rename file name like torch_xla-2.5.0+gitac7fd44-cp310-cp310-linux_x86_64.whl into # torch_xla-2.5.0.dev-cp310-cp310-linux_x86_64.whl @@ -85,20 +109,28 @@ # group 4 (can be empty): group 3 without +: `gitac7fd441`; # group 5: anything from next - to the end: `cp310-cp310-linux_x86_64.whl`. rename -v "s/^(.*?)\-(.*?)(\+([^ -]+))?\-(.+)/\1-\2.dev-\5/" *.whl - - mv /tmp/staging-wheels/* /dist/ - popd - - # rename to append the date in YYYYMMDD format - # E.g., rename file name like torch_xla-2.5.0+gitac7fd44-cp310-cp310-linux_x86_64.whl into - # torch_xla-2.5.0.dev20240819-cp310-cp310-linux_x86_64.whl - rename -v "s/^(.*?)\-(.*?)(\+([^ -]+))?\-(.+)/\1-\2.dev$(date -u +%Y%m%d)-\5/" *.whl + + # At this point, we have *.dev*.whl files. Now we want to produce the dated + # YYYYMMDD versions as a second copy without losing the .dev one. + current_date="$(date -u +%Y%m%d)" + wheels=( *.whl ) + + for f in *.whl; do + if [[ "$f" == *".dev-"* ]]; then + # Replace the first occurrence of `.dev-` with `.dev<YYYYMMDD>-` (current UTC date). + # For example: + # torch-2.6.0.dev-cp310-cp310-linux_x86_64.whl + # becomes + # torch-2.6.0.dev20241206-cp310-cp310-linux_x86_64.whl + newf="${f/.dev-/.dev${current_date}-}" + + # Copy (rather than move) so the undated `.dev` wheel is kept next to the dated one. 
+ cp "$f" "$newf" + fi + done args: executable: /bin/bash - chdir: "{{ item.dir }}" - loop: - - { dir: "{{ (src_root, 'pytorch/dist') | path_join }}", prefix: "torch" } - - { dir: "{{ (src_root, 'pytorch/xla/dist') | path_join }}", prefix: "torch_xla" } + chdir: "/tmp/staging-wheels" when: nightly_release - name: Add cxx11 suffix to wheels built with C++11 ABI @@ -121,19 +153,13 @@ rename -v "s/^(.+?)(-cp\d+)/\1.cxx11\2/" *.whl args: executable: /bin/bash - chdir: "{{ item.dir }}" - loop: - - { dir: "{{ (src_root, 'pytorch/dist') | path_join }}", prefix: "torch" } - - { dir: "{{ (src_root, 'pytorch/xla/dist') | path_join }}", prefix: "torch_xla" } + chdir: "/tmp/staging-wheels" when: cxx11_abi | int > 0 - name: Copy wheels to /dist - ansible.builtin.shell: "cp {{ item }}/*.whl /dist" + ansible.builtin.shell: "mv /tmp/staging-wheels/* /dist/" args: executable: /bin/bash - loop: - - "{{ (src_root, 'pytorch/dist') | path_join }}" - - "{{ (src_root, 'pytorch/xla/dist') | path_join }}" - name: Delete temp directory ansible.builtin.file: