Merge branch 'master' into hanq/pin_update
qihqi authored Oct 10, 2023
2 parents 28cadf5 + 04765eb commit 0a079e0
Showing 34 changed files with 575 additions and 304 deletions.
8 changes: 4 additions & 4 deletions .circleci/common.sh
@@ -150,10 +150,10 @@ function run_torch_xla_python_tests() {

# CUDA tests
if [ -x "$(command -v nvidia-smi)" ]; then
# These tests fail on CUDA with 03/30 TF-pin update (https://github.com/pytorch/xla/pull/4840)
# PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
# PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
# XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
# These tests fail on GPU with 03/30 TF-pin update (https://github.com/pytorch/xla/pull/4840)
PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
# Syncfree SGD optimizer tests
if [ -d ./torch_xla/amp/syncfree ]; then
echo "Running Syncfree Optimizer Test"
4 changes: 2 additions & 2 deletions .kokoro/Dockerfile
@@ -3,7 +3,7 @@ WORKDIR /
RUN apt-get update
RUN apt-get -y upgrade
RUN apt-get -y install clang time
RUN pip install pytest tf-nightly
RUN pip install pytest
ARG USE_MKLDNN=0
ARG SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
ARG DISABLE_XRT=1
@@ -53,4 +53,4 @@ RUN time pip install -e .
# Run tests
ENV PJRT_DEVICE=CPU
ENV XLA_STABLEHLO_COMPILE=1
ENTRYPOINT pytest test/stablehlo
ENTRYPOINT pytest test/stablehlo
2 changes: 1 addition & 1 deletion CODEGEN_MIGRATION_GUIDE.md
@@ -7,7 +7,7 @@ As PyTorch/XLA migrates to the LTC (Lazy Tensor Core), we need to clean up the e
You should follow the instructions [here](https://github.com/pytorch/xla/blob/master/CONTRIBUTING.md) to install the required dependencies and build PyTorch and PyTorch/XLA from source. You do not need access to a TPU to implement the lowering. It is recommended to experiment on a workstation and configure it to use XLA:CPU. You can configure PyTorch/XLA to use XLA:CPU by running

```
export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" XRT_WORKERS="localservice:0;grpc://localhost:51011"
export PJRT_DEVICE=CPU
```
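
A quick way to confirm the setting took effect (a sketch, not part of the guide itself) is to print the default XLA device:

```Shell
# Should print an XLA device such as "xla:0" when PJRT_DEVICE=CPU is set.
PJRT_DEVICE=CPU python -c "import torch_xla.core.xla_model as xm; print(xm.xla_device())"
```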

It is also recommended that you're familiar with our [op lowering process](https://github.com/pytorch/xla/blob/master/OP_LOWERING_GUIDE.md) before you work on the codegen.
72 changes: 45 additions & 27 deletions README.md
@@ -25,10 +25,12 @@ started:

## Getting Started

To install PyTorch/XLA a new VM:
**PyTorch/XLA is now on PyPI!**

To install PyTorch/XLA on a new TPU VM:

```
pip install torch~=2.0.0 https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-2.0-cp38-cp38-linux_x86_64.whl
pip install torch~=2.1.0 torch_xla[tpu]~=2.1.0 -f https://storage.googleapis.com/libtpu-releases/index.html
```

To update your existing training loop, make the following changes:
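
The changes themselves are collapsed in this diff; as a rough single-device sketch of the usual pattern (the toy model and fake data below are placeholders, not the README's own example):

```Shell
# Minimal sketch: move model and data to the XLA device and step the
# optimizer through torch_xla (assumes torch and torch_xla are installed).
python - <<'EOF'
import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()                    # TPU core, or XLA:CPU/GPU
model = torch.nn.Linear(10, 2).to(device)  # toy model stands in for yours
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for step in range(3):
    data = torch.randn(8, 10, device=device)          # fake batch
    target = torch.randint(0, 2, (8,), device=device)
    optimizer.zero_grad()
    loss = torch.nn.functional.cross_entropy(model(data), target)
    loss.backward()
    # barrier=True also materializes the lazily recorded graph each step
    xm.optimizer_step(optimizer, barrier=True)
EOF
```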
@@ -130,26 +132,37 @@ Our comprehensive user guides are available at:

## Available docker images and wheels

### Wheel
### Python packages

PyTorch/XLA releases starting with version r2.1 will be available on PyPI. You
can now install the main build with `pip install torch_xla`. To also install the
Cloud TPU plugin, install the optional `tpu` dependencies:

```
pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
```

GPU, XRT (legacy runtime), and nightly builds are available in our public GCS
bucket.

| Version | Cloud TPU VMs Wheel |
| --- | ----------- |
| 2.0 (Python 3.8) | `https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-2.0-cp38-cp38-linux_x86_64.whl` |
| nightly >= 2023/04/25 (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |
| nightly >= 2023/04/25 (Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl` |
| 2.1 (CUDA 12.0 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.0/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
| 2.1 (XRT + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/xrt/tpuvm/torch_xla-2.1.0%2Bxrt-cp310-cp310-manylinux_2_28_x86_64.whl` |
| nightly (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |
| nightly (Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl` |
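
For example, one of the wheels above can be installed directly by URL (the `cp38`/`cp310` tag must match the local Python version):

```Shell
# Example: the CUDA 12.0 wheel from the table above (requires Python 3.8).
pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.0/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl
```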

<details>
<summary>older versions</summary>

| Version | Cloud TPU VMs Wheel |
|---------|-------------------|
| 2.0 (Python 3.8) | `https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-2.0-cp38-cp38-linux_x86_64.whl` |
| 1.13 | `https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-1.13-cp38-cp38-linux_x86_64.whl` |
| 1.12 | `https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-1.12-cp38-cp38-linux_x86_64.whl` |
| 1.11 | `https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-1.11-cp38-cp38-linux_x86_64.whl` |
| 1.10 | `https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-1.10-cp38-cp38-linux_x86_64.whl` |
| nightly <= 2023/04/25 | `https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |

</details>

<br/>

@@ -204,53 +217,58 @@ pip3 install torch_xla[tpuvm]

This is only required on Cloud TPU VMs.

</details>

### Docker

| Version | Cloud TPU VMs Docker |
| --- | ----------- |
2.0 | `gcr.io/tpu-pytorch/xla:r2.0_3.8_tpuvm` |
1.13 | `gcr.io/tpu-pytorch/xla:r1.13_3.8_tpuvm` |
nightly python 3.10 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm` |
nightly python 3.8 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_tpuvm` |
nightly python 3.10(>= 2023/04/25) | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_YYYYMMDD` |
nightly python 3.8(>= 2023/04/25) | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_tpuvm_YYYYMMDD` |
nightly at date(< 2023/04/25) | `gcr.io/tpu-pytorch/xla:nightly_3.8_tpuvm_YYYYMMDD` |
| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_tpuvm` |
| 2.0 | `gcr.io/tpu-pytorch/xla:r2.0_3.8_tpuvm` |
| 1.13 | `gcr.io/tpu-pytorch/xla:r1.13_3.8_tpuvm` |
| nightly (Python 3.10) | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm` |
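
A typical way to use one of these images on a TPU VM looks something like the following (a sketch; the exact flags depend on your setup):

```Shell
# Start an interactive shell in the 2.1 TPU image; --net host and
# --privileged expose the host network and TPU devices (an assumption
# about your container runtime, not an official invocation).
docker run -it --net host --privileged \
  us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_tpuvm /bin/bash
```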

<br/>

| Version | GPU CUDA 12.0 + Python 3.8 Docker |
| Version | GPU CUDA 12.0 Docker |
| --- | ----------- |
| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_12.0` |
| nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.0` |
| nightly at date(>=2023/06/27) | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.0_YYYYMMDD` |
| nightly at date | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.0_YYYYMMDD` |

<br/>

| Version | GPU CUDA 11.8 + Python 3.8 Docker |
| Version | GPU CUDA 11.8 Docker |
| --- | ----------- |
| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_11.8` |
| 2.0 | `gcr.io/tpu-pytorch/xla:r2.0_3.8_cuda_11.8` |
| nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_11.8` |
| nightly at date(>=2023/04/25) | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_11.8_YYYYMMDD` |
| nightly at date(<2023/04/25) | `gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8_YYYYMMDD` |
| nightly at date | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_11.8_YYYYMMDD` |

<br/>

| Version | GPU CUDA 11.7 + Python 3.8 Docker |
<details>

<summary>older versions</summary>

| Version | GPU CUDA 11.7 Docker |
| --- | ----------- |
| 2.0 | `gcr.io/tpu-pytorch/xla:r2.0_3.8_cuda_11.7` |

<br/>

| Version | GPU CUDA 11.2 + Python 3.8 Docker |
| Version | GPU CUDA 11.2 Docker |
| --- | ----------- |
| 1.13 | `gcr.io/tpu-pytorch/xla:r1.13_3.8_cuda_11.2` |

<br/>

| Version | GPU CUDA 11.2 + Python 3.7 Docker |
| Version | GPU CUDA 11.2 Docker |
| --- | ----------- |
1.13 | `gcr.io/tpu-pytorch/xla:r1.13_3.7_cuda_11.2` |
1.12 | `gcr.io/tpu-pytorch/xla:r1.12_3.7_cuda_11.2` |
| 1.13 | `gcr.io/tpu-pytorch/xla:r1.13_3.7_cuda_11.2` |
| 1.12 | `gcr.io/tpu-pytorch/xla:r1.12_3.7_cuda_11.2` |

</details>

To run on [compute instances with
GPUs](https://cloud.google.com/compute/docs/gpus/create-vm-with-gpus).
60 changes: 2 additions & 58 deletions TROUBLESHOOTING.md
@@ -203,6 +203,8 @@ only be enabled for debugging.
* ```XLA_SAVE_TENSORS_FMT```: The format of the graphs stored within the _XLA_SAVE_TENSORS_FILE_
file. Can be ```text``` (the default), ```dot``` (the _Graphviz_ format) or ```hlo```.

* ```XLA_FLAGS=--xla_dump_to```: If set to ```=/tmp/dir_name```, the XLA compiler will dump the unoptimized and optimized HLO for each compilation.

* ```XLA_METRICS_FILE```: If set, the path to a local file where the internal metrics will be
saved at every step. Metrics will be appended to the file if it already exists.

@@ -261,61 +263,3 @@ only be enabled for debugging.
* ```XLA_DUMP_HLO_GRAPH```: If set to `=1`, then in case of a compilation or execution error the
offending HLO graph will be dumped as part of the runtime error raised by `xla_util.cc`.
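
Several of these variables can be combined on a single command line, for example (a sketch; `train.py` is a placeholder for your own script):

```Shell
# Save the recorded graphs as HLO and dump the compiler's unoptimized and
# optimized HLO to /tmp/xla_dump for one run.
XLA_SAVE_TENSORS_FILE=/tmp/save.hlo XLA_SAVE_TENSORS_FMT=hlo \
XLA_FLAGS=--xla_dump_to=/tmp/xla_dump \
python train.py
```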

### Retrieving Stack Traces

In the event that the _PyTorch_ process is hanging, it might be useful to include the stack
traces together with the GitHub issue.

The first step is to find the PID of the _PyTorch_ process. The ```ps``` command can provide
that information; it will be a _python_ process running your main _python_ file.

To allow _GDB_ to attach to a user process, the following command should be run as root:

```Shell
echo 0 > /proc/sys/kernel/yama/ptrace_scope
```

The above command remains active until the machine is rebooted.

Then, given the PID, it is possible to grab the stack traces with the following command:

```Shell
./scripts/dump_stacks.py PID > /tmp/stack-traces.log
```

## Using debug_run.py To Collect Debug Information

A utility is provided in `scripts/debug_run.py` which can be used to create a `tar.gz`
archive with the information required to debug _PyTorch/XLA_ executions.

Example:

```Shell
./scripts/debug_run.py --outfile /tmp/debug_run.tar.gz -- python -u SCRIPT [ARGS...]
```

The _python_ `-u` flag is suggested to disable buffering so that captured logs are correctly
interleaved (otherwise STDOUT will be rendered after all STDERR).

The above command line example will leave the temporary folder containing the archived
information on the filesystem. Use the `--tidy` flag to have that removed on exit:

```Shell
./scripts/debug_run.py --tidy --outfile /tmp/debug_run.tar.gz -- python -u SCRIPT [ARGS...]
```

The `debug_run.tar.gz` file should then be attached to bug reports when necessary.

Since the script will collect a lot of data, it should usually be allowed to run for no more
than a hundred steps or so.

If SCRIPT has arguments to control the number of steps, those should be used;
otherwise hitting `CTRL^C` will interrupt the run.

It is also suggested to run in single-core mode, to minimize the amount of data.
Running in single-core mode is also strongly suggested when debugging execution issues.

## Common Issues

* `Missing XLA configuration` error message: You need to set `XRT_TPU_CONFIG` if using TPUs. If using GPUs set `GPU_NUM_DEVICES=N` for `N` number of GPUs. If using CPUs set `XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"` and `XRT_WORKERS="localservice:0;grpc://localhost:9002"`
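
For example (a sketch; the device count is a placeholder and `train.py` stands in for your own script):

```Shell
# GPU: four visible GPUs.
GPU_NUM_DEVICES=4 python train.py

# CPU.
XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" \
XRT_WORKERS="localservice:0;grpc://localhost:9002" \
python train.py
```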
4 changes: 2 additions & 2 deletions codegen/xla_native_functions.yaml
@@ -63,6 +63,8 @@ full_codegen:
- log_sigmoid_forward
- lt.Scalar
- lt.Tensor
- masked_fill.Scalar
- masked_fill.Tensor
- maximum
- minimum
- native_dropout_backward
@@ -217,8 +219,6 @@ supported:
- log2
- log10
- logsumexp
- masked_fill.Scalar
- masked_fill.Tensor
- masked_scatter
- masked_select
- max
2 changes: 1 addition & 1 deletion docs/pjrt.md
@@ -1,7 +1,7 @@
# PJRT Runtime

PyTorch/XLA has migrated from the TensorFlow-based XRT runtime to the [PJRT
runtime](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla/pjrt)
runtime](https://github.com/openxla/xla/tree/main/xla/pjrt)
used by [JAX](https://github.com/google/jax).

If you encounter a bug with PJRT, please file an issue on GitHub with the
31 changes: 23 additions & 8 deletions infra/tpu-pytorch-releases/artifacts.auto.tfvars
@@ -35,14 +35,14 @@ xrt_versioned_builds = [
{
accelerator = "tpu"
python_version = "3.10"
pytorch_git_rev = "v2.1.0-rc6"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0+xrt"
},
{
accelerator = "cuda"
python_version = "3.10"
cuda_version = "12.0"
pytorch_git_rev = "v2.1.0-rc6"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0+xrt"
},
]
@@ -51,22 +51,22 @@ xrt_versioned_builds = [
versioned_builds = [
{
git_tag = "v2.1.0"
pytorch_git_rev = "v2.1.0-rc6"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0"
accelerator = "tpu"
bundle_libtpu = "0"
},
{
git_tag = "v2.1.0"
pytorch_git_rev = "v2.1.0-rc6"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0"
accelerator = "tpu"
python_version = "3.10"
bundle_libtpu = "0"
},
{
git_tag = "v2.1.0"
pytorch_git_rev = "v2.1.0-rc6"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0+libtpu"
accelerator = "tpu"
python_version = "3.10"
@@ -84,26 +84,41 @@ versioned_builds = [
},
{
git_tag = "v2.1.0"
pytorch_git_rev = "v2.1.0-rc6"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0",
accelerator = "cuda"
cuda_version = "12.0"
},
{
git_tag = "v2.1.0"
pytorch_git_rev = "v2.1.0-rc6"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0"
accelerator = "cuda"
cuda_version = "11.8"
},
{
git_tag = "v2.1.0"
pytorch_git_rev = "v2.1.0-rc6"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0"
accelerator = "cuda"
cuda_version = "12.1"
},
{
git_tag = "v2.1.0"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0"
accelerator = "cuda"
cuda_version = "11.8"
python_version = "3.10"
},
{
git_tag = "v2.1.0"
pytorch_git_rev = "v2.1.0"
package_version = "2.1.0"
accelerator = "cuda"
cuda_version = "12.1"
python_version = "3.10"
},
{
git_tag = "v2.0.0"
package_version = "2.0"