From 79deda554c6f592b80019b1fc63a259e8d456976 Mon Sep 17 00:00:00 2001 From: Mason Chang Date: Tue, 27 Feb 2024 18:34:21 +0000 Subject: [PATCH 1/3] Scrub use of explicit PJRT_DEVICE=CUDA everywhere and rely on autodetection --- .circleci/common.sh | 21 ++++++++------------- benchmarks/README.md | 2 +- benchmarks/llama.py | 3 --- docs/gpu.md | 1 - test/dynamo/test_dynamo.py | 1 + test/run_tests.sh | 4 ++-- test/test_operations.py | 3 +-- torch_xla/core/xla_model.py | 7 ++++++- torch_xla/runtime.py | 2 ++ 9 files changed, 21 insertions(+), 23 deletions(-) diff --git a/.circleci/common.sh b/.circleci/common.sh index 383f4cff5cc..22a7fcb692a 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -133,23 +133,23 @@ function run_torch_xla_python_tests() { # CUDA tests if [ -x "$(command -v nvidia-smi)" ]; then # single-host-single-process - PJRT_DEVICE=CUDA python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18 + python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18 # single-host-multi-process num_devices=$(nvidia-smi --list-gpus | wc -l) - PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$GPU_NUM_DEVICES python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 - PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 + test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 + torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 # single-host-SPMD - XLA_USE_SPMD=1 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 16 --model=resnet50 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18 + XLA_USE_SPMD=1 torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 16 --model=resnet50 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18 - PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1 - PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1 - XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1 + python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1 + python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1 + XLA_DISABLE_FUNCTIONALIZATION=1 python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1 # Syncfree SGD optimizer tests if [ -d ./torch_xla/amp/syncfree ]; then echo "Running Syncfree Optimizer Test" - PJRT_DEVICE=CUDA python test/test_syncfree_optimizers.py + python test/test_syncfree_optimizers.py # Following test scripts are mainly useful for # performance evaluation & comparison among different @@ -221,11 +221,6 @@ function run_torch_xla_tests() { RUN_CPP="${RUN_CPP_TESTS:0}" RUN_PYTHON="${RUN_PYTHON_TESTS:0}" - if [ -x "$(command -v nvidia-smi)" ]; then - num_devices=$(nvidia-smi --list-gpus | wc -l) - echo "Found $num_devices GPU devices..." - export GPU_NUM_DEVICES=$num_devices - fi export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))") diff --git a/benchmarks/README.md b/benchmarks/README.md index 5b694a40158..f076a8844af 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -86,7 +86,7 @@ works only for inference now. ``` cd pytorch -PJRT_DEVICE=CUDA python3 new_xla/benchmarks/experiment_runner.py \ +python3 new_xla/benchmarks/experiment_runner.py \ --xla=PJRT \ --dynamo=openxla_eval \ --test=eval \ diff --git a/benchmarks/llama.py b/benchmarks/llama.py index d3ca92274de..95a9aab89dc 100644 --- a/benchmarks/llama.py +++ b/benchmarks/llama.py @@ -164,9 +164,6 @@ def run_benchmarks(args, llama_dir: str, results_dir: str, if dynamo == 'inductor': run_env['CUDA_VISIBLE_DEVICES'] = '0' run_env['USE_CUDA'] = '1' - else: - run_env['PJRT_DEVICE'] = 'CUDA' - run_env['GPU_NUM_DEVICES'] = '1' run_ok = True with open(log_file, 'w') as f: diff --git a/docs/gpu.md b/docs/gpu.md index d0ce667d46f..ae516ded81c 100644 --- a/docs/gpu.md +++ b/docs/gpu.md @@ -69,7 +69,6 @@ pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12. In order to run below examples, you need to clone the pytorch/xla repo to access the imagenet example(We already clone it in our docker). ``` -(pytorch) root@20ab2c7a2d06:/# export GPU_NUM_DEVICES=1 PJRT_DEVICE=CUDA (pytorch) root@20ab2c7a2d06:/# git clone --recursive https://github.com/pytorch/xla.git (pytorch) root@20ab2c7a2d06:/# python xla/test/test_train_mp_imagenet.py --fake_data ==> Preparing data.. diff --git a/test/dynamo/test_dynamo.py b/test/dynamo/test_dynamo.py index 519eef247a7..867bdfafad8 100644 --- a/test/dynamo/test_dynamo.py +++ b/test/dynamo/test_dynamo.py @@ -14,6 +14,7 @@ import unittest import warnings +xr._maybe_select_default_device() torch_xla._XLAC._init_computation_client() # Setup import folders. diff --git a/test/run_tests.sh b/test/run_tests.sh index 6e03b179511..5cad7025691 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -56,7 +56,7 @@ function run_coverage { function run_test { echo "Running in PjRt runtime: $@" if [ -x "$(command -v nvidia-smi)" ] && [ "$XLA_CUDA" != "0" ]; then - PJRT_DEVICE=CUDA run_coverage "$@" + run_coverage "$@" else # TODO(darisoy): run these tests with multiple CPU devices, this fails due to TF issue. PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_coverage "$@" @@ -122,7 +122,7 @@ function run_torchrun { if [ -x "$(command -v nvidia-smi)" ] && [ "$XLA_CUDA" != "0" ]; then echo "Running torchrun test for GPU $@" num_devices=$(nvidia-smi --list-gpus | wc -l) - PJRT_DEVICE=CUDA torchrun --nnodes 1 --nproc-per-node $num_devices $@ + torchrun --nnodes 1 --nproc-per-node $num_devices $@ fi } diff --git a/test/test_operations.py b/test/test_operations.py index 9e274218c4c..53465057b6f 100644 --- a/test/test_operations.py +++ b/test/test_operations.py @@ -139,8 +139,7 @@ def onlyIfTorchSupportsCUDA(fn): def onlyIfPJRTDeviceIsCUDA(fn): return unittest.skipIf( - os.environ.get("PJRT_DEVICE") not in ("GPU", "CUDA"), - reason="requires CUDA as PJRT_DEVICE")( + xr.device_type() != 'CUDA', reason="requires CUDA as PJRT_DEVICE")( fn) diff --git a/torch_xla/core/xla_model.py b/torch_xla/core/xla_model.py index 28622fdafc2..b5f82675647 100755 --- a/torch_xla/core/xla_model.py +++ b/torch_xla/core/xla_model.py @@ -18,7 +18,11 @@ import torch_xla.utils.utils as xu import torch_xla.utils.closures as xc -_DEVICES = xu.LazyProperty(lambda: torch_xla._XLAC._xla_get_devices()) +def _lazy_get_device(): + runtime._maybe_select_default_device() + return torch_xla._XLAC._xla_get_devices() + +_DEVICES = xu.LazyProperty(_lazy_get_device) REDUCE_SUM = 'sum' REDUCE_MUL = 'mul' @@ -59,6 +63,7 @@ def __init__(self, device): def _get_device_context(device=None): if device is None: + runtime._maybe_select_default_device() device = torch_xla._XLAC._xla_get_default_device() else: device = str(device) diff --git a/torch_xla/runtime.py b/torch_xla/runtime.py index b1d01490672..94ef0783d1a 100644 --- a/torch_xla/runtime.py +++ b/torch_xla/runtime.py @@ -120,6 +120,8 @@ def xla_device(n: Optional[int] = None, Returns: A `torch.device` representing an XLA device. """ + _maybe_select_default_device() + if n is None: return torch.device(torch_xla._XLAC._xla_get_default_device()) From 8042a2afffa9f290f90ec251dbbcc640abe55ec3 Mon Sep 17 00:00:00 2001 From: Mason Chang Date: Wed, 28 Feb 2024 21:45:25 +0000 Subject: [PATCH 2/3] Fixup missing python command --- .circleci/common.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/common.sh b/.circleci/common.sh index 22a7fcb692a..7dfb771b798 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -137,7 +137,7 @@ function run_torch_xla_python_tests() { # single-host-multi-process num_devices=$(nvidia-smi --list-gpus | wc -l) - test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 + python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 # single-host-SPMD From a3180b2e5d6a332b48087813825edbaf621d0c84 Mon Sep 17 00:00:00 2001 From: Mason Chang Date: Thu, 29 Feb 2024 14:25:49 +0000 Subject: [PATCH 3/3] Run YAPF on xla_model.py --- torch_xla/core/xla_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch_xla/core/xla_model.py b/torch_xla/core/xla_model.py index b5f82675647..b1c7d4c3023 100755 --- a/torch_xla/core/xla_model.py +++ b/torch_xla/core/xla_model.py @@ -18,10 +18,12 @@ import torch_xla.utils.utils as xu import torch_xla.utils.closures as xc + def _lazy_get_device(): runtime._maybe_select_default_device() return torch_xla._XLAC._xla_get_devices() + _DEVICES = xu.LazyProperty(_lazy_get_device) REDUCE_SUM = 'sum'