Scrub use of explicit PJRT_DEVICE=CUDA for Python CI/CD and rely on autodetection #6623

Closed · wants to merge 3 commits
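For context: this PR removes hard-coded `PJRT_DEVICE=CUDA` from CI scripts, benchmarks, and docs, and instead has the runtime pick a default device on first use via `_maybe_select_default_device()`. That function's body is not part of this diff, so the sketch below is only a hedged guess at the general shape of such autodetection; every name and check in it is an illustrative assumption.

```
# Hedged sketch of what PJRT device autodetection could look like. The real
# implementation lives in torch_xla/runtime.py and is not shown in this diff;
# the ordering and checks below are assumptions for illustration only.
import os
import shutil


def _maybe_select_default_device():
  # An explicit user choice always wins over autodetection.
  if 'PJRT_DEVICE' in os.environ:
    return
  # Prefer CUDA when an NVIDIA driver is visible, mirroring the
  # `command -v nvidia-smi` guards used throughout the CI scripts.
  if shutil.which('nvidia-smi'):
    os.environ['PJRT_DEVICE'] = 'CUDA'
  else:
    os.environ['PJRT_DEVICE'] = 'CPU'
```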
21 changes: 8 additions & 13 deletions .circleci/common.sh
@@ -133,23 +133,23 @@ function run_torch_xla_python_tests() {
   # CUDA tests
   if [ -x "$(command -v nvidia-smi)" ]; then
     # single-host-single-process
-    PJRT_DEVICE=CUDA python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18
+    python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18

     # single-host-multi-process
     num_devices=$(nvidia-smi --list-gpus | wc -l)
-    PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$GPU_NUM_DEVICES python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
-    PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
+    python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
+    torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18

     # single-host-SPMD
-    XLA_USE_SPMD=1 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 16 --model=resnet50 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18
+    XLA_USE_SPMD=1 torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 16 --model=resnet50 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18

-    PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
-    PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
-    XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+    python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+    python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
+    XLA_DISABLE_FUNCTIONALIZATION=1 python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
     # Syncfree SGD optimizer tests
     if [ -d ./torch_xla/amp/syncfree ]; then
       echo "Running Syncfree Optimizer Test"
-      PJRT_DEVICE=CUDA python test/test_syncfree_optimizers.py
+      python test/test_syncfree_optimizers.py

       # Following test scripts are mainly useful for
       # performance evaluation & comparison among different

@@ -221,11 +221,6 @@ function run_torch_xla_tests() {
   RUN_CPP="${RUN_CPP_TESTS:0}"
   RUN_PYTHON="${RUN_PYTHON_TESTS:0}"

-  if [ -x "$(command -v nvidia-smi)" ]; then
-    num_devices=$(nvidia-smi --list-gpus | wc -l)
-    echo "Found $num_devices GPU devices..."
-    export GPU_NUM_DEVICES=$num_devices
-  fi
   export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
   export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")
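Note that the CI commands above simply drop the env-var prefix; nothing in this diff removes the ability to pin a backend by hand. A minimal sketch of an explicit override, assuming the precedence sketched earlier (an exported `PJRT_DEVICE` beats autodetection):

```
import os

# Set before importing torch_xla so the runtime never needs to autodetect.
os.environ['PJRT_DEVICE'] = 'CPU'

import torch_xla.runtime as xr

print(xr.device_type())  # expected: 'CPU'
```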
2 changes: 1 addition & 1 deletion benchmarks/README.md
@@ -86,7 +86,7 @@ works only for inference now.

 ```
 cd pytorch
-PJRT_DEVICE=CUDA python3 new_xla/benchmarks/experiment_runner.py \
+python3 new_xla/benchmarks/experiment_runner.py \
   --xla=PJRT \
   --dynamo=openxla_eval \
   --test=eval \
3 changes: 0 additions & 3 deletions benchmarks/llama.py
@@ -164,9 +164,6 @@ def run_benchmarks(args, llama_dir: str, results_dir: str,
   if dynamo == 'inductor':
     run_env['CUDA_VISIBLE_DEVICES'] = '0'
     run_env['USE_CUDA'] = '1'
-  else:
-    run_env['PJRT_DEVICE'] = 'CUDA'
-    run_env['GPU_NUM_DEVICES'] = '1'

   run_ok = True
   with open(log_file, 'w') as f:
1 change: 0 additions & 1 deletion docs/gpu.md
@@ -69,7 +69,6 @@ pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.
 In order to run below examples, you need to clone the pytorch/xla repo to access the imagenet example(We already clone it in our docker).

 ```
-(pytorch) root@20ab2c7a2d06:/# export GPU_NUM_DEVICES=1 PJRT_DEVICE=CUDA
 (pytorch) root@20ab2c7a2d06:/# git clone --recursive https://github.com/pytorch/xla.git
 (pytorch) root@20ab2c7a2d06:/# python xla/test/test_train_mp_imagenet.py --fake_data
 ==> Preparing data..
1 change: 1 addition & 0 deletions test/dynamo/test_dynamo.py
@@ -14,6 +14,7 @@
 import unittest
 import warnings

+xr._maybe_select_default_device()
 torch_xla._XLAC._init_computation_client()

 # Setup import folders.
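The ordering of the added line appears deliberate: `_init_computation_client()` presumably latches onto whatever `PJRT_DEVICE` resolves to at initialization time, so autodetection has to run first. The same pattern, restated as a standalone snippet:

```
import torch_xla
import torch_xla.runtime as xr

xr._maybe_select_default_device()  # resolve PJRT_DEVICE if it is unset
torch_xla._XLAC._init_computation_client()  # then bring up the client
```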
4 changes: 2 additions & 2 deletions test/run_tests.sh
@@ -56,7 +56,7 @@ function run_coverage {
 function run_test {
   echo "Running in PjRt runtime: $@"
   if [ -x "$(command -v nvidia-smi)" ] && [ "$XLA_CUDA" != "0" ]; then
-    PJRT_DEVICE=CUDA run_coverage "$@"
+    run_coverage "$@"
   else
     # TODO(darisoy): run these tests with multiple CPU devices, this fails due to TF issue.
     PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_coverage "$@"

@@ -122,7 +122,7 @@ function run_torchrun {
   if [ -x "$(command -v nvidia-smi)" ] && [ "$XLA_CUDA" != "0" ]; then
     echo "Running torchrun test for GPU $@"
     num_devices=$(nvidia-smi --list-gpus | wc -l)
-    PJRT_DEVICE=CUDA torchrun --nnodes 1 --nproc-per-node $num_devices $@
+    torchrun --nnodes 1 --nproc-per-node $num_devices $@
   fi
 }
3 changes: 1 addition & 2 deletions test/test_operations.py
@@ -139,8 +139,7 @@ def onlyIfTorchSupportsCUDA(fn):

 def onlyIfPJRTDeviceIsCUDA(fn):
   return unittest.skipIf(
-      os.environ.get("PJRT_DEVICE") not in ("GPU", "CUDA"),
-      reason="requires CUDA as PJRT_DEVICE")(
+      xr.device_type() != 'CUDA', reason="requires CUDA as PJRT_DEVICE")(
           fn)
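With the condition keyed off `xr.device_type()` rather than the raw environment variable, the skip now behaves correctly even when the device was autodetected instead of exported. A small usage sketch (the test itself is hypothetical):

```
class CudaOnlyTest(unittest.TestCase):

  @onlyIfPJRTDeviceIsCUDA
  def test_runs_only_on_cuda(self):
    # Skipped entirely unless the resolved PJRT device is CUDA.
    self.assertEqual(xr.device_type(), 'CUDA')
```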
9 changes: 8 additions & 1 deletion torch_xla/core/xla_model.py
@@ -18,7 +18,13 @@
 import torch_xla.utils.utils as xu
 import torch_xla.utils.closures as xc

-_DEVICES = xu.LazyProperty(lambda: torch_xla._XLAC._xla_get_devices())
+
+def _lazy_get_device():
+  runtime._maybe_select_default_device()
+  return torch_xla._XLAC._xla_get_devices()
+
+
+_DEVICES = xu.LazyProperty(_lazy_get_device)

 REDUCE_SUM = 'sum'
 REDUCE_MUL = 'mul'

@@ -59,6 +65,7 @@ def __init__(self, device):

 def _get_device_context(device=None):
   if device is None:
+    runtime._maybe_select_default_device()
     device = torch_xla._XLAC._xla_get_default_device()
   else:
     device = str(device)
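The `_DEVICES` change swaps a bare lambda for a named helper so the autodetection hook runs the first time the device list is materialized. `xu.LazyProperty` itself is not part of this diff; a minimal compute-once wrapper in its spirit might look like the following (the class name and `.value` interface are assumptions):

```
class LazyProperty:
  """Compute-once holder: the generator function runs on first access only."""

  def __init__(self, gen_fn):
    self._gen_fn = gen_fn
    self._value = None

  @property
  def value(self):
    if self._value is None:
      self._value = self._gen_fn()
    return self._value
```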
2 changes: 2 additions & 0 deletions torch_xla/runtime.py
@@ -120,6 +120,8 @@ def xla_device(n: Optional[int] = None,
   Returns:
     A `torch.device` representing an XLA device.
   """
+  _maybe_select_default_device()
+
   if n is None:
     return torch.device(torch_xla._XLAC._xla_get_default_device())
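With `_maybe_select_default_device()` wired into `xla_device()`, user code needs no environment variable at all. A minimal usage sketch, assuming a build where at least a CPU PJRT backend is available:

```
import torch
import torch_xla.runtime as xr

# No PJRT_DEVICE exported: the runtime selects a sensible default on first use.
device = xr.xla_device()
t = torch.ones(2, 2, device=device)
print(device, t.device)
```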