From 300e979f8edfb6e0a177ba71d24ced0e8cecf037 Mon Sep 17 00:00:00 2001
From: iefgnoix
Date: Wed, 20 Mar 2024 23:27:43 +0000
Subject: [PATCH 1/7] Re-enable AMP test on GPU CI.

---
 .circleci/common.sh | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/.circleci/common.sh b/.circleci/common.sh
index 6fe72ec6424..74d6bb922f2 100755
--- a/.circleci/common.sh
+++ b/.circleci/common.sh
@@ -161,12 +161,11 @@ function run_torch_xla_python_tests() {
         # Following test scripts are mainly useful for
         # performance evaluation & comparison among different
         # amp optimizers.
-        # echo "Running ImageNet Test"
-        # python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1
+        echo "Running ImageNet Test"
+        python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1
 
-        # disabled per https://github.com/pytorch/xla/pull/2809
-        # echo "Running MNIST Test"
-        # python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1
+        echo "Running MNIST Test"
+        python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1
       fi
     fi
   fi

From 41fd9a5c0ee4019b9345fa1521e27925f811442c Mon Sep 17 00:00:00 2001
From: iefgnoix
Date: Wed, 27 Mar 2024 21:20:56 +0000
Subject: [PATCH 2/7] should only fail in op3 test.

---
 .circleci/common.sh | 38 +-------------------------------------
 test/run_tests.sh   | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/.circleci/common.sh b/.circleci/common.sh
index 74d6bb922f2..ce56c947971 100755
--- a/.circleci/common.sh
+++ b/.circleci/common.sh
@@ -132,42 +132,6 @@ function run_torch_xla_python_tests() {
     chmod -R 755 ~/htmlcov
   else
     ./test/run_tests.sh
-
-    # CUDA tests
-    if [ -x "$(command -v nvidia-smi)" ]; then
-      # single-host-single-process
-      PJRT_DEVICE=CUDA python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18
-      PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
-
-      # single-host-multi-process
-      num_devices=$(nvidia-smi --list-gpus | wc -l)
-      PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$GPU_NUM_DEVICES python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
-      PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
-
-      # single-host-SPMD
-      # TODO: Reduce BS due to GPU test OOM in CI after pin update to 03/05/2024 (#6677)
-      XLA_USE_SPMD=1 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 8 --model=resnet50 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18
-
-      # TODO: Reduce BS due to GPU test OOM in CI after pin update to 03/05/2024 (#6677)
-      PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1 --batch_size 32 --test_set_batch_size 32
-      # TODO: Reduce BS due to GPU test OOM in CI after pin update to 03/05/2024 (#6677)
-      PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1 --batch_size 32 --test_set_batch_size 32
-      XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
-      # Syncfree SGD optimizer tests
-      if [ -d ./torch_xla/amp/syncfree ]; then
-        echo "Running Syncfree Optimizer Test"
-        PJRT_DEVICE=CUDA python test/test_syncfree_optimizers.py
-
-        # Following test scripts are mainly useful for
-        # performance evaluation & comparison among different
-        # amp optimizers.
-        echo "Running ImageNet Test"
-        python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1
-
-        echo "Running MNIST Test"
-        python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1
-      fi
-    fi
   fi
   popd
 }
@@ -240,7 +204,7 @@ function run_torch_xla_tests() {
     run_torch_xla_benchmark_tests $XLA_DIR
   else
     # run tests separately.
-    if [[ "$RUN_PYTHON_TESTS" == "python_tests" ]]; then
+    if [[ "$RUN_PYTHON_TESTS" == "python_tests" && ]]; then
       run_torch_xla_python_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
     elif [[ "$RUN_BENCHMARK_TESTS" == "benchmark_tests" ]]; then
       run_torch_xla_benchmark_tests $XLA_DIR
diff --git a/test/run_tests.sh b/test/run_tests.sh
index 840f2e25d97..8ddc0494647 100755
--- a/test/run_tests.sh
+++ b/test/run_tests.sh
@@ -240,6 +240,42 @@ function run_xla_op_tests3 {
   # NOTE: this line below is testing export and don't care about GPU
   PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_coverage "$CDIR/test_core_aten_ops.py"
   run_test "$CDIR/test_pallas.py"
+
+  # CUDA tests
+  if [ -x "$(command -v nvidia-smi)" ]; then
+    # single-host-single-process
+    PJRT_DEVICE=CUDA python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18
+    PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
+
+    # single-host-multi-process
+    num_devices=$(nvidia-smi --list-gpus | wc -l)
+    PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$GPU_NUM_DEVICES python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
+    PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
+
+    # single-host-SPMD
+    # TODO: Reduce BS due to GPU test OOM in CI after pin update to 03/05/2024 (#6677)
+    XLA_USE_SPMD=1 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 8 --model=resnet50 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18
+
+    # TODO: Reduce BS due to GPU test OOM in CI after pin update to 03/05/2024 (#6677)
+    PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1 --batch_size 32 --test_set_batch_size 32
+    # TODO: Reduce BS due to GPU test OOM in CI after pin update to 03/05/2024 (#6677)
+    PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1 --batch_size 32 --test_set_batch_size 32
+    XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+    # Syncfree SGD optimizer tests
+    if [ -d ./torch_xla/amp/syncfree ]; then
+      echo "Running Syncfree Optimizer Test"
+      PJRT_DEVICE=CUDA python test/test_syncfree_optimizers.py
+
+      # Following test scripts are mainly useful for
+      # performance evaluation & comparison among different
+      # amp optimizers.
+      echo "Running ImageNet Test"
+      python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1
+
+      echo "Running MNIST Test"
+      python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1
+    fi
+  fi
 }
 
 #######################################################################################

From aff35f43f748dc8d1e3823dc35b3053485148a01 Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Wed, 27 Mar 2024 21:23:09 +0000
Subject: [PATCH 3/7] fix typo

---
 .circleci/common.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/common.sh b/.circleci/common.sh
index ce56c947971..ce612c529a3 100755
--- a/.circleci/common.sh
+++ b/.circleci/common.sh
@@ -204,7 +204,7 @@ function run_torch_xla_tests() {
     run_torch_xla_benchmark_tests $XLA_DIR
   else
     # run tests separately.
-    if [[ "$RUN_PYTHON_TESTS" == "python_tests" && ]]; then
+    if [[ "$RUN_PYTHON_TESTS" == "python_tests" ]]; then
       run_torch_xla_python_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
     elif [[ "$RUN_BENCHMARK_TESTS" == "benchmark_tests" ]]; then
       run_torch_xla_benchmark_tests $XLA_DIR

From 7ce6274aa5e9d1b78159d7e045a50bbfaf39f14e Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Thu, 28 Mar 2024 00:20:57 +0000
Subject: [PATCH 4/7] reduce the batch size to 32

---
 test/run_tests.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/run_tests.sh b/test/run_tests.sh
index 8ddc0494647..50690b9b0ea 100755
--- a/test/run_tests.sh
+++ b/test/run_tests.sh
@@ -270,10 +270,10 @@ function run_xla_op_tests3 {
       # performance evaluation & comparison among different
       # amp optimizers.
       echo "Running ImageNet Test"
-      python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1
+      python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1 --batch_size 64
 
       echo "Running MNIST Test"
-      python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1
+      python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1 --batch_size 64
     fi
   fi
 }

From b69198a530e16c635f39c6984410ae403598d1d6 Mon Sep 17 00:00:00 2001
From: iefgnoix
Date: Thu, 28 Mar 2024 17:06:51 +0000
Subject: [PATCH 5/7] fix ci error

---
 test/run_tests.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/run_tests.sh b/test/run_tests.sh
index 50690b9b0ea..906efa5e476 100755
--- a/test/run_tests.sh
+++ b/test/run_tests.sh
@@ -270,10 +270,10 @@ function run_xla_op_tests3 {
       # performance evaluation & comparison among different
       # amp optimizers.
echo "Running ImageNet Test" - python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1 --batch_size 64 + PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1 --batch_size 64 echo "Running MNIST Test" - python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1 --batch_size 64 + PJRT_DEVICE=CUDA python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1 --batch_size 64 fi fi } From ad24f9be8a67c54f7fa80660201aca7bd8052ac3 Mon Sep 17 00:00:00 2001 From: iefgnoix Date: Thu, 28 Mar 2024 21:37:49 +0000 Subject: [PATCH 6/7] shorten the test --- test/run_tests.sh | 8 ++++---- test/test_train_mp_imagenet_amp.py | 4 ++++ test/test_train_mp_mnist_amp.py | 9 ++++++++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/test/run_tests.sh b/test/run_tests.sh index 906efa5e476..b4755448527 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -244,12 +244,12 @@ function run_xla_op_tests3 { # CUDA tests if [ -x "$(command -v nvidia-smi)" ]; then # single-host-single-process - PJRT_DEVICE=CUDA python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18 + PJRT_DEVICE=CUDA GPU_NUM_DEVICES=1 python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 # single-host-multi-process num_devices=$(nvidia-smi --list-gpus | wc -l) - PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$GPU_NUM_DEVICES python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 + PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$num_devices python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 # single-host-SPMD @@ -270,10 +270,10 @@ function run_xla_op_tests3 { # performance evaluation & comparison among different # amp optimizers. 
echo "Running ImageNet Test" - PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1 --batch_size 64 + PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$num_devices python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1 --batch_size 64 --num_steps=25 --model=resnet18 echo "Running MNIST Test" - PJRT_DEVICE=CUDA python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1 --batch_size 64 + PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$num_devices python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1 --batch_size 64 --num_steps=25 fi fi } diff --git a/test/test_train_mp_imagenet_amp.py b/test/test_train_mp_imagenet_amp.py index 0b03f9ec849..ae1ae76126e 100644 --- a/test/test_train_mp_imagenet_amp.py +++ b/test/test_train_mp_imagenet_amp.py @@ -254,6 +254,8 @@ def train_loop_fn(loader, epoch): if step % FLAGS.log_steps == 0: xm.add_step_closure( _train_update, args=(device, step, loss, tracker, epoch, writer)) + if FLAGS.num_steps and FLAGS.num_steps == step: + break def test_loop_fn(loader, epoch): total_samples, correct = 0, 0 @@ -266,6 +268,8 @@ def test_loop_fn(loader, epoch): if step % FLAGS.log_steps == 0: xm.add_step_closure( test_utils.print_test_update, args=(device, None, epoch, step)) + if FLAGS.num_steps and FLAGS.num_steps == step: + break accuracy = 100.0 * correct.item() / total_samples accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean) return accuracy diff --git a/test/test_train_mp_mnist_amp.py b/test/test_train_mp_mnist_amp.py index 990ea9bc91a..c087c686414 100644 --- a/test/test_train_mp_mnist_amp.py +++ b/test/test_train_mp_mnist_amp.py @@ -170,16 +170,23 @@ def train_loop_fn(loader): if step % flags.log_steps == 0: xm.add_step_closure( _train_update, args=(device, step, loss, tracker, writer)) + if FLAGS.num_steps and FLAGS.num_steps == step: + break def test_loop_fn(loader): total_samples = 0 correct = 0 model.eval() - for data, target in loader: + for step, (data, target) in enumerate(loader): output = model(data) pred = output.max(1, keepdim=True)[1] correct += pred.eq(target.view_as(pred)).sum() total_samples += data.size()[0] + if step % FLAGS.log_steps == 0: + xm.add_step_closure( + test_utils.print_test_update, args=(device, None, epoch, step)) + if FLAGS.num_steps and FLAGS.num_steps == step: + break accuracy = 100.0 * correct.item() / total_samples accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean) From 83be659b66f2dd72c39c34d4ec0baa8c2c525a09 Mon Sep 17 00:00:00 2001 From: iefgnoix Date: Fri, 29 Mar 2024 16:51:29 +0000 Subject: [PATCH 7/7] fix typo --- test/run_tests.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/run_tests.sh b/test/run_tests.sh index b4755448527..0bfa2d12f0e 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -243,18 +243,19 @@ function run_xla_op_tests3 { # CUDA tests if [ -x "$(command -v nvidia-smi)" ]; then - # single-host-single-process + # Please keep PJRT_DEVICE and GPU_NUM_DEVICES explicit in the following test commands. 
+ echo "single-host-single-process" PJRT_DEVICE=CUDA GPU_NUM_DEVICES=1 python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 - # single-host-multi-process + echo "single-host-multi-process" num_devices=$(nvidia-smi --list-gpus | wc -l) PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$num_devices python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18 - # single-host-SPMD + echo "single-host-SPMD" # TODO: Reduce BS due to GPU test OOM in CI after pin update to 03/05/2024 (#6677) - XLA_USE_SPMD=1 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 8 --model=resnet50 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18 + XLA_USE_SPMD=1 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 8 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18 # TODO: Reduce BS due to GPU test OOM in CI after pin update to 03/05/2024 (#6677) PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1 --batch_size 32 --test_set_batch_size 32