Add fsdp tests back to GPU CI (#5674)

* use manfei's change
* use the new flag
* reduce data size
* reduce batch size
* keep reducing batch size to 64
* remove comments
pytorch · Oct 10, 2023 · 934adc9 · 934adc9
1 parent 2db113e
commit 934adc9
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 4 deletions.
diff --git a/.circleci/common.sh b/.circleci/common.sh
@@ -151,9 +151,9 @@ function run_torch_xla_python_tests() {
       # GPU tests
       if [ -x "$(command -v nvidia-smi)" ]; then
         # These tests fail on GPU with 03/30 TF-pin update (https://github.com/pytorch/xla/pull/4840)
-        # PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
-        # PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
-        # XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+        PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+        PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
+        XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
         # Syncfree SGD optimizer tests
         if [ -d ./torch_xla/amp/syncfree ]; then
           echo "Running Syncfree Optimizer Test"

diff --git a/test/test_train_mp_imagenet_fsdp.py b/test/test_train_mp_imagenet_fsdp.py
@@ -110,7 +110,7 @@
                                              transformer_auto_wrap_policy)
 
 DEFAULT_KWARGS = dict(
-    batch_size=128,
+    batch_size=64,
     test_set_batch_size=64,
     num_epochs=18,
     momentum=0.9,