From 91ac965ad96e9b8b3ffab2af542b49b10fa06759 Mon Sep 17 00:00:00 2001
From: Xiongfei Wei <isaacwxf23@gmail.com>
Date: Tue, 29 Aug 2023 17:12:38 +0000
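Subject: [PATCH 1/6] Re-enable GPU FSDP tests and add OpenXLA layout-check patch

Use manfei's change. Run the three test_train_mp_imagenet_fsdp.py GPU
invocations in .circleci/common.sh (the commented-out copies are kept),
and add openxla_patches/service.diff, which replaces
ShapeUtil::CompatibleIgnoringElementType with ShapeUtil::CompatibleKind
in the OperandLayoutConstraint CHECK in xla/service/layout_assignment.cc.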

---
 .circleci/common.sh          |  3 +++
 openxla_patches/service.diff | 14 ++++++++++++++
 2 files changed, 17 insertions(+)
 create mode 100644 openxla_patches/service.diff

diff --git a/.circleci/common.sh b/.circleci/common.sh
index 1642480a6b4..8983eb757a9 100755
--- a/.circleci/common.sh
+++ b/.circleci/common.sh
@@ -154,6 +154,9 @@ function run_torch_xla_python_tests() {
         # PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
         # PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
         # XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+        PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+        PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
+        XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
         # Syncfree SGD optimizer tests
         if [ -d ./torch_xla/amp/syncfree ]; then
           echo "Running Syncfree Optimizer Test"
diff --git a/openxla_patches/service.diff b/openxla_patches/service.diff
new file mode 100644
index 00000000000..93ebde3bf8e
--- /dev/null
+++ b/openxla_patches/service.diff
@@ -0,0 +1,14 @@
+delete this patch after openxla pin updated to date after or equal to 08/09
+diff --git a/xla/service/layout_assignment.cc b/xla/service/layout_assignment.cc
+index f69224f42..43feda2a0 100644
+--- a/xla/service/layout_assignment.cc
++++ b/xla/service/layout_assignment.cc
+@@ -154,7 +154,7 @@ OperandLayoutConstraint::OperandLayoutConstraint(
+       instruction_(instruction),
+       operand_no_(operand_no) {
+   CHECK(shape_layout.LayoutIsSet());
+-  CHECK(ShapeUtil::CompatibleIgnoringElementType(
++  CHECK(ShapeUtil::CompatibleKind(
+       shape_layout.shape(), instruction->operand(operand_no)->shape()))
+       << shape_layout.shape() << " is not compatible with "
+       << instruction->operand(operand_no)->shape() << " (for operand "

From 2b8851e75c3ea3ce358fa47aeb75046aa41c39a6 Mon Sep 17 00:00:00 2001
From: Xiongfei Wei <isaacwxf23@gmail.com>
Date: Tue, 29 Aug 2023 17:55:49 +0000
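Subject: [PATCH 2/6] Add xla_gpu_copy_insertion_use_region_analysis=false flag

In _setup_xla_flags(), pass
('xla_gpu_copy_insertion_use_region_analysis', 'false') through
_set_missing_flags before XLA_FLAGS is written to the environment.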

---
 torch_xla/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch_xla/__init__.py b/torch_xla/__init__.py
index cfde8456617..b9810fd4dd7 100644
--- a/torch_xla/__init__.py
+++ b/torch_xla/__init__.py
@@ -27,6 +27,7 @@ def _setup_xla_flags():
       flags, (('xla_gpu_simplify_all_fp_conversions', 'false'),))
   flags = _set_missing_flags(flags,
                              (('xla_gpu_force_compilation_parallelism', '8'),))
+  flags = _set_missing_flags(flags, (('xla_gpu_copy_insertion_use_region_analysis', 'false'),))
   os.environ['XLA_FLAGS'] = ' '.join(flags)
 
 

From 7e4d240906814d78cd6fad6e7672379d47c6926d Mon Sep 17 00:00:00 2001
From: iefgnoix <isaacwxf23@gmail.com>
Date: Wed, 4 Oct 2023 14:47:45 -0700
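Subject: [PATCH 3/6] Reduce small fake dataset size; drop XLA patch and flag

Lower train_dataset_len for the small fake sample from 50000 to 5000 in
test/test_train_mp_imagenet_fsdp.py. Also delete
openxla_patches/service.diff and remove the
xla_gpu_copy_insertion_use_region_analysis flag from
torch_xla/__init__.py.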

---
 openxla_patches/service.diff        | 14 --------------
 test/test_train_mp_imagenet_fsdp.py |  2 +-
 torch_xla/__init__.py               |  1 -
 3 files changed, 1 insertion(+), 16 deletions(-)
 delete mode 100644 openxla_patches/service.diff

diff --git a/openxla_patches/service.diff b/openxla_patches/service.diff
deleted file mode 100644
index 93ebde3bf8e..00000000000
--- a/openxla_patches/service.diff
+++ /dev/null
@@ -1,14 +0,0 @@
-delete this patch after openxla pin updated to date after or equal to 08/09
-diff --git a/xla/service/layout_assignment.cc b/xla/service/layout_assignment.cc
-index f69224f42..43feda2a0 100644
---- a/xla/service/layout_assignment.cc
-+++ b/xla/service/layout_assignment.cc
-@@ -154,7 +154,7 @@ OperandLayoutConstraint::OperandLayoutConstraint(
-       instruction_(instruction),
-       operand_no_(operand_no) {
-   CHECK(shape_layout.LayoutIsSet());
--  CHECK(ShapeUtil::CompatibleIgnoringElementType(
-+  CHECK(ShapeUtil::CompatibleKind(
-       shape_layout.shape(), instruction->operand(operand_no)->shape()))
-       << shape_layout.shape() << " is not compatible with "
-       << instruction->operand(operand_no)->shape() << " (for operand "
diff --git a/test/test_train_mp_imagenet_fsdp.py b/test/test_train_mp_imagenet_fsdp.py
index a40f3bef74f..494dc07edd3 100644
--- a/test/test_train_mp_imagenet_fsdp.py
+++ b/test/test_train_mp_imagenet_fsdp.py
@@ -167,7 +167,7 @@ def train_imagenet():
   img_dim = get_model_property('img_dim')
   if FLAGS.fake_data:
     use_small_fake_sample = FLAGS.use_small_fake_sample
-    train_dataset_len = 50000 if use_small_fake_sample else 1200000  # Roughly the size of Imagenet dataset.
+    train_dataset_len = 5000 if use_small_fake_sample else 1200000  # Roughly the size of Imagenet dataset.
     train_loader = xu.SampleGenerator(
         data=(torch.zeros(FLAGS.batch_size, 3, img_dim, img_dim),
               torch.zeros(FLAGS.batch_size, dtype=torch.int64)),
diff --git a/torch_xla/__init__.py b/torch_xla/__init__.py
index b9810fd4dd7..cfde8456617 100644
--- a/torch_xla/__init__.py
+++ b/torch_xla/__init__.py
@@ -27,7 +27,6 @@ def _setup_xla_flags():
       flags, (('xla_gpu_simplify_all_fp_conversions', 'false'),))
   flags = _set_missing_flags(flags,
                              (('xla_gpu_force_compilation_parallelism', '8'),))
-  flags = _set_missing_flags(flags, (('xla_gpu_copy_insertion_use_region_analysis', 'false'),))
   os.environ['XLA_FLAGS'] = ' '.join(flags)
 
 

From 25af5a15e0a21335ee08246905bd95b3b692e1d9 Mon Sep 17 00:00:00 2001
From: iefgnoix <isaacwxf23@gmail.com>
Date: Wed, 4 Oct 2023 16:24:53 -0700
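Subject: [PATCH 4/6] Reduce default batch size to 96; restore fake dataset size

Change DEFAULT_KWARGS batch_size from 128 to 96 in
test/test_train_mp_imagenet_fsdp.py and set the small fake sample
train_dataset_len back from 5000 to 50000.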

---
 test/test_train_mp_imagenet_fsdp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_train_mp_imagenet_fsdp.py b/test/test_train_mp_imagenet_fsdp.py
index 494dc07edd3..aba3f5773e0 100644
--- a/test/test_train_mp_imagenet_fsdp.py
+++ b/test/test_train_mp_imagenet_fsdp.py
@@ -110,7 +110,7 @@
                                              transformer_auto_wrap_policy)
 
 DEFAULT_KWARGS = dict(
-    batch_size=128,
+    batch_size=96,
     test_set_batch_size=64,
     num_epochs=18,
     momentum=0.9,
@@ -167,7 +167,7 @@ def train_imagenet():
   img_dim = get_model_property('img_dim')
   if FLAGS.fake_data:
     use_small_fake_sample = FLAGS.use_small_fake_sample
-    train_dataset_len = 5000 if use_small_fake_sample else 1200000  # Roughly the size of Imagenet dataset.
+    train_dataset_len = 50000 if use_small_fake_sample else 1200000  # Roughly the size of Imagenet dataset.
     train_loader = xu.SampleGenerator(
         data=(torch.zeros(FLAGS.batch_size, 3, img_dim, img_dim),
               torch.zeros(FLAGS.batch_size, dtype=torch.int64)),

From 53a3bedb483a532a73d9b247bd80c021e7284b9b Mon Sep 17 00:00:00 2001
From: iefgnoix <isaacwxf23@gmail.com>
Date: Thu, 5 Oct 2023 13:35:49 -0700
Subject: [PATCH 5/6] keep reducing batch size to 64

---
 test/test_train_mp_imagenet_fsdp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_train_mp_imagenet_fsdp.py b/test/test_train_mp_imagenet_fsdp.py
index aba3f5773e0..fdfdc8a698c 100644
--- a/test/test_train_mp_imagenet_fsdp.py
+++ b/test/test_train_mp_imagenet_fsdp.py
@@ -110,7 +110,7 @@
                                              transformer_auto_wrap_policy)
 
 DEFAULT_KWARGS = dict(
-    batch_size=96,
+    batch_size=64,
     test_set_batch_size=64,
     num_epochs=18,
     momentum=0.9,

From 80803f49ce7dd4709638afd7c7ba709b88671656 Mon Sep 17 00:00:00 2001
From: iefgnoix <isaacwxf23@gmail.com>
Date: Fri, 6 Oct 2023 15:15:26 -0700
Subject: [PATCH 6/6] Remove commented-out GPU FSDP test commands

---
 .circleci/common.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.circleci/common.sh b/.circleci/common.sh
index 8983eb757a9..88c5fed8efc 100755
--- a/.circleci/common.sh
+++ b/.circleci/common.sh
@@ -151,9 +151,6 @@ function run_torch_xla_python_tests() {
       # GPU tests
       if [ -x "$(command -v nvidia-smi)" ]; then
         # These tests fail on GPU with 03/30 TF-pin update (https://github.com/pytorch/xla/pull/4840)
-        # PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
-        # PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
-        # XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
         PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
         PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
         XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1